DSP库移植FFT记录
前言
由于项目的需要,需要FFT的算法对采集的中频信号进行处理,但是由于这次项目的使用的单片机的空间十分小,使用的是兆易创新发布的GD32E232系列的芯片,由于之前的项目都是使用ST系列的单片机。ST的单片机本身带有自身配套的汇编库,可以高效的实现FFT的功能,但是经过测试,发现使用GD32使用ST32的DSP库会报错,而进行移植和修改,估计会花费大量的时间(由于我对ARM的汇编不是很懂)。于是在项目的时间评估下,提出了两种方案:
- 编写FFT的C算法,测试其性能是否满足。
- 利用ARM自带的DSP库,测试其性能是否满足。
使用C语言实现算法测试效果
Operation Patform:GD32E232 MCU
FFT Point:32bit
Time Consuming:about 50ms
代码如下:
主要文件fft.c,fft.h,main.c三个文件
//fft.c
#include "fft.h"
#define SWAP(a,b) tempr=(a);(a)=(b);(b)=tempr
/*
@brief:Computing complex conjugate
@param[in] int n The number of complex
@param[in] complex in[] complex of exchange
@param[out] complex out[] complex of conjugate
*/
/*------------------------------------复数的共轭函数---------------------------------------------------*/
void conjugate_complex(int n, complex in[], complex out[])
{
int i = 0;
for (i = 0; i < n; i++)
{
out[i].imag = -in[i].imag;
out[i].real = in[i].real;
}
}
/*------------------------------------------------------------------------------------------------------*/
/*
@brief:Calculate the absolute value of complex number
@param[in] complex1[] complex number of input
@param[out] float out[] Absolute value of complex number
@param[out] int n number
*/
/*------------------------------------计算复数的模-----------------------------------------------------*/
void c_abs(complex complex1[], double out[], int n)
{
int i = 0;
float Temp;
for (i = 0; i < n; i++)
{
Temp = complex1[i].real * complex1[i].real + complex1[i].imag * complex1[i].imag;
out[i] = (float)sqrt(Temp);
}
}
/*------------------------------------------------------------------------------------------------------*/
/*
@brief:The sum of complex
@param[in] complex a complex number of input
@param[in] complex b complex number of input
@param[out] complex *c result
*/
/*------------------------------------计算复数的和------------------------------------------------------*/
void c_plus(complex a, complex b, complex *c)
{
c->real = a.real + b.real;
c->imag = a.imag + b.imag;
}
/*------------------------------------------------------------------------------------------------------*/
/*------------------------------------计算复数的差------------------------------------------------------*/
void c_sub(complex a, complex b, complex *c)
{
c->real = a.real - b.real;
c->imag = a.imag - b.imag;
}
/*------------------------------------------------------------------------------------------------------*/
/*------------------------------------计算复数的积------------------------------------------------------*/
void c_mul(complex a, complex b, complex *c)
{
c->real = a.real * b.real - a.imag * b.imag;
c->imag = a.real * b.imag + a.imag * b.real;
}
/*------------------------------------------------------------------------------------------------------*/
/*------------------------------------计算复数的商------------------------------------------------------*/
void c_div(complex a, complex b, complex *c)
{
c->real = (a.real * b.real + a.imag * b.imag) / (b.real * b.real + b.imag * b.imag);
c->imag = (a.imag * b.real - a.real * b.imag) / (b.real * b.real + b.imag * b.imag);
}
/*------------------------------------------------------------------------------------------------------*/
/*------------------------------------------------------------------------------------------------------*/
void Wn_i(int n, int i, complex *Wn, char flag)
{
Wn->real = cos(2 * PI*i / n);
if (flag == 1)
Wn->imag = -sin(2 * PI*i / n);
else if (flag == 0)
Wn->imag = -sin(2 * PI*i / n);
}
/*----------------------------------计算傅里叶变换--------------------------------------------------------*/
void fft(int N,complex f[])
{
complex t,wn;//中间变量
int i,j,k,m,n,l,r,M;
int la,lb,lc;
/*----计算分解的级数M=log2(N)----*/
for(i=N,M=1;(i=i/2)!=1;M++);
/*----按照倒位序重新排列原信号----*/
for(i=1,j=N/2;i<=N-2;i++)
{
if(i<j)
{
// t.real=f[j].real;
t = f[j];
f[j] = f[i];
f[i] = t;
}
k=N/2;
while(k<=j)
{
j=j-k;
k=k/2;
}
j=j+k;
}
/*----FFT算法----*/
for(m=1;m<=M;m++)
{
la=pow(2,m); //la=2^m代表第m级每个分组所含节点数
lb=la/2; //lb代表第m级每个分组所含碟形单元数
//同时它也表示每个碟形单元上下节点之间的距离
/*----碟形运算----*/
for(l=1;l<=lb;l++)
{
r=(l-1)*pow(2,M-m);
for(n=l-1;n<N-1;n=n+la) //遍历每个分组,分组总数为N/la
{
lc=n+lb; //n,lc分别代表一个碟形单元的上、下节点编号
Wn_i(N,r,&wn,1);//wn=Wnr
c_mul(f[lc],wn,&t);//t = f[lc] * wn复数运算
c_sub(f[n],t,&(f[lc]));//f[lc] = f[n] - f[lc] * Wnr
c_plus(f[n],t,&(f[n]));//f[n] = f[n] + f[lc] * Wnr
}
}
}
}
//傅里叶逆变换
void ifft(int N,complex f[])
{
int i=0;
conjugate_complex(N,f,f);
fft(N,f);
conjugate_complex(N,f,f);
for(i=0;i<N;i++)
{
f[i].imag = (f[i].imag)/N;
f[i].real = (f[i].real)/N;
}
}
/*------------------------------------------------------------------------------------------------------*/
//fft.h
#ifndef __FFT_H__
#define __FFT_H__
#include "math.h"
#define PI 3.1415926535897932384626433832795028841971
/*定义一个复数的结构*/
typedef struct
{
float real; //实部
float imag; //虚部
}complex;
/*------------------------------------函数列表-------------------------------------------------------*/
void conjugate_complex(int n, complex in[], complex out[]);
void c_abs(complex complex1[], double out[], int n);
void c_plus(complex a, complex b, complex *c);
void c_sub(complex a, complex b, complex *c);
void c_mul(complex a, complex b, complex *c);
void c_div(complex a, complex b, complex *c);
void Wn_i(int n, int i, complex *Wn, char flag);
void fft(int N, complex f[]);
#endif
//main.c
#include "stdio.h"
#include "fft.h"
#include "math.h"
#define sampleFrequency 32768 //采样频率
#define samplePoint 64 //采样点数
#define resolution (sampleFrequency/samplePoint) //FFT计算点数
#define LPFPoint 25 //低通的截止点数 resolution*LPFPoint为截止频率
#define AmplitudeThreshold 2 //幅度阈值 实际中需要,过滤噪声,量化噪声
complex BuffInArray[samplePoint] = { 0 };
double BuffOutArray[samplePoint/2] = { 0 };
double MagBuffOutArray[samplePoint / 2] = { 0 };
double LPFMagBuffOutArray[LPFPoint] = { 0 };
/*----------------------------生成测试数据-----------------------------------*/
void InitBuffInArray()
{
unsigned int i;
double test;
for (long i = 0; i < samplePoint; i++)
{
/*产生测试波形*/
test = 3 * sin(2 * PI * i * 700 / sampleFrequency) +
6 * sin(2* PI* i * 118 / sampleFrequency) +
4 * sin(2 * PI*i * 350 / sampleFrequency);
BuffInArray[i].real = test;
BuffInArray[i].imag = 0;
}
}
void GetPowerMag()
{
long int i;
double X, Y, P, Mag;
c_abs(BuffInArray, BuffOutArray, samplePoint/2);
for (i = 0; i < samplePoint/2; i++)
{
//X = BuffInArray[i].real / samplePoint;
//Y = BuffInArray[i].imag / samplePoint;
Mag = BuffOutArray[i] * 2 / samplePoint;
MagBuffOutArray[i] = Mag;
//P = atan2(Y, X) * 180 / PI;
printf("%d ",i);
printf("%d ", resolution*i);
printf("%lf ", Mag);
//printf("%f ", P);
//printf("%f ", X);
//printf("%f ", Y);
printf("\r\n");
}
}
/*----------------------------计算有效值-----------------------------------*/
void LowPassFilter()
{
for (long i = 0; i < LPFPoint; i++)
{
LPFMagBuffOutArray[i] = MagBuffOutArray[i];
printf("%d ", i);
printf("%d ", resolution*i);
printf("%lf ", MagBuffOutArray[i]);
printf("\r\n");
}
}
double GetEffectiveValue()
{
double SumMag = 0;
for (long i = 0; i < LPFPoint; i++)
{
if (LPFMagBuffOutArray[i] - AmplitudeThreshold <= 0.0000001 )
{
LPFMagBuffOutArray[i] = 0;
}
SumMag += LPFMagBuffOutArray[i];
}
return (sqrt(2)/2)*SumMag;
}
int main()
{
InitBuffInArray();
fft(samplePoint, BuffInArray);
printf("This is FFT Test \r\n");
printf("点数 频率 幅值 \r\n");
GetPowerMag();
printf("Low pass filtering results \r\n");
printf("点数 频率 幅值 \r\n");
LowPassFilter(); //低通滤波取出118HZ和350HZ
printf("Effective value %lf\r\n", GetEffectiveValue());
return 0;
}
测试后得到的数据是运行一次算法的时间是50ms,这显然太大,对于实时性要求比较高的场合,这个运行速度完全无法满足应用。在之前的测试中使用ST本身的汇编库的运算时间都是级别的。后期对代码进行了优化,但是优化测试的运行速度是30ms左右。
利用ARM自带的DSP库
在研究ARM自带的DSP库后,发现对于各种系列的内核都有支持,但是对于M23核没有支持,网上也没找到对于ARM算法库对于M23核的移植方法。
参考了【STM32H7的DSP教程】第6章 ARM DSP源码和库移植方法(MDK5的AC5和AC6)(https://blog.csdn.net/Simon223/article/details/105311500)这篇博文的方法,但是报出了几百个错误警告。
按照道理而言,M23核的指令集应该是兼容M0核指令集合,但是报错的进行调试,发现依然是汇编部分存在问题。我还是太菜。既然无法直接使用,于是我自己读懂代码后进行ARM库代码的移植。由于内存和空间的验证,加上之前FFT的测试经验。并且项目的精度要求考虑,使用了Q15型的定点FFT算法。移植代码如下。
代码由fft.h和fft.c两个文件主要组成。
Operation Patform:GD32E232 MCU
FFT Point:32bit
Time Consuming:about 390us
//fft.h
#ifndef __FFT_H__
#define __FFT_H__
#include "stdint.h"
/*--------------------------------------tyde definition---------------------------------------------------------*/
#define FFT32SamplePoint 32
#define FFT64SamplePoint 64
#define FFT128SamplePoint 128
#define FFT256SamplePoint 256
#define FFTPoint FFT32SamplePoint //set the sampling points of fft
#define ARMBITREVINDEXTABLE_FIXED_32_TABLE_LENGTH ((uint16_t)24)
#define ARMBITREVINDEXTABLE_FIXED___32_TABLE_LENGTH ((uint16_t)24)
#define ARMBITREVINDEXTABLE_FIXED_64_TABLE_LENGTH ((uint16_t)56)
#define ARMBITREVINDEXTABLE_FIXED___64_TABLE_LENGTH ((uint16_t)56)
/*-------------------------------------------------------------------------------------------------------------*/
/*--------------------------------------tyde definition---------------------------------------------------------*/
typedef int16_t q15_t;
typedef int32_t q31_t;
typedef struct
{
uint16_t fftLen; /**< length of the FFT. */
const q15_t *pTwiddle; /**< points to the Twiddle factor table. */
const uint16_t *pBitRevTable; /**< points to the bit reversal table. */
uint16_t bitRevLength; /**< bit reversal table length. */
} arm_cfft_instance_q15;
typedef struct
{
uint16_t fftLen; /**< length of the FFT. */
uint8_t ifftFlag; /**< flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform. */
uint8_t bitReverseFlag; /**< flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output. */
const q15_t *pTwiddle; /**< points to the twiddle factor table. */
const uint16_t *pBitRevTable; /**< points to the bit reversal table. */
uint16_t twidCoefModifier; /**< twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table. */
uint16_t bitRevFactor; /**< bit reversal modifier that supports different size FFTs with the same bit reversal table. */
} arm_cfft_radix4_instance_q15;
/*-------------------------------------------------------------------------------------------------------------*/
/*-----------------------------------------constant table------------------------------------------------------*/
#if FFTPoint == FFT32SamplePoint
static const q15_t twiddleCoef_32_q15[48] = {
(q15_t)0x7FFF, (q15_t)0x0000,
(q15_t)0x7D8A, (q15_t)0x18F8,
(q15_t)0x7641, (q15_t)0x30FB,
(q15_t)0x6A6D, (q15_t)0x471C,
(q15_t)0x5A82, (q15_t)0x5A82,
(q15_t)0x471C, (q15_t)0x6A6D,
(q15_t)0x30FB, (q15_t)0x7641,
(q15_t)0x18F8, (q15_t)0x7D8A,
(q15_t)0x0000, (q15_t)0x7FFF,
(q15_t)0xE707, (q15_t)0x7D8A,
(q15_t)0xCF04, (q15_t)0x7641,
(q15_t)0xB8E3, (q15_t)0x6A6D,
(q15_t)0xA57D, (q15_t)0x5A82,
(q15_t)0x9592, (q15_t)0x471C,
(q15_t)0x89BE, (q15_t)0x30FB,
(q15_t)0x8275, (q15_t)0x18F8,
(q15_t)0x8000, (q15_t)0x0000,
(q15_t)0x8275, (q15_t)0xE707,
(q15_t)0x89BE, (q15_t)0xCF04,
(q15_t)0x9592, (q15_t)0xB8E3,
(q15_t)0xA57D, (q15_t)0xA57D,
(q15_t)0xB8E3, (q15_t)0x9592,
(q15_t)0xCF04, (q15_t)0x89BE,
(q15_t)0xE707, (q15_t)0x8275
};
#endif
#if FFTPoint == FFT64SamplePoint
static const q15_t twiddleCoef_64_q15[96] = {
(q15_t)0x7FFF, (q15_t)0x0000, (q15_t)0x7F62, (q15_t)0x0C8B,
(q15_t)0x7D8A, (q15_t)0x18F8, (q15_t)0x7A7D, (q15_t)0x2528,
(q15_t)0x7641, (q15_t)0x30FB, (q15_t)0x70E2, (q15_t)0x3C56,
(q15_t)0x6A6D, (q15_t)0x471C, (q15_t)0x62F2, (q15_t)0x5133,
(q15_t)0x5A82, (q15_t)0x5A82, (q15_t)0x5133, (q15_t)0x62F2,
(q15_t)0x471C, (q15_t)0x6A6D, (q15_t)0x3C56, (q15_t)0x70E2,
(q15_t)0x30FB, (q15_t)0x7641, (q15_t)0x2528, (q15_t)0x7A7D,
(q15_t)0x18F8, (q15_t)0x7D8A, (q15_t)0x0C8B, (q15_t)0x7F62,
(q15_t)0x0000, (q15_t)0x7FFF, (q15_t)0xF374, (q15_t)0x7F62,
(q15_t)0xE707, (q15_t)0x7D8A, (q15_t)0xDAD7, (q15_t)0x7A7D,
(q15_t)0xCF04, (q15_t)0x7641, (q15_t)0xC3A9, (q15_t)0x70E2,
(q15_t)0xB8E3, (q15_t)0x6A6D, (q15_t)0xAECC, (q15_t)0x62F2,
(q15_t)0xA57D, (q15_t)0x5A82, (q15_t)0x9D0D, (q15_t)0x5133,
(q15_t)0x9592, (q15_t)0x471C, (q15_t)0x8F1D, (q15_t)0x3C56,
(q15_t)0x89BE, (q15_t)0x30FB, (q15_t)0x8582, (q15_t)0x2528,
(q15_t)0x8275, (q15_t)0x18F8, (q15_t)0x809D, (q15_t)0x0C8B,
(q15_t)0x8000, (q15_t)0x0000, (q15_t)0x809D, (q15_t)0xF374,
(q15_t)0x8275, (q15_t)0xE707, (q15_t)0x8582, (q15_t)0xDAD7,
(q15_t)0x89BE, (q15_t)0xCF04, (q15_t)0x8F1D, (q15_t)0xC3A9,
(q15_t)0x9592, (q15_t)0xB8E3, (q15_t)0x9D0D, (q15_t)0xAECC,
(q15_t)0xA57D, (q15_t)0xA57D, (q15_t)0xAECC, (q15_t)0x9D0D,
(q15_t)0xB8E3, (q15_t)0x9592, (q15_t)0xC3A9, (q15_t)0x8F1D,
(q15_t)0xCF04, (q15_t)0x89BE, (q15_t)0xDAD7, (q15_t)0x8582,
(q15_t)0xE707, (q15_t)0x8275, (q15_t)0xF374, (q15_t)0x809D
};
#endif
#if FFTPoint == 32
static const uint16_t armBitRevIndexTable_fixed_32[ARMBITREVINDEXTABLE_FIXED_32_TABLE_LENGTH] =
{
/* 4x2, size 24 */
8,128, 16,64, 24,192, 40,160, 48,96, 56,224, 72,144,
88,208, 104,176, 120,240, 152,200, 184,232
};
#endif
#if FFTPoint == FFT64SamplePoint
static const uint16_t armBitRevIndexTable_fixed_64[ARMBITREVINDEXTABLE_FIXED_64_TABLE_LENGTH] =
{
/* radix 4, size 56 */
8,256, 16,128, 24,384, 32,64, 40,320, 48,192, 56,448, 72,288, 80,160, 88,416, 104,352,
112,224, 120,480, 136,272, 152,400, 168,336, 176,208, 184,464, 200,304, 216,432,
232,368, 248,496, 280,392, 296,328, 312,456, 344,424, 376,488, 440,472
};
#endif
/*-----------------------------------------------------------------------------------------------------------------*/
/*-----------------------------------function list------------------------------------------------------------------*/
void arm_cfft_q15(
const arm_cfft_instance_q15 * S,
q15_t * p1,
uint8_t ifftFlag,
uint8_t bitReverseFlag);
void arm_radix4_butterfly_q15(
q15_t * pSrc16,
uint32_t fftLen,
const q15_t * pCoef16,
uint32_t twidCoefModifier);
void arm_radix4_butterfly_inverse_q15(
q15_t * pSrc16,
uint32_t fftLen,
const q15_t * pCoef16,
uint32_t twidCoefModifier);
void arm_bitreversal_q15(
q15_t * pSrc,
uint32_t fftLen,
uint16_t bitRevFactor,
const uint16_t * pBitRevTab);
#endif
//fft.c
#include "fft.h"
/* ----------------------------------------------------------------------
* Project: FFT
* Title: arm_cfft_radix4_q15.c
* Description: This file has function definition of Radix-4 FFT & IFFT function and
* In-place bit reversal using bit reversal table
*
* $Revision: V1.6.0
*
* Target Processor: Cortex-M cores
* -------------------------------------------------------------------- */
void arm_bitreversal_16(
uint16_t *pSrc,
const uint16_t bitRevLen,
const uint16_t *pBitRevTab)
{
uint16_t a, b, i, tmp;
for (i = 0; i < bitRevLen; )
{
a = pBitRevTab[i ] >> 2;
b = pBitRevTab[i + 1] >> 2;
//real
tmp = pSrc[a];
pSrc[a] = pSrc[b];
pSrc[b] = tmp;
//complex
tmp = pSrc[a+1];
pSrc[a+1] = pSrc[b+1];
pSrc[b+1] = tmp;
i += 2;
}
}
void arm_bitreversal_q15(
q15_t * pSrc16,
uint32_t fftLen,
uint16_t bitRevFactor,
const uint16_t * pBitRevTab)
{
q31_t *pSrc = (q31_t *) pSrc16;
q31_t in;
uint32_t fftLenBy2, fftLenBy2p1;
uint32_t i, j;
/* Initializations */
j = 0U;
fftLenBy2 = fftLen / 2U;
fftLenBy2p1 = (fftLen / 2U) + 1U;
/* Bit Reversal Implementation */
for (i = 0U; i <= (fftLenBy2 - 2U); i += 2U)
{
if (i < j)
{
/* pSrc[i] <-> pSrc[j]; */
/* pSrc[i+1U] <-> pSrc[j+1U] */
in = pSrc[i];
pSrc[i] = pSrc[j];
pSrc[j] = in;
/* pSrc[i + fftLenBy2p1] <-> pSrc[j + fftLenBy2p1]; */
/* pSrc[i + fftLenBy2p1+1U] <-> pSrc[j + fftLenBy2p1+1U] */
in = pSrc[i + fftLenBy2p1];
pSrc[i + fftLenBy2p1] = pSrc[j + fftLenBy2p1];
pSrc[j + fftLenBy2p1] = in;
}
/* pSrc[i+1U] <-> pSrc[j+fftLenBy2]; */
/* pSrc[i+2] <-> pSrc[j+fftLenBy2+1U] */
in = pSrc[i + 1U];
pSrc[i + 1U] = pSrc[j + fftLenBy2];
pSrc[j + fftLenBy2] = in;
/* Reading the index for the bit reversal */
j = *pBitRevTab;
/* Updating the bit reversal index depending on the fft length */
pBitRevTab += bitRevFactor;
}
}
int32_t __SSAT(int32_t val, uint32_t sat)
{
if ((sat >= 1U) && (sat <= 32U))
{
const int32_t max = (int32_t)((1U << (sat - 1U)) - 1U);
const int32_t min = -1 - max ;
if (val > max)
{
return max;
}
else if (val < min)
{
return min;
}
}
return val;
}
void arm_cfft_radix4_q15(
const arm_cfft_radix4_instance_q15 * S,
q15_t * pSrc)
{
if (S->ifftFlag == 1U)
{
/* Complex IFFT radix-4 */
arm_radix4_butterfly_inverse_q15(pSrc, S->fftLen, S->pTwiddle, S->twidCoefModifier);
}
else
{
/* Complex FFT radix-4 */
arm_radix4_butterfly_q15(pSrc, S->fftLen, S->pTwiddle, S->twidCoefModifier);
}
if (S->bitReverseFlag == 1U)
{
/* Bit Reversal */
arm_bitreversal_q15(pSrc, S->fftLen, S->bitRevFactor, S->pBitRevTable);
}
}
void arm_radix4_butterfly_q15(
q15_t * pSrc16,
uint32_t fftLen,
const q15_t * pCoef16,
uint32_t twidCoefModifier)
{
#if defined (ARM_MATH_DSP)
q31_t R, S, T, U;
q31_t C1, C2, C3, out1, out2;
uint32_t n1, n2, ic, i0, j, k;
q15_t *ptr1;
q15_t *pSi0;
q15_t *pSi1;
q15_t *pSi2;
q15_t *pSi3;
q31_t xaya, xbyb, xcyc, xdyd;
/* Total process is divided into three stages */
/* process first stage, middle stages, & last stage */
/* Initializations for the first stage */
n2 = fftLen;
n1 = n2;
/* n2 = fftLen/4 */
n2 >>= 2U;
/* Index for twiddle coefficient */
ic = 0U;
/* Index for input read and output write */
j = n2;
pSi0 = pSrc16;
pSi1 = pSi0 + 2 * n2;
pSi2 = pSi1 + 2 * n2;
pSi3 = pSi2 + 2 * n2;
/* Input is in 1.15(q15) format */
/* start of first stage process */
do
{
/* Butterfly implementation */
/* Reading i0, i0+fftLen/2 inputs */
/* Read ya (real), xa(imag) input */
T = read_q15x2 (pSi0);
T = __SHADD16(T, 0); /* this is just a SIMD arithmetic shift right by 1 */
T = __SHADD16(T, 0); /* it turns out doing this twice is 2 cycles, the alternative takes 3 cycles */
/*
in = ((int16_t) (T & 0xFFFF)) >> 2; // alternative code that takes 3 cycles
T = ((T >> 2) & 0xFFFF0000) | (in & 0xFFFF);
*/
/* Read yc (real), xc(imag) input */
S = read_q15x2 (pSi2);
S = __SHADD16(S, 0);
S = __SHADD16(S, 0);
/* R = packed((ya + yc), (xa + xc) ) */
R = __QADD16(T, S);
/* S = packed((ya - yc), (xa - xc) ) */
S = __QSUB16(T, S);
/* Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
/* Read yb (real), xb(imag) input */
T = read_q15x2 (pSi1);
T = __SHADD16(T, 0);
T = __SHADD16(T, 0);
/* Read yd (real), xd(imag) input */
U = read_q15x2 (pSi3);
U = __SHADD16(U, 0);
U = __SHADD16(U, 0);
/* T = packed((yb + yd), (xb + xd) ) */
T = __QADD16(T, U);
/* writing the butterfly processed i0 sample */
/* xa' = xa + xb + xc + xd */
/* ya' = ya + yb + yc + yd */
write_q15x2_ia (&pSi0, __SHADD16(R, T));
/* R = packed((ya + yc) - (yb + yd), (xa + xc)- (xb + xd)) */
R = __QSUB16(R, T);
/* co2 & si2 are read from SIMD Coefficient pointer */
C2 = read_q15x2 ((q15_t *) pCoef16 + (4U * ic));
#ifndef ARM_MATH_BIG_ENDIAN
/* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
out1 = __SMUAD(C2, R) >> 16U;
/* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
out2 = __SMUSDX(C2, R);
#else
/* xc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
out1 = __SMUSDX(R, C2) >> 16U;
/* yc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
out2 = __SMUAD(C2, R);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* Reading i0+fftLen/4 */
/* T = packed(yb, xb) */
T = read_q15x2 (pSi1);
T = __SHADD16(T, 0);
T = __SHADD16(T, 0);
/* writing the butterfly processed i0 + fftLen/4 sample */
/* writing output(xc', yc') in little endian format */
write_q15x2_ia (&pSi1, (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
/* Butterfly calculations */
/* U = packed(yd, xd) */
U = read_q15x2 (pSi3);
U = __SHADD16(U, 0);
U = __SHADD16(U, 0);
/* T = packed(yb-yd, xb-xd) */
T = __QSUB16(T, U);
#ifndef ARM_MATH_BIG_ENDIAN
/* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
R = __QASX(S, T);
/* S = packed((ya-yc) - (xb- xd), (xa-xc) + (yb-yd)) */
S = __QSAX(S, T);
#else
/* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
R = __QSAX(S, T);
/* S = packed((ya-yc) - (xb- xd), (xa-xc) + (yb-yd)) */
S = __QASX(S, T);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* co1 & si1 are read from SIMD Coefficient pointer */
C1 = read_q15x2 ((q15_t *) pCoef16 + (2U * ic));
/* Butterfly process for the i0+fftLen/2 sample */
#ifndef ARM_MATH_BIG_ENDIAN
/* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
out1 = __SMUAD(C1, S) >> 16U;
/* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
out2 = __SMUSDX(C1, S);
#else
/* xb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
out1 = __SMUSDX(S, C1) >> 16U;
/* yb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
out2 = __SMUAD(C1, S);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* writing output(xb', yb') in little endian format */
write_q15x2_ia (&pSi2, ((out2) & 0xFFFF0000) | ((out1) & 0x0000FFFF));
/* co3 & si3 are read from SIMD Coefficient pointer */
C3 = read_q15x2 ((q15_t *) pCoef16 + (6U * ic));
/* Butterfly process for the i0+3fftLen/4 sample */
#ifndef ARM_MATH_BIG_ENDIAN
/* xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
out1 = __SMUAD(C3, R) >> 16U;
/* yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
out2 = __SMUSDX(C3, R);
#else
/* xd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
out1 = __SMUSDX(R, C3) >> 16U;
/* yd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
out2 = __SMUAD(C3, R);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* writing output(xd', yd') in little endian format */
write_q15x2_ia (&pSi3, ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
/* Twiddle coefficients index modifier */
ic = ic + twidCoefModifier;
} while (--j);
/* data is in 4.11(q11) format */
/* end of first stage process */
/* start of middle stage process */
/* Twiddle coefficients index modifier */
twidCoefModifier <<= 2U;
/* Calculation of Middle stage */
for (k = fftLen / 4U; k > 4U; k >>= 2U)
{
/* Initializations for the middle stage */
n1 = n2;
n2 >>= 2U;
ic = 0U;
for (j = 0U; j <= (n2 - 1U); j++)
{
/* index calculation for the coefficients */
C1 = read_q15x2 ((q15_t *) pCoef16 + (2U * ic));
C2 = read_q15x2 ((q15_t *) pCoef16 + (4U * ic));
C3 = read_q15x2 ((q15_t *) pCoef16 + (6U * ic));
/* Twiddle coefficients index modifier */
ic = ic + twidCoefModifier;
pSi0 = pSrc16 + 2 * j;
pSi1 = pSi0 + 2 * n2;
pSi2 = pSi1 + 2 * n2;
pSi3 = pSi2 + 2 * n2;
/* Butterfly implementation */
for (i0 = j; i0 < fftLen; i0 += n1)
{
/* Reading i0, i0+fftLen/2 inputs */
/* Read ya (real), xa(imag) input */
T = read_q15x2 (pSi0);
/* Read yc (real), xc(imag) input */
S = read_q15x2 (pSi2);
/* R = packed( (ya + yc), (xa + xc)) */
R = __QADD16(T, S);
/* S = packed((ya - yc), (xa - xc)) */
S = __QSUB16(T, S);
/* Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
/* Read yb (real), xb(imag) input */
T = read_q15x2 (pSi1);
/* Read yd (real), xd(imag) input */
U = read_q15x2 (pSi3);
/* T = packed( (yb + yd), (xb + xd)) */
T = __QADD16(T, U);
/* writing the butterfly processed i0 sample */
/* xa' = xa + xb + xc + xd */
/* ya' = ya + yb + yc + yd */
out1 = __SHADD16(R, T);
out1 = __SHADD16(out1, 0);
write_q15x2 (pSi0, out1);
pSi0 += 2 * n1;
/* R = packed( (ya + yc) - (yb + yd), (xa + xc) - (xb + xd)) */
R = __SHSUB16(R, T);
#ifndef ARM_MATH_BIG_ENDIAN
/* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
out1 = __SMUAD(C2, R) >> 16U;
/* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
out2 = __SMUSDX(C2, R);
#else
/* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
out1 = __SMUSDX(R, C2) >> 16U;
/* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
out2 = __SMUAD(C2, R);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* Reading i0+3fftLen/4 */
/* Read yb (real), xb(imag) input */
T = read_q15x2 (pSi1);
/* writing the butterfly processed i0 + fftLen/4 sample */
/* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
/* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
write_q15x2 (pSi1, ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
pSi1 += 2 * n1;
/* Butterfly calculations */
/* Read yd (real), xd(imag) input */
U = read_q15x2 (pSi3);
/* T = packed(yb-yd, xb-xd) */
T = __QSUB16(T, U);
#ifndef ARM_MATH_BIG_ENDIAN
/* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
R = __SHASX(S, T);
/* S = packed((ya-yc) - (xb- xd), (xa-xc) + (yb-yd)) */
S = __SHSAX(S, T);
/* Butterfly process for the i0+fftLen/2 sample */
out1 = __SMUAD(C1, S) >> 16U;
out2 = __SMUSDX(C1, S);
#else
/* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
R = __SHSAX(S, T);
/* S = packed((ya-yc) - (xb- xd), (xa-xc) + (yb-yd)) */
S = __SHASX(S, T);
/* Butterfly process for the i0+fftLen/2 sample */
out1 = __SMUSDX(S, C1) >> 16U;
out2 = __SMUAD(C1, S);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
/* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
write_q15x2 (pSi2, ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
pSi2 += 2 * n1;
/* Butterfly process for the i0+3fftLen/4 sample */
#ifndef ARM_MATH_BIG_ENDIAN
out1 = __SMUAD(C3, R) >> 16U;
out2 = __SMUSDX(C3, R);
#else
out1 = __SMUSDX(R, C3) >> 16U;
out2 = __SMUAD(C3, R);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
/* yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
write_q15x2 (pSi3, ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
pSi3 += 2 * n1;
}
}
/* Twiddle coefficients index modifier */
twidCoefModifier <<= 2U;
}
/* end of middle stage process */
/* data is in 10.6(q6) format for the 1024 point */
/* data is in 8.8(q8) format for the 256 point */
/* data is in 6.10(q10) format for the 64 point */
/* data is in 4.12(q12) format for the 16 point */
/* Initializations for the last stage */
j = fftLen >> 2;
ptr1 = &pSrc16[0];
/* start of last stage process */
/* Butterfly implementation */
do
{
/* Read xa (real), ya(imag) input */
xaya = read_q15x2_ia ((q15_t **) &ptr1);
/* Read xb (real), yb(imag) input */
xbyb = read_q15x2_ia ((q15_t **) &ptr1);
/* Read xc (real), yc(imag) input */
xcyc = read_q15x2_ia ((q15_t **) &ptr1);
/* Read xd (real), yd(imag) input */
xdyd = read_q15x2_ia ((q15_t **) &ptr1);
/* R = packed((ya + yc), (xa + xc)) */
R = __QADD16(xaya, xcyc);
/* T = packed((yb + yd), (xb + xd)) */
T = __QADD16(xbyb, xdyd);
/* pointer updation for writing */
ptr1 = ptr1 - 8U;
/* xa' = xa + xb + xc + xd */
/* ya' = ya + yb + yc + yd */
write_q15x2_ia (&ptr1, __SHADD16(R, T));
/* T = packed((yb + yd), (xb + xd)) */
T = __QADD16(xbyb, xdyd);
/* xc' = (xa-xb+xc-xd) */
/* yc' = (ya-yb+yc-yd) */
write_q15x2_ia (&ptr1, __SHSUB16(R, T));
/* S = packed((ya - yc), (xa - xc)) */
S = __QSUB16(xaya, xcyc);
/* Read yd (real), xd(imag) input */
/* T = packed( (yb - yd), (xb - xd)) */
U = __QSUB16(xbyb, xdyd);
#ifndef ARM_MATH_BIG_ENDIAN
/* xb' = (xa+yb-xc-yd) */
/* yb' = (ya-xb-yc+xd) */
write_q15x2_ia (&ptr1, __SHSAX(S, U));
/* xd' = (xa-yb-xc+yd) */
/* yd' = (ya+xb-yc-xd) */
write_q15x2_ia (&ptr1, __SHASX(S, U));
#else
/* xb' = (xa+yb-xc-yd) */
/* yb' = (ya-xb-yc+xd) */
write_q15x2_ia (&ptr1, __SHASX(S, U));
/* xd' = (xa-yb-xc+yd) */
/* yd' = (ya+xb-yc-xd) */
write_q15x2_ia (&ptr1, __SHSAX(S, U));
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
} while (--j);
/* end of last stage process */
/* output is in 11.5(q5) format for the 1024 point */
/* output is in 9.7(q7) format for the 256 point */
/* output is in 7.9(q9) format for the 64 point */
/* output is in 5.11(q11) format for the 16 point */
#else /* #if defined (ARM_MATH_DSP) */
q15_t R0, R1, S0, S1, T0, T1, U0, U1;
q15_t Co1, Si1, Co2, Si2, Co3, Si3, out1, out2;
uint32_t n1, n2, ic, i0, i1, i2, i3, j, k;
/* Total process is divided into three stages */
/* process first stage, middle stages, & last stage */
/* Initializations for the first stage */
n2 = fftLen;
n1 = n2;
/* n2 = fftLen/4 */
n2 >>= 2U;
/* Index for twiddle coefficient */
ic = 0U;
/* Index for input read and output write */
i0 = 0U;
j = n2;
/* Input is in 1.15(q15) format */
/* start of first stage process */
do
{
/* Butterfly implementation */
/* index calculation for the input as, */
/* pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
i1 = i0 + n2;
i2 = i1 + n2;
i3 = i2 + n2;
/* Reading i0, i0+fftLen/2 inputs */
/* input is down scale by 4 to avoid overflow */
/* Read ya (real), xa(imag) input */
T0 = pSrc16[i0 * 2U] >> 2U;
T1 = pSrc16[(i0 * 2U) + 1U] >> 2U;
/* input is down scale by 4 to avoid overflow */
/* Read yc (real), xc(imag) input */
S0 = pSrc16[i2 * 2U] >> 2U;
S1 = pSrc16[(i2 * 2U) + 1U] >> 2U;
/* R0 = (ya + yc) */
R0 = __SSAT(T0 + S0, 16U);
/* R1 = (xa + xc) */
R1 = __SSAT(T1 + S1, 16U);
/* S0 = (ya - yc) */
S0 = __SSAT(T0 - S0, 16);
/* S1 = (xa - xc) */
S1 = __SSAT(T1 - S1, 16);
/* Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
/* input is down scale by 4 to avoid overflow */
/* Read yb (real), xb(imag) input */
T0 = pSrc16[i1 * 2U] >> 2U;
T1 = pSrc16[(i1 * 2U) + 1U] >> 2U;
/* input is down scale by 4 to avoid overflow */
/* Read yd (real), xd(imag) input */
U0 = pSrc16[i3 * 2U] >> 2U;
U1 = pSrc16[(i3 * 2U) + 1] >> 2U;
/* T0 = (yb + yd) */
T0 = __SSAT(T0 + U0, 16U);
/* T1 = (xb + xd) */
T1 = __SSAT(T1 + U1, 16U);
/* writing the butterfly processed i0 sample */
/* ya' = ya + yb + yc + yd */
/* xa' = xa + xb + xc + xd */
pSrc16[i0 * 2U] = (R0 >> 1U) + (T0 >> 1U);
pSrc16[(i0 * 2U) + 1U] = (R1 >> 1U) + (T1 >> 1U);
/* R0 = (ya + yc) - (yb + yd) */
/* R1 = (xa + xc) - (xb + xd) */
R0 = __SSAT(R0 - T0, 16U);
R1 = __SSAT(R1 - T1, 16U);
/* co2 & si2 are read from Coefficient pointer */
Co2 = pCoef16[2U * ic * 2U];
Si2 = pCoef16[(2U * ic * 2U) + 1];
/* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
out1 = (q15_t) ((Co2 * R0 + Si2 * R1) >> 16U);
/* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
out2 = (q15_t) ((-Si2 * R0 + Co2 * R1) >> 16U);
/* Reading i0+fftLen/4 */
/* input is down scale by 4 to avoid overflow */
/* T0 = yb, T1 = xb */
T0 = pSrc16[i1 * 2U] >> 2;
T1 = pSrc16[(i1 * 2U) + 1] >> 2;
/* writing the butterfly processed i0 + fftLen/4 sample */
/* writing output(xc', yc') in little endian format */
pSrc16[i1 * 2U] = out1;
pSrc16[(i1 * 2U) + 1] = out2;
/* Butterfly calculations */
/* input is down scale by 4 to avoid overflow */
/* U0 = yd, U1 = xd */
U0 = pSrc16[i3 * 2U] >> 2;
U1 = pSrc16[(i3 * 2U) + 1] >> 2;
/* T0 = yb-yd */
T0 = __SSAT(T0 - U0, 16);
/* T1 = xb-xd */
T1 = __SSAT(T1 - U1, 16);
/* R1 = (ya-yc) + (xb- xd), R0 = (xa-xc) - (yb-yd)) */
R0 = (q15_t) __SSAT((q31_t) (S0 - T1), 16);
R1 = (q15_t) __SSAT((q31_t) (S1 + T0), 16);
/* S1 = (ya-yc) - (xb- xd), S0 = (xa-xc) + (yb-yd)) */
S0 = (q15_t) __SSAT(((q31_t) S0 + T1), 16U);
S1 = (q15_t) __SSAT(((q31_t) S1 - T0), 16U);
/* co1 & si1 are read from Coefficient pointer */
Co1 = pCoef16[ic * 2U];
Si1 = pCoef16[(ic * 2U) + 1];
/* Butterfly process for the i0+fftLen/2 sample */
/* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
out1 = (q15_t) ((Si1 * S1 + Co1 * S0) >> 16);
/* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
out2 = (q15_t) ((-Si1 * S0 + Co1 * S1) >> 16);
/* writing output(xb', yb') in little endian format */
pSrc16[i2 * 2U] = out1;
pSrc16[(i2 * 2U) + 1] = out2;
/* Co3 & si3 are read from Coefficient pointer */
Co3 = pCoef16[3U * (ic * 2U)];
Si3 = pCoef16[(3U * (ic * 2U)) + 1];
/* Butterfly process for the i0+3fftLen/4 sample */
/* xd' = (xa-yb-xc+yd)* Co3 + (ya+xb-yc-xd)* (si3) */
out1 = (q15_t) ((Si3 * R1 + Co3 * R0) >> 16U);
/* yd' = (ya+xb-yc-xd)* Co3 - (xa-yb-xc+yd)* (si3) */
out2 = (q15_t) ((-Si3 * R0 + Co3 * R1) >> 16U);
/* writing output(xd', yd') in little endian format */
pSrc16[i3 * 2U] = out1;
pSrc16[(i3 * 2U) + 1] = out2;
/* Twiddle coefficients index modifier */
ic = ic + twidCoefModifier;
/* Updating input index */
i0 = i0 + 1U;
} while (--j);
/* data is in 4.11(q11) format */
/* end of first stage process */
/* start of middle stage process */
/* Twiddle coefficients index modifier */
twidCoefModifier <<= 2U;
/* Calculation of Middle stage */
for (k = fftLen / 4U; k > 4U; k >>= 2U)
{
/* Initializations for the middle stage */
n1 = n2;
n2 >>= 2U;
ic = 0U;
for (j = 0U; j <= (n2 - 1U); j++)
{
/* index calculation for the coefficients */
Co1 = pCoef16[ic * 2U];
Si1 = pCoef16[(ic * 2U) + 1U];
Co2 = pCoef16[2U * (ic * 2U)];
Si2 = pCoef16[(2U * (ic * 2U)) + 1U];
Co3 = pCoef16[3U * (ic * 2U)];
Si3 = pCoef16[(3U * (ic * 2U)) + 1U];
/* Twiddle coefficients index modifier */
ic = ic + twidCoefModifier;
/* Butterfly implementation */
for (i0 = j; i0 < fftLen; i0 += n1)
{
/* index calculation for the input as, */
/* pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
i1 = i0 + n2;
i2 = i1 + n2;
i3 = i2 + n2;
/* Reading i0, i0+fftLen/2 inputs */
/* Read ya (real), xa(imag) input */
T0 = pSrc16[i0 * 2U];
T1 = pSrc16[(i0 * 2U) + 1U];
/* Read yc (real), xc(imag) input */
S0 = pSrc16[i2 * 2U];
S1 = pSrc16[(i2 * 2U) + 1U];
/* R0 = (ya + yc), R1 = (xa + xc) */
R0 = __SSAT(T0 + S0, 16);
R1 = __SSAT(T1 + S1, 16);
/* S0 = (ya - yc), S1 =(xa - xc) */
S0 = __SSAT(T0 - S0, 16);
S1 = __SSAT(T1 - S1, 16);
/* Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
/* Read yb (real), xb(imag) input */
T0 = pSrc16[i1 * 2U];
T1 = pSrc16[(i1 * 2U) + 1U];
/* Read yd (real), xd(imag) input */
U0 = pSrc16[i3 * 2U];
U1 = pSrc16[(i3 * 2U) + 1U];
/* T0 = (yb + yd), T1 = (xb + xd) */
T0 = __SSAT(T0 + U0, 16);
T1 = __SSAT(T1 + U1, 16);
/* writing the butterfly processed i0 sample */
/* xa' = xa + xb + xc + xd */
/* ya' = ya + yb + yc + yd */
out1 = ((R0 >> 1U) + (T0 >> 1U)) >> 1U;
out2 = ((R1 >> 1U) + (T1 >> 1U)) >> 1U;
pSrc16[i0 * 2U] = out1;
pSrc16[(2U * i0) + 1U] = out2;
/* R0 = (ya + yc) - (yb + yd), R1 = (xa + xc) - (xb + xd) */
R0 = (R0 >> 1U) - (T0 >> 1U);
R1 = (R1 >> 1U) - (T1 >> 1U);
/* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
out1 = (q15_t) ((Co2 * R0 + Si2 * R1) >> 16U);
/* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
out2 = (q15_t) ((-Si2 * R0 + Co2 * R1) >> 16U);
/* Reading i0+3fftLen/4 */
/* Read yb (real), xb(imag) input */
T0 = pSrc16[i1 * 2U];
T1 = pSrc16[(i1 * 2U) + 1U];
/* writing the butterfly processed i0 + fftLen/4 sample */
/* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
/* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
pSrc16[i1 * 2U] = out1;
pSrc16[(i1 * 2U) + 1U] = out2;
/* Butterfly calculations */
/* Read yd (real), xd(imag) input */
U0 = pSrc16[i3 * 2U];
U1 = pSrc16[(i3 * 2U) + 1U];
/* T0 = yb-yd, T1 = xb-xd */
T0 = __SSAT(T0 - U0, 16);
T1 = __SSAT(T1 - U1, 16);
/* R0 = (ya-yc) + (xb- xd), R1 = (xa-xc) - (yb-yd)) */
R0 = (S0 >> 1U) - (T1 >> 1U);
R1 = (S1 >> 1U) + (T0 >> 1U);
/* S0 = (ya-yc) - (xb- xd), S1 = (xa-xc) + (yb-yd)) */
S0 = (S0 >> 1U) + (T1 >> 1U);
S1 = (S1 >> 1U) - (T0 >> 1U);
/* Butterfly process for the i0+fftLen/2 sample */
out1 = (q15_t) ((Co1 * S0 + Si1 * S1) >> 16U);
out2 = (q15_t) ((-Si1 * S0 + Co1 * S1) >> 16U);
/* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
/* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
pSrc16[i2 * 2U] = out1;
pSrc16[(i2 * 2U) + 1U] = out2;
/* Butterfly process for the i0+3fftLen/4 sample */
out1 = (q15_t) ((Si3 * R1 + Co3 * R0) >> 16U);
out2 = (q15_t) ((-Si3 * R0 + Co3 * R1) >> 16U);
/* xd' = (xa-yb-xc+yd)* Co3 + (ya+xb-yc-xd)* (si3) */
/* yd' = (ya+xb-yc-xd)* Co3 - (xa-yb-xc+yd)* (si3) */
pSrc16[i3 * 2U] = out1;
pSrc16[(i3 * 2U) + 1U] = out2;
}
}
/* Twiddle coefficients index modifier */
twidCoefModifier <<= 2U;
}
/* end of middle stage process */
/* data is in 10.6(q6) format for the 1024 point */
/* data is in 8.8(q8) format for the 256 point */
/* data is in 6.10(q10) format for the 64 point */
/* data is in 4.12(q12) format for the 16 point */
/* Initializations for the last stage */
n1 = n2;
n2 >>= 2U;
/* start of last stage process */
/* Butterfly implementation */
for (i0 = 0U; i0 <= (fftLen - n1); i0 += n1)
{
/* index calculation for the input as, */
/* pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
i1 = i0 + n2;
i2 = i1 + n2;
i3 = i2 + n2;
/* Reading i0, i0+fftLen/2 inputs */
/* Read ya (real), xa(imag) input */
T0 = pSrc16[i0 * 2U];
T1 = pSrc16[(i0 * 2U) + 1U];
/* Read yc (real), xc(imag) input */
S0 = pSrc16[i2 * 2U];
S1 = pSrc16[(i2 * 2U) + 1U];
/* R0 = (ya + yc), R1 = (xa + xc) */
R0 = __SSAT(T0 + S0, 16U);
R1 = __SSAT(T1 + S1, 16U);
/* S0 = (ya - yc), S1 = (xa - xc) */
S0 = __SSAT(T0 - S0, 16U);
S1 = __SSAT(T1 - S1, 16U);
/* Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
/* Read yb (real), xb(imag) input */
T0 = pSrc16[i1 * 2U];
T1 = pSrc16[(i1 * 2U) + 1U];
/* Read yd (real), xd(imag) input */
U0 = pSrc16[i3 * 2U];
U1 = pSrc16[(i3 * 2U) + 1U];
/* T0 = (yb + yd), T1 = (xb + xd)) */
T0 = __SSAT(T0 + U0, 16U);
T1 = __SSAT(T1 + U1, 16U);
/* writing the butterfly processed i0 sample */
/* xa' = xa + xb + xc + xd */
/* ya' = ya + yb + yc + yd */
pSrc16[i0 * 2U] = (R0 >> 1U) + (T0 >> 1U);
pSrc16[(i0 * 2U) + 1U] = (R1 >> 1U) + (T1 >> 1U);
/* R0 = (ya + yc) - (yb + yd), R1 = (xa + xc) - (xb + xd) */
R0 = (R0 >> 1U) - (T0 >> 1U);
R1 = (R1 >> 1U) - (T1 >> 1U);
/* Read yb (real), xb(imag) input */
T0 = pSrc16[i1 * 2U];
T1 = pSrc16[(i1 * 2U) + 1U];
/* writing the butterfly processed i0 + fftLen/4 sample */
/* xc' = (xa-xb+xc-xd) */
/* yc' = (ya-yb+yc-yd) */
pSrc16[i1 * 2U] = R0;
pSrc16[(i1 * 2U) + 1U] = R1;
/* Read yd (real), xd(imag) input */
U0 = pSrc16[i3 * 2U];
U1 = pSrc16[(i3 * 2U) + 1U];
/* T0 = (yb - yd), T1 = (xb - xd) */
T0 = __SSAT(T0 - U0, 16U);
T1 = __SSAT(T1 - U1, 16U);
/* writing the butterfly processed i0 + fftLen/2 sample */
/* xb' = (xa+yb-xc-yd) */
/* yb' = (ya-xb-yc+xd) */
pSrc16[i2 * 2U] = (S0 >> 1U) + (T1 >> 1U);
pSrc16[(i2 * 2U) + 1U] = (S1 >> 1U) - (T0 >> 1U);
/* writing the butterfly processed i0 + 3fftLen/4 sample */
/* xd' = (xa-yb-xc+yd) */
/* yd' = (ya+xb-yc-xd) */
pSrc16[i3 * 2U] = (S0 >> 1U) - (T1 >> 1U);
pSrc16[(i3 * 2U) + 1U] = (S1 >> 1U) + (T0 >> 1U);
}
/* end of last stage process */
/* output is in 11.5(q5) format for the 1024 point */
/* output is in 9.7(q7) format for the 256 point */
/* output is in 7.9(q9) format for the 64 point */
/* output is in 5.11(q11) format for the 16 point */
#endif /* #if defined (ARM_MATH_DSP) */
}
/**
@brief Core function for the Q15 CIFFT butterfly process.
@param[in,out] pSrc16 points to the in-place buffer of Q15 data type
@param[in] fftLen length of the FFT
@param[in] pCoef16 points to twiddle coefficient buffer
@param[in] twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
@return none
*/
/*
* Radix-4 IFFT algorithm used is :
*
* CIFFT uses same twiddle coefficients as CFFT function
* x[k] = x[n] + (j)k * x[n + fftLen/4] + (-1)k * x[n+fftLen/2] + (-j)k * x[n+3*fftLen/4]
*
*
* IFFT is implemented with following changes in equations from FFT
*
* Input real and imaginary data:
* x(n) = xa + j * ya
* x(n+N/4 ) = xb + j * yb
* x(n+N/2 ) = xc + j * yc
* x(n+3N 4) = xd + j * yd
*
*
* Output real and imaginary data:
* x(4r) = xa'+ j * ya'
* x(4r+1) = xb'+ j * yb'
* x(4r+2) = xc'+ j * yc'
* x(4r+3) = xd'+ j * yd'
*
*
* Twiddle factors for radix-4 IFFT:
* Wn = co1 + j * (si1)
* W2n = co2 + j * (si2)
* W3n = co3 + j * (si3)
* The real and imaginary output values for the radix-4 butterfly are
* xa' = xa + xb + xc + xd
* ya' = ya + yb + yc + yd
* xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1)
* yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1)
* xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2)
* yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2)
* xd' = (xa+yb-xc-yd)* co3 - (ya-xb-yc+xd)* (si3)
* yd' = (ya-xb-yc+xd)* co3 + (xa+yb-xc-yd)* (si3)
*
*/
void arm_radix4_butterfly_inverse_q15(
q15_t * pSrc16,
uint32_t fftLen,
const q15_t * pCoef16,
uint32_t twidCoefModifier)
{
#if defined (ARM_MATH_DSP)
q31_t R, S, T, U;
q31_t C1, C2, C3, out1, out2;
uint32_t n1, n2, ic, i0, j, k;
q15_t *ptr1;
q15_t *pSi0;
q15_t *pSi1;
q15_t *pSi2;
q15_t *pSi3;
q31_t xaya, xbyb, xcyc, xdyd;
/* Total process is divided into three stages */
/* process first stage, middle stages, & last stage */
/* Initializations for the first stage */
n2 = fftLen;
n1 = n2;
/* n2 = fftLen/4 */
n2 >>= 2U;
/* Index for twiddle coefficient */
ic = 0U;
/* Index for input read and output write */
j = n2;
pSi0 = pSrc16;
pSi1 = pSi0 + 2 * n2;
pSi2 = pSi1 + 2 * n2;
pSi3 = pSi2 + 2 * n2;
/* Input is in 1.15(q15) format */
/* start of first stage process */
do
{
/* Butterfly implementation */
/* Reading i0, i0+fftLen/2 inputs */
/* Read ya (real), xa(imag) input */
T = read_q15x2 (pSi0);
T = __SHADD16(T, 0);
T = __SHADD16(T, 0);
/* Read yc (real), xc(imag) input */
S = read_q15x2 (pSi2);
S = __SHADD16(S, 0);
S = __SHADD16(S, 0);
/* R = packed((ya + yc), (xa + xc) ) */
R = __QADD16(T, S);
/* S = packed((ya - yc), (xa - xc) ) */
S = __QSUB16(T, S);
/* Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
/* Read yb (real), xb(imag) input */
T = read_q15x2 (pSi1);
T = __SHADD16(T, 0);
T = __SHADD16(T, 0);
/* Read yd (real), xd(imag) input */
U = read_q15x2 (pSi3);
U = __SHADD16(U, 0);
U = __SHADD16(U, 0);
/* T = packed((yb + yd), (xb + xd) ) */
T = __QADD16(T, U);
/* writing the butterfly processed i0 sample */
/* xa' = xa + xb + xc + xd */
/* ya' = ya + yb + yc + yd */
write_q15x2_ia (&pSi0, __SHADD16(R, T));
/* R = packed((ya + yc) - (yb + yd), (xa + xc)- (xb + xd)) */
R = __QSUB16(R, T);
/* co2 & si2 are read from SIMD Coefficient pointer */
C2 = read_q15x2 ((q15_t *) pCoef16 + (4U * ic));
#ifndef ARM_MATH_BIG_ENDIAN
/* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
out1 = __SMUSD(C2, R) >> 16U;
/* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
out2 = __SMUADX(C2, R);
#else
/* xc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
out1 = __SMUADX(C2, R) >> 16U;
/* yc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
out2 = __SMUSD(__QSUB16(0, C2), R);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* Reading i0+fftLen/4 */
/* T = packed(yb, xb) */
T = read_q15x2 (pSi1);
T = __SHADD16(T, 0);
T = __SHADD16(T, 0);
/* writing the butterfly processed i0 + fftLen/4 sample */
/* writing output(xc', yc') in little endian format */
write_q15x2_ia (&pSi1, (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
/* Butterfly calculations */
/* U = packed(yd, xd) */
U = read_q15x2 (pSi3);
U = __SHADD16(U, 0);
U = __SHADD16(U, 0);
/* T = packed(yb-yd, xb-xd) */
T = __QSUB16(T, U);
#ifndef ARM_MATH_BIG_ENDIAN
/* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
R = __QSAX(S, T);
/* S = packed((ya-yc) + (xb- xd), (xa-xc) - (yb-yd)) */
S = __QASX(S, T);
#else
/* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
R = __QASX(S, T);
/* S = packed((ya-yc) - (xb- xd), (xa-xc) + (yb-yd)) */
S = __QSAX(S, T);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* co1 & si1 are read from SIMD Coefficient pointer */
C1 = read_q15x2 ((q15_t *) pCoef16 + (2U * ic));
/* Butterfly process for the i0+fftLen/2 sample */
#ifndef ARM_MATH_BIG_ENDIAN
/* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
out1 = __SMUSD(C1, S) >> 16U;
/* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
out2 = __SMUADX(C1, S);
#else
/* xb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
out1 = __SMUADX(C1, S) >> 16U;
/* yb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
out2 = __SMUSD(__QSUB16(0, C1), S);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* writing output(xb', yb') in little endian format */
write_q15x2_ia (&pSi2, ((out2) & 0xFFFF0000) | ((out1) & 0x0000FFFF));
/* co3 & si3 are read from SIMD Coefficient pointer */
C3 = read_q15x2 ((q15_t *) pCoef16 + (6U * ic));
/* Butterfly process for the i0+3fftLen/4 sample */
#ifndef ARM_MATH_BIG_ENDIAN
/* xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
out1 = __SMUSD(C3, R) >> 16U;
/* yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
out2 = __SMUADX(C3, R);
#else
/* xd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
out1 = __SMUADX(C3, R) >> 16U;
/* yd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
out2 = __SMUSD(__QSUB16(0, C3), R);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* writing output(xd', yd') in little endian format */
write_q15x2_ia (&pSi3, ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
/* Twiddle coefficients index modifier */
ic = ic + twidCoefModifier;
} while (--j);
/* data is in 4.11(q11) format */
/* end of first stage process */
/* start of middle stage process */
/* Twiddle coefficients index modifier */
twidCoefModifier <<= 2U;
/* Calculation of Middle stage */
for (k = fftLen / 4U; k > 4U; k >>= 2U)
{
/* Initializations for the middle stage */
n1 = n2;
n2 >>= 2U;
ic = 0U;
for (j = 0U; j <= (n2 - 1U); j++)
{
/* index calculation for the coefficients */
C1 = read_q15x2 ((q15_t *) pCoef16 + (2U * ic));
C2 = read_q15x2 ((q15_t *) pCoef16 + (4U * ic));
C3 = read_q15x2 ((q15_t *) pCoef16 + (6U * ic));
/* Twiddle coefficients index modifier */
ic = ic + twidCoefModifier;
pSi0 = pSrc16 + 2 * j;
pSi1 = pSi0 + 2 * n2;
pSi2 = pSi1 + 2 * n2;
pSi3 = pSi2 + 2 * n2;
/* Butterfly implementation */
for (i0 = j; i0 < fftLen; i0 += n1)
{
/* Reading i0, i0+fftLen/2 inputs */
/* Read ya (real), xa(imag) input */
T = read_q15x2 (pSi0);
/* Read yc (real), xc(imag) input */
S = read_q15x2 (pSi2);
/* R = packed( (ya + yc), (xa + xc)) */
R = __QADD16(T, S);
/* S = packed((ya - yc), (xa - xc)) */
S = __QSUB16(T, S);
/* Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
/* Read yb (real), xb(imag) input */
T = read_q15x2 (pSi1);
/* Read yd (real), xd(imag) input */
U = read_q15x2 (pSi3);
/* T = packed( (yb + yd), (xb + xd)) */
T = __QADD16(T, U);
/* writing the butterfly processed i0 sample */
/* xa' = xa + xb + xc + xd */
/* ya' = ya + yb + yc + yd */
out1 = __SHADD16(R, T);
out1 = __SHADD16(out1, 0);
write_q15x2 (pSi0, out1);
pSi0 += 2 * n1;
/* R = packed( (ya + yc) - (yb + yd), (xa + xc) - (xb + xd)) */
R = __SHSUB16(R, T);
#ifndef ARM_MATH_BIG_ENDIAN
/* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
out1 = __SMUSD(C2, R) >> 16U;
/* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
out2 = __SMUADX(C2, R);
#else
/* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
out1 = __SMUADX(R, C2) >> 16U;
/* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
out2 = __SMUSD(__QSUB16(0, C2), R);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* Reading i0+3fftLen/4 */
/* Read yb (real), xb(imag) input */
T = read_q15x2 (pSi1);
/* writing the butterfly processed i0 + fftLen/4 sample */
/* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
/* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
write_q15x2 (pSi1, ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
pSi1 += 2 * n1;
/* Butterfly calculations */
/* Read yd (real), xd(imag) input */
U = read_q15x2 (pSi3);
/* T = packed(yb-yd, xb-xd) */
T = __QSUB16(T, U);
#ifndef ARM_MATH_BIG_ENDIAN
/* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
R = __SHSAX(S, T);
/* S = packed((ya-yc) - (xb- xd), (xa-xc) + (yb-yd)) */
S = __SHASX(S, T);
/* Butterfly process for the i0+fftLen/2 sample */
out1 = __SMUSD(C1, S) >> 16U;
out2 = __SMUADX(C1, S);
#else
/* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
R = __SHASX(S, T);
/* S = packed((ya-yc) - (xb- xd), (xa-xc) + (yb-yd)) */
S = __SHSAX(S, T);
/* Butterfly process for the i0+fftLen/2 sample */
out1 = __SMUADX(S, C1) >> 16U;
out2 = __SMUSD(__QSUB16(0, C1), S);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
/* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
write_q15x2 (pSi2, ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
pSi2 += 2 * n1;
/* Butterfly process for the i0+3fftLen/4 sample */
#ifndef ARM_MATH_BIG_ENDIAN
out1 = __SMUSD(C3, R) >> 16U;
out2 = __SMUADX(C3, R);
#else
out1 = __SMUADX(C3, R) >> 16U;
out2 = __SMUSD(__QSUB16(0, C3), R);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
/* xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
/* yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
write_q15x2 (pSi3, ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
pSi3 += 2 * n1;
}
}
/* Twiddle coefficients index modifier */
twidCoefModifier <<= 2U;
}
/* end of middle stage process */
/* data is in 10.6(q6) format for the 1024 point */
/* data is in 8.8(q8) format for the 256 point */
/* data is in 6.10(q10) format for the 64 point */
/* data is in 4.12(q12) format for the 16 point */
/* Initializations for the last stage */
j = fftLen >> 2;
ptr1 = &pSrc16[0];
/* start of last stage process */
/* Butterfly implementation */
do
{
/* Read xa (real), ya(imag) input */
xaya = read_q15x2_ia ((q15_t **) &ptr1);
/* Read xb (real), yb(imag) input */
xbyb = read_q15x2_ia ((q15_t **) &ptr1);
/* Read xc (real), yc(imag) input */
xcyc = read_q15x2_ia ((q15_t **) &ptr1);
/* Read xd (real), yd(imag) input */
xdyd = read_q15x2_ia ((q15_t **) &ptr1);
/* R = packed((ya + yc), (xa + xc)) */
R = __QADD16(xaya, xcyc);
/* T = packed((yb + yd), (xb + xd)) */
T = __QADD16(xbyb, xdyd);
/* pointer updation for writing */
ptr1 = ptr1 - 8U;
/* xa' = xa + xb + xc + xd */
/* ya' = ya + yb + yc + yd */
write_q15x2_ia (&ptr1, __SHADD16(R, T));
/* T = packed((yb + yd), (xb + xd)) */
T = __QADD16(xbyb, xdyd);
/* xc' = (xa-xb+xc-xd) */
/* yc' = (ya-yb+yc-yd) */
write_q15x2_ia (&ptr1, __SHSUB16(R, T));
/* S = packed((ya - yc), (xa - xc)) */
S = __QSUB16(xaya, xcyc);
/* Read yd (real), xd(imag) input */
/* T = packed( (yb - yd), (xb - xd)) */
U = __QSUB16(xbyb, xdyd);
#ifndef ARM_MATH_BIG_ENDIAN
/* xb' = (xa+yb-xc-yd) */
/* yb' = (ya-xb-yc+xd) */
write_q15x2_ia (&ptr1, __SHASX(S, U));
/* xd' = (xa-yb-xc+yd) */
/* yd' = (ya+xb-yc-xd) */
write_q15x2_ia (&ptr1, __SHSAX(S, U));
#else
/* xb' = (xa+yb-xc-yd) */
/* yb' = (ya-xb-yc+xd) */
write_q15x2_ia (&ptr1, __SHSAX(S, U));
/* xd' = (xa-yb-xc+yd) */
/* yd' = (ya+xb-yc-xd) */
write_q15x2_ia (&ptr1, __SHASX(S, U));
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
} while (--j);
/* end of last stage process */
/* output is in 11.5(q5) format for the 1024 point */
/* output is in 9.7(q7) format for the 256 point */
/* output is in 7.9(q9) format for the 64 point */
/* output is in 5.11(q11) format for the 16 point */
#else /* arm_radix4_butterfly_inverse_q15 */
q15_t R0, R1, S0, S1, T0, T1, U0, U1;
q15_t Co1, Si1, Co2, Si2, Co3, Si3, out1, out2;
uint32_t n1, n2, ic, i0, i1, i2, i3, j, k;
/* Total process is divided into three stages */
/* process first stage, middle stages, & last stage */
/* Initializations for the first stage */
n2 = fftLen;
n1 = n2;
/* n2 = fftLen/4 */
n2 >>= 2U;
/* Index for twiddle coefficient */
ic = 0U;
/* Index for input read and output write */
i0 = 0U;
j = n2;
/* Input is in 1.15(q15) format */
/* Start of first stage process */
do
{
/* Butterfly implementation */
/* index calculation for the input as, */
/* pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
i1 = i0 + n2;
i2 = i1 + n2;
i3 = i2 + n2;
/* Reading i0, i0+fftLen/2 inputs */
/* input is down scale by 4 to avoid overflow */
/* Read ya (real), xa(imag) input */
T0 = pSrc16[i0 * 2U] >> 2U;
T1 = pSrc16[(i0 * 2U) + 1U] >> 2U;
/* input is down scale by 4 to avoid overflow */
/* Read yc (real), xc(imag) input */
S0 = pSrc16[i2 * 2U] >> 2U;
S1 = pSrc16[(i2 * 2U) + 1U] >> 2U;
/* R0 = (ya + yc), R1 = (xa + xc) */
R0 = __SSAT(T0 + S0, 16U);
R1 = __SSAT(T1 + S1, 16U);
/* S0 = (ya - yc), S1 = (xa - xc) */
S0 = __SSAT(T0 - S0, 16U);
S1 = __SSAT(T1 - S1, 16U);
/* Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
/* input is down scale by 4 to avoid overflow */
/* Read yb (real), xb(imag) input */
T0 = pSrc16[i1 * 2U] >> 2U;
T1 = pSrc16[(i1 * 2U) + 1U] >> 2U;
/* Read yd (real), xd(imag) input */
/* input is down scale by 4 to avoid overflow */
U0 = pSrc16[i3 * 2U] >> 2U;
U1 = pSrc16[(i3 * 2U) + 1U] >> 2U;
/* T0 = (yb + yd), T1 = (xb + xd) */
T0 = __SSAT(T0 + U0, 16U);
T1 = __SSAT(T1 + U1, 16U);
/* writing the butterfly processed i0 sample */
/* xa' = xa + xb + xc + xd */
/* ya' = ya + yb + yc + yd */
pSrc16[i0 * 2U] = (R0 >> 1U) + (T0 >> 1U);
pSrc16[(i0 * 2U) + 1U] = (R1 >> 1U) + (T1 >> 1U);
/* R0 = (ya + yc) - (yb + yd), R1 = (xa + xc)- (xb + xd) */
R0 = __SSAT(R0 - T0, 16U);
R1 = __SSAT(R1 - T1, 16U);
/* co2 & si2 are read from Coefficient pointer */
Co2 = pCoef16[2U * ic * 2U];
Si2 = pCoef16[(2U * ic * 2U) + 1U];
/* xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */
out1 = (q15_t) ((Co2 * R0 - Si2 * R1) >> 16U);
/* yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
out2 = (q15_t) ((Si2 * R0 + Co2 * R1) >> 16U);
/* Reading i0+fftLen/4 */
/* input is down scale by 4 to avoid overflow */
/* T0 = yb, T1 = xb */
T0 = pSrc16[i1 * 2U] >> 2U;
T1 = pSrc16[(i1 * 2U) + 1U] >> 2U;
/* writing the butterfly processed i0 + fftLen/4 sample */
/* writing output(xc', yc') in little endian format */
pSrc16[i1 * 2U] = out1;
pSrc16[(i1 * 2U) + 1U] = out2;
/* Butterfly calculations */
/* input is down scale by 4 to avoid overflow */
/* U0 = yd, U1 = xd) */
U0 = pSrc16[i3 * 2U] >> 2U;
U1 = pSrc16[(i3 * 2U) + 1U] >> 2U;
/* T0 = yb-yd, T1 = xb-xd) */
T0 = __SSAT(T0 - U0, 16U);
T1 = __SSAT(T1 - U1, 16U);
/* R0 = (ya-yc) - (xb- xd) , R1 = (xa-xc) + (yb-yd) */
R0 = (q15_t) __SSAT((q31_t) (S0 + T1), 16);
R1 = (q15_t) __SSAT((q31_t) (S1 - T0), 16);
/* S = (ya-yc) + (xb- xd), S1 = (xa-xc) - (yb-yd) */
S0 = (q15_t) __SSAT((q31_t) (S0 - T1), 16);
S1 = (q15_t) __SSAT((q31_t) (S1 + T0), 16);
/* co1 & si1 are read from Coefficient pointer */
Co1 = pCoef16[ic * 2U];
Si1 = pCoef16[(ic * 2U) + 1U];
/* Butterfly process for the i0+fftLen/2 sample */
/* xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1) */
out1 = (q15_t) ((Co1 * S0 - Si1 * S1) >> 16U);
/* yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1) */
out2 = (q15_t) ((Si1 * S0 + Co1 * S1) >> 16U);
/* writing output(xb', yb') in little endian format */
pSrc16[i2 * 2U] = out1;
pSrc16[(i2 * 2U) + 1U] = out2;
/* Co3 & si3 are read from Coefficient pointer */
Co3 = pCoef16[3U * ic * 2U];
Si3 = pCoef16[(3U * ic * 2U) + 1U];
/* Butterfly process for the i0+3fftLen/4 sample */
/* xd' = (xa+yb-xc-yd)* Co3 - (ya-xb-yc+xd)* (si3) */
out1 = (q15_t) ((Co3 * R0 - Si3 * R1) >> 16U);
/* yd' = (ya-xb-yc+xd)* Co3 + (xa+yb-xc-yd)* (si3) */
out2 = (q15_t) ((Si3 * R0 + Co3 * R1) >> 16U);
/* writing output(xd', yd') in little endian format */
pSrc16[i3 * 2U] = out1;
pSrc16[(i3 * 2U) + 1U] = out2;
/* Twiddle coefficients index modifier */
ic = ic + twidCoefModifier;
/* Updating input index */
i0 = i0 + 1U;
} while (--j);
/* End of first stage process */
/* data is in 4.11(q11) format */
/* Start of Middle stage process */
/* Twiddle coefficients index modifier */
twidCoefModifier <<= 2U;
/* Calculation of Middle stage */
for (k = fftLen / 4U; k > 4U; k >>= 2U)
{
/* Initializations for the middle stage */
n1 = n2;
n2 >>= 2U;
ic = 0U;
for (j = 0U; j <= (n2 - 1U); j++)
{
/* index calculation for the coefficients */
Co1 = pCoef16[ic * 2U];
Si1 = pCoef16[(ic * 2U) + 1U];
Co2 = pCoef16[2U * ic * 2U];
Si2 = pCoef16[2U * ic * 2U + 1U];
Co3 = pCoef16[3U * ic * 2U];
Si3 = pCoef16[(3U * ic * 2U) + 1U];
/* Twiddle coefficients index modifier */
ic = ic + twidCoefModifier;
/* Butterfly implementation */
for (i0 = j; i0 < fftLen; i0 += n1)
{
/* index calculation for the input as, */
/* pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
i1 = i0 + n2;
i2 = i1 + n2;
i3 = i2 + n2;
/* Reading i0, i0+fftLen/2 inputs */
/* Read ya (real), xa(imag) input */
T0 = pSrc16[i0 * 2U];
T1 = pSrc16[(i0 * 2U) + 1U];
/* Read yc (real), xc(imag) input */
S0 = pSrc16[i2 * 2U];
S1 = pSrc16[(i2 * 2U) + 1U];
/* R0 = (ya + yc), R1 = (xa + xc) */
R0 = __SSAT(T0 + S0, 16U);
R1 = __SSAT(T1 + S1, 16U);
/* S0 = (ya - yc), S1 = (xa - xc) */
S0 = __SSAT(T0 - S0, 16U);
S1 = __SSAT(T1 - S1, 16U);
/* Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
/* Read yb (real), xb(imag) input */
T0 = pSrc16[i1 * 2U];
T1 = pSrc16[(i1 * 2U) + 1U];
/* Read yd (real), xd(imag) input */
U0 = pSrc16[i3 * 2U];
U1 = pSrc16[(i3 * 2U) + 1U];
/* T0 = (yb + yd), T1 = (xb + xd) */
T0 = __SSAT(T0 + U0, 16U);
T1 = __SSAT(T1 + U1, 16U);
/* writing the butterfly processed i0 sample */
/* xa' = xa + xb + xc + xd */
/* ya' = ya + yb + yc + yd */
pSrc16[i0 * 2U] = ((R0 >> 1U) + (T0 >> 1U)) >> 1U;
pSrc16[(i0 * 2U) + 1U] = ((R1 >> 1U) + (T1 >> 1U)) >> 1U;
/* R0 = (ya + yc) - (yb + yd), R1 = (xa + xc) - (xb + xd) */
R0 = (R0 >> 1U) - (T0 >> 1U);
R1 = (R1 >> 1U) - (T1 >> 1U);
/* (ya-yb+yc-yd)* (si2) - (xa-xb+xc-xd)* co2 */
out1 = (q15_t) ((Co2 * R0 - Si2 * R1) >> 16);
/* (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
out2 = (q15_t) ((Si2 * R0 + Co2 * R1) >> 16);
/* Reading i0+3fftLen/4 */
/* Read yb (real), xb(imag) input */
T0 = pSrc16[i1 * 2U];
T1 = pSrc16[(i1 * 2U) + 1U];
/* writing the butterfly processed i0 + fftLen/4 sample */
/* xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */
/* yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
pSrc16[i1 * 2U] = out1;
pSrc16[(i1 * 2U) + 1U] = out2;
/* Butterfly calculations */
/* Read yd (real), xd(imag) input */
U0 = pSrc16[i3 * 2U];
U1 = pSrc16[(i3 * 2U) + 1U];
/* T0 = yb-yd, T1 = xb-xd) */
T0 = __SSAT(T0 - U0, 16U);
T1 = __SSAT(T1 - U1, 16U);
/* R0 = (ya-yc) - (xb- xd) , R1 = (xa-xc) + (yb-yd) */
R0 = (S0 >> 1U) + (T1 >> 1U);
R1 = (S1 >> 1U) - (T0 >> 1U);
/* S1 = (ya-yc) + (xb- xd), S1 = (xa-xc) - (yb-yd) */
S0 = (S0 >> 1U) - (T1 >> 1U);
S1 = (S1 >> 1U) + (T0 >> 1U);
/* Butterfly process for the i0+fftLen/2 sample */
out1 = (q15_t) ((Co1 * S0 - Si1 * S1) >> 16U);
out2 = (q15_t) ((Si1 * S0 + Co1 * S1) >> 16U);
/* xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1) */
/* yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1) */
pSrc16[i2 * 2U] = out1;
pSrc16[(i2 * 2U) + 1U] = out2;
/* Butterfly process for the i0+3fftLen/4 sample */
out1 = (q15_t) ((Co3 * R0 - Si3 * R1) >> 16U);
out2 = (q15_t) ((Si3 * R0 + Co3 * R1) >> 16U);
/* xd' = (xa+yb-xc-yd)* Co3 - (ya-xb-yc+xd)* (si3) */
/* yd' = (ya-xb-yc+xd)* Co3 + (xa+yb-xc-yd)* (si3) */
pSrc16[i3 * 2U] = out1;
pSrc16[(i3 * 2U) + 1U] = out2;
}
}
/* Twiddle coefficients index modifier */
twidCoefModifier <<= 2U;
}
/* End of Middle stages process */
/* data is in 10.6(q6) format for the 1024 point */
/* data is in 8.8(q8) format for the 256 point */
/* data is in 6.10(q10) format for the 64 point */
/* data is in 4.12(q12) format for the 16 point */
/* start of last stage process */
/* Initializations for the last stage */
n1 = n2;
n2 >>= 2U;
/* Butterfly implementation */
for (i0 = 0U; i0 <= (fftLen - n1); i0 += n1)
{
/* index calculation for the input as, */
/* pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
i1 = i0 + n2;
i2 = i1 + n2;
i3 = i2 + n2;
/* Reading i0, i0+fftLen/2 inputs */
/* Read ya (real), xa(imag) input */
T0 = pSrc16[i0 * 2U];
T1 = pSrc16[(i0 * 2U) + 1U];
/* Read yc (real), xc(imag) input */
S0 = pSrc16[i2 * 2U];
S1 = pSrc16[(i2 * 2U) + 1U];
/* R0 = (ya + yc), R1 = (xa + xc) */
R0 = __SSAT(T0 + S0, 16U);
R1 = __SSAT(T1 + S1, 16U);
/* S0 = (ya - yc), S1 = (xa - xc) */
S0 = __SSAT(T0 - S0, 16U);
S1 = __SSAT(T1 - S1, 16U);
/* Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
/* Read yb (real), xb(imag) input */
T0 = pSrc16[i1 * 2U];
T1 = pSrc16[(i1 * 2U) + 1U];
/* Read yd (real), xd(imag) input */
U0 = pSrc16[i3 * 2U];
U1 = pSrc16[(i3 * 2U) + 1U];
/* T0 = (yb + yd), T1 = (xb + xd) */
T0 = __SSAT(T0 + U0, 16U);
T1 = __SSAT(T1 + U1, 16U);
/* writing the butterfly processed i0 sample */
/* xa' = xa + xb + xc + xd */
/* ya' = ya + yb + yc + yd */
pSrc16[i0 * 2U] = (R0 >> 1U) + (T0 >> 1U);
pSrc16[(i0 * 2U) + 1U] = (R1 >> 1U) + (T1 >> 1U);
/* R0 = (ya + yc) - (yb + yd), R1 = (xa + xc) - (xb + xd) */
R0 = (R0 >> 1U) - (T0 >> 1U);
R1 = (R1 >> 1U) - (T1 >> 1U);
/* Read yb (real), xb(imag) input */
T0 = pSrc16[i1 * 2U];
T1 = pSrc16[(i1 * 2U) + 1U];
/* writing the butterfly processed i0 + fftLen/4 sample */
/* xc' = (xa-xb+xc-xd) */
/* yc' = (ya-yb+yc-yd) */
pSrc16[i1 * 2U] = R0;
pSrc16[(i1 * 2U) + 1U] = R1;
/* Read yd (real), xd(imag) input */
U0 = pSrc16[i3 * 2U];
U1 = pSrc16[(i3 * 2U) + 1U];
/* T0 = (yb - yd), T1 = (xb - xd) */
T0 = __SSAT(T0 - U0, 16U);
T1 = __SSAT(T1 - U1, 16U);
/* writing the butterfly processed i0 + fftLen/2 sample */
/* xb' = (xa-yb-xc+yd) */
/* yb' = (ya+xb-yc-xd) */
pSrc16[i2 * 2U] = (S0 >> 1U) - (T1 >> 1U);
pSrc16[(i2 * 2U) + 1U] = (S1 >> 1U) + (T0 >> 1U);
/* writing the butterfly processed i0 + 3fftLen/4 sample */
/* xd' = (xa+yb-xc-yd) */
/* yd' = (ya-xb-yc+xd) */
pSrc16[i3 * 2U] = (S0 >> 1U) + (T1 >> 1U);
pSrc16[(i3 * 2U) + 1U] = (S1 >> 1U) - (T0 >> 1U);
}
/* end of last stage process */
/* output is in 11.5(q5) format for the 1024 point */
/* output is in 9.7(q7) format for the 256 point */
/* output is in 7.9(q9) format for the 64 point */
/* output is in 5.11(q11) format for the 16 point */
#endif /* #if defined (ARM_MATH_DSP) */
}
extern void arm_radix4_butterfly_q15(
q15_t * pSrc,
uint32_t fftLen,
const q15_t * pCoef,
uint32_t twidCoefModifier);
extern void arm_radix4_butterfly_inverse_q15(
q15_t * pSrc,
uint32_t fftLen,
const q15_t * pCoef,
uint32_t twidCoefModifier);
extern void arm_bitreversal_16(
uint16_t * pSrc,
const uint16_t bitRevLen,
const uint16_t * pBitRevTable);
void arm_cfft_radix4by2_q15(
q15_t * pSrc,
uint32_t fftLen,
const q15_t * pCoef);
void arm_cfft_radix4by2_inverse_q15(
q15_t * pSrc,
uint32_t fftLen,
const q15_t * pCoef);
void arm_cfft_q15(
const arm_cfft_instance_q15 * S,
q15_t * p1,
uint8_t ifftFlag,
uint8_t bitReverseFlag)
{
uint32_t L = S->fftLen;
if (ifftFlag == 1U)
{
switch (L)
{
case 16:
case 64:
case 256:
case 1024:
case 4096:
arm_radix4_butterfly_inverse_q15 ( p1, L, (q15_t*)S->pTwiddle, 1 );
break;
case 32:
case 128:
case 512:
case 2048:
arm_cfft_radix4by2_inverse_q15 ( p1, L, S->pTwiddle );
break;
}
}
else
{
switch (L)
{
case 16:
case 64:
case 256:
case 1024:
case 4096:
arm_radix4_butterfly_q15 ( p1, L, (q15_t*)S->pTwiddle, 1 );
break;
case 32:
case 128:
case 512:
case 2048:
arm_cfft_radix4by2_q15 ( p1, L, S->pTwiddle );
break;
}
}
if ( bitReverseFlag )
arm_bitreversal_16 ((uint16_t*) p1, S->bitRevLength, S->pBitRevTable);
}
void arm_cfft_radix4by2_q15(
q15_t * pSrc,
uint32_t fftLen,
const q15_t * pCoef)
{
uint32_t i;
uint32_t n2;
q15_t p0, p1, p2, p3;
uint32_t l;
q15_t xt, yt, cosVal, sinVal;
n2 = fftLen >> 1U;
for (i = 0; i < n2; i++)
{
cosVal = pCoef[2 * i];
sinVal = pCoef[2 * i + 1];
l = i + n2;
xt = (pSrc[2 * i] >> 1U) - (pSrc[2 * l] >> 1U);
pSrc[2 * i] = ((pSrc[2 * i] >> 1U) + (pSrc[2 * l] >> 1U)) >> 1U;
yt = (pSrc[2 * i + 1] >> 1U) - (pSrc[2 * l + 1] >> 1U);
pSrc[2 * i + 1] = ((pSrc[2 * l + 1] >> 1U) + (pSrc[2 * i + 1] >> 1U)) >> 1U;
pSrc[2 * l] = (((int16_t) (((q31_t) xt * cosVal) >> 16U)) +
((int16_t) (((q31_t) yt * sinVal) >> 16U)) );
pSrc[2 * l + 1] = (((int16_t) (((q31_t) yt * cosVal) >> 16U)) -
((int16_t) (((q31_t) xt * sinVal) >> 16U)) );
}
arm_radix4_butterfly_q15( pSrc, n2, (q15_t*)pCoef, 2U);
/* second col */
arm_radix4_butterfly_q15( pSrc + fftLen, n2, (q15_t*)pCoef, 2U);
n2 = fftLen >> 1U;
for (i = 0; i < n2; i++)
{
p0 = pSrc[4 * i + 0];
p1 = pSrc[4 * i + 1];
p2 = pSrc[4 * i + 2];
p3 = pSrc[4 * i + 3];
p0 <<= 1U;
p1 <<= 1U;
p2 <<= 1U;
p3 <<= 1U;
pSrc[4 * i + 0] = p0;
pSrc[4 * i + 1] = p1;
pSrc[4 * i + 2] = p2;
pSrc[4 * i + 3] = p3;
}
}
void arm_cfft_radix4by2_inverse_q15(
q15_t * pSrc,
uint32_t fftLen,
const q15_t * pCoef)
{
uint32_t i;
uint32_t n2;
q15_t p0, p1, p2, p3;
uint32_t l;
q15_t xt, yt, cosVal, sinVal;
n2 = fftLen >> 1U;
for (i = 0; i < n2; i++)
{
cosVal = pCoef[2 * i];
sinVal = pCoef[2 * i + 1];
l = i + n2;
xt = (pSrc[2 * i] >> 1U) - (pSrc[2 * l] >> 1U);
pSrc[2 * i] = ((pSrc[2 * i] >> 1U) + (pSrc[2 * l] >> 1U)) >> 1U;
yt = (pSrc[2 * i + 1] >> 1U) - (pSrc[2 * l + 1] >> 1U);
pSrc[2 * i + 1] = ((pSrc[2 * l + 1] >> 1U) + (pSrc[2 * i + 1] >> 1U)) >> 1U;
pSrc[2 * l] = (((int16_t) (((q31_t) xt * cosVal) >> 16U)) -
((int16_t) (((q31_t) yt * sinVal) >> 16U)) );
pSrc[2 * l + 1] = (((int16_t) (((q31_t) yt * cosVal) >> 16U)) +
((int16_t) (((q31_t) xt * sinVal) >> 16U)) );
}
/* first col */
arm_radix4_butterfly_inverse_q15( pSrc, n2, (q15_t*)pCoef, 2U);
/* second col */
arm_radix4_butterfly_inverse_q15( pSrc + fftLen, n2, (q15_t*)pCoef, 2U);
n2 = fftLen >> 1U;
for (i = 0; i < n2; i++)
{
p0 = pSrc[4 * i + 0];
p1 = pSrc[4 * i + 1];
p2 = pSrc[4 * i + 2];
p3 = pSrc[4 * i + 3];
p0 <<= 1U;
p1 <<= 1U;
p2 <<= 1U;
p3 <<= 1U;
pSrc[4 * i + 0] = p0;
pSrc[4 * i + 1] = p1;
pSrc[4 * i + 2] = p2;
pSrc[4 * i + 3] = p3;
}
}
代码使用查表法对FFT的蝶形运算进行优化,对于32点的FFT算法运行时间390us,64点的FFT算法930us。相对使用C语言实现的FFT没有优化前的速度提高了100倍,算法的强大。空间占有为6kb~7kb之间。满足项目需求。
误差分析
由于使用是Q15类型的FFT算法而不是浮点型算法,空间减少许多和速度大大加快,但是算法的误差必然是一个问题。这是一个难以避免的问题。使用matlab代码对算法的误差进行分析。
matlab的代码如下:
clear;
clc;
% Test data initialization
N = 1:64;
sample_frequecy = 3000;
test_data = 800*sin(2*3.14*100*N/sample_frequecy);
test_data = round(test_data);
plot(test_data);
% fft result
fft_data = fft(test_data);
[max_value,max_index] = max(fft_data(1:33));
% Drawing FFT spectrum
plot(N,abs(fft_data)/64);
data_real = real(fft_data);
data_imag = imag(fft_data);
和单片机运行的结果进行对比,当峰值点时,误差在5%以下时可以保证的,并且当峰值越大,误差越小。而值越小,误差越大。
所以如果计算最大值的位置这种应用,Q15类型FFT完全可以胜任。比如这次的项目,单目标的测距和测速。需要就是最大点的位置信息而不是幅度信息。
参考
【STM32H7的DSP教程】第6章 ARM DSP源码和库移植方法(MDK5的AC5和AC6)(https://blog.csdn.net/Simon223/article/details/105311500)