Code snippet: Fast Fourier Transform

Fast Fourier Transform
Doing it the fast way. O(N log(N)) operations.
The derivation of the algorithm in doFFT() comes from the following two iterations of the code:
uc-fft-v1.c.bak file
uc-fft-v2.c.bak file
Compiles clean (or should) with: gcc -o uc-fft uc-fft.c -Wall
/** (c)2008 Max Vilimpoc, All Rights Reserved */

/** 
 * FFT Module, works for N up to 256, after which
 * the bitshift has to be modified.
 */

#ifndef __UC_PRODUCTION__
#include <stdio.h>
#endif

#include <stdint.h>
#include <math.h>
#include <string.h>

enum TE_CONSTS
{
    N       = 64,
    M       = N/2,
};

typedef struct
{
    float re;
    float im; 
} CPLX;

static uint8_t LOG2N;

static CPLX Xf[N];
static CPLX Tw[M];

/** Complex math functions, because C sucks. */
static void radAngleToCplx(CPLX * const out, float radAngle)
{
    out->re =  cos(radAngle);
    out->im = -sin(radAngle);
}

static void cplxAddInPlace(CPLX * inoutA, CPLX * inB)
{
    inoutA->re += inB->re;
    inoutA->im += inB->im;
}

static CPLX cplxSub(CPLX * inA, CPLX * inB)
{
    CPLX out;
    
    out.re = inA->re - inB->re;
    out.im = inA->im - inB->im;
    
    return out;
}

static CPLX cplxMult(const CPLX * const inA, const CPLX * const inB)
{
    CPLX out;
    
    out.re = (inA->re * inB->re) - (inA->im * inB->im);
    out.im = (inA->re * inB->im) + (inA->im * inB->re);
    
    return out;
}

static void cplxPrintf(const CPLX * const in)
{
    printf("%+3.3f + %+3.3f\n", (float) in->re, (float) in->im);
}

static void dumpArray(const CPLX * inA, uint32_t length)
{
    uint32_t n8;
    
    for (n8 = 0; n8 < length; ++n8)
    {
        /* Output sample array normal order. */
        cplxPrintf(inA++);
    }
}


static void bitReverseSamples(void)
{
    uint8_t n8, r8;
    CPLX temp;

    dumpArray(Xf, N);

    for (n8 = 0; n8 < M; ++n8)
    {
        r8  = 0;
        r8 |= (n8 & 0x80) >> 7;
        r8 |= (n8 & 0x40) >> 5;
        r8 |= (n8 & 0x20) >> 3;
        r8 |= (n8 & 0x10) >> 1;
        r8 |= (n8 & 0x08) << 1;
        r8 |= (n8 & 0x04) << 3;
        r8 |= (n8 & 0x02) << 5;
        r8 |= (n8 & 0x01) << 7;
        r8  = r8 >> (8 - LOG2N);
        
        temp   = Xf[r8];
        Xf[r8] = Xf[n8];
        Xf[n8] = temp;   
        
        /* this could be done w/rotate left/right plus carry in 16 cycles. */
        
        printf("n8: %d, r8: %d\n", n8, r8);
    }
    
    dumpArray(Xf, N);
}

static void butterfly(CPLX * const XfA, CPLX * const XfB, CPLX * const tw)
{
    CPLX XfB2 = cplxMult(XfB, tw);

    // XfB = XfA - XfB * tw;
    *XfB = cplxSub(XfA, &XfB2);

    // XfA = XfA + XfB * tw;
    cplxAddInPlace(XfA, &XfB2);
}


static void initTwiddles()
{
    LOG2N = log10(N) / log10(2);
    printf("LOG2N: %u\n", LOG2N);
    
    uint8_t k;
    double angle; 
    for (k = 0; k < M; ++k)
    {
        angle = 2 * M_PI * k / N;
        printf("angle = %d / %d * 2PI = %f ", k, N, angle);
        radAngleToCplx(&Tw[k], angle);
        cplxPrintf(&Tw[k]);
    }
}

static void initSamples()
{
    Xf[0].re = 1;
    Xf[1].re = 1;
}

void doFFT(uint8_t numPts)
{
    uint8_t LOG2N      = log10(numPts) / log10(2);
    uint8_t maxAffineA = LOG2N - 1;
    uint8_t pass, i, down;
    uint8_t limitA, limitB, limitC;
    
    for (pass = 0; pass < LOG2N; ++pass)
    {
        limitA = 1 << (maxAffineA - pass);
        limitB = 1 <<  pass;
        limitC = limitB << 1;

        for (i = 0; i < limitA; ++i)
        {
            for (down = 0; down < limitB; ++down)
            {
                butterfly(&Xf[limitC * i + down], &Xf[limitC * i + down + limitB], &Tw[down]);
            }
        }
    }
}


int main(int argc, char **argv)
{
    initTwiddles();
    initSamples();
    
    bitReverseSamples();
    
    printf("sizeof(Xf): %lu\n", sizeof(Xf));
    
    printf("Xf:\n");
    dumpArray(Xf, N);
    printf("Tw:\n");
    dumpArray(Tw, M);
    
    // FFT full-length reconstruction.

/*
    // 1pt -> 2pt DFTs. 
    for (i = 0; i < (1 << 3); ++i)
    {
        for (down = 0; down < (1 << 0); ++down)
        {
            butterfly(&Xf[(1 << 1) * i + down], &Xf[(1 << 1) * i + down + (1 << 0)], &Tw[down]);
        }
    }
    printf("Xf (2pts done):\n");
    dumpArray(Xf, N);

    // 2pt -> 4pt DFTs. 
    for (i = 0; i < (1 << 2); ++i)
    {
        for (down = 0; down < (1 << 1); ++down)
        {
            butterfly(&Xf[(1 << 2) * i + down],  &Xf[(1 << 2) * i + down + (1 << 1)],   &Tw[down]);
        }
    }
    printf("Xf (4pts done):\n");
    dumpArray(Xf, N);

    // 4pt -> 8pt DFTs. 
    for (i = 0; i < (1 << 1); ++i)
    {
        for (down = 0; down < (1 << 2); ++down)
        {
            butterfly(&Xf[(1 << 3) * i + down],  &Xf[(1 << 3) * i + down + (1 << 2)],   &Tw[down]);
        }
    }
    printf("Xf (8pts done):\n");
    dumpArray(Xf, N);

    // 8 pt -> 16pt DFTs. 
    for (i = 0; i < (1 << 0); ++i)
    {
        for (down = 0; down < (1 << 3); ++down)
        {
            butterfly(&Xf[(1 << 4) * i + down],  &Xf[(1 << 4) * i + down + (1 << 3)],   &Tw[down]);
        }
    }
    printf("Xf (16pts done):\n");
    dumpArray(Xf, N);
*/

    doFFT(N);

    printf("Xf (%d pts done):\n", N);
    dumpArray(Xf, N);

    return 0;
}