fft_altivec.c

Go to the documentation of this file.
00001 /*
00002  * FFT/IFFT transforms
00003  * AltiVec-enabled
00004  * Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org>
00005  * Based on code Copyright (c) 2002 Fabrice Bellard.
00006  *
00007  * This file is part of FFmpeg.
00008  *
00009  * FFmpeg is free software; you can redistribute it and/or
00010  * modify it under the terms of the GNU Lesser General Public
00011  * License as published by the Free Software Foundation; either
00012  * version 2.1 of the License, or (at your option) any later version.
00013  *
00014  * FFmpeg is distributed in the hope that it will be useful,
00015  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00016  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00017  * Lesser General Public License for more details.
00018  *
00019  * You should have received a copy of the GNU Lesser General Public
00020  * License along with FFmpeg; if not, write to the Free Software
00021  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
00022  */
00023 #include "libavcodec/dsputil.h"
00024 
00025 #include "gcc_fixes.h"
00026 
00027 #include "dsputil_ppc.h"
00028 #include "util_altivec.h"
00029 /*
00030   those three macros are from libavcodec/fft.c
00031   and are required for the reference C code
00032 */
00033 /* butter fly op */
00034 #define BF(pre, pim, qre, qim, pre1, pim1, qre1, qim1) \
00035 {\
00036   FFTSample ax, ay, bx, by;\
00037   bx=pre1;\
00038   by=pim1;\
00039   ax=qre1;\
00040   ay=qim1;\
00041   pre = (bx + ax);\
00042   pim = (by + ay);\
00043   qre = (bx - ax);\
00044   qim = (by - ay);\
00045 }
00046 #define MUL16(a,b) ((a) * (b))
00047 #define CMUL(pre, pim, are, aim, bre, bim) \
00048 {\
00049    pre = (MUL16(are, bre) - MUL16(aim, bim));\
00050    pim = (MUL16(are, bim) + MUL16(bre, aim));\
00051 }
00052 
00053 
00054 /**
00055  * Do a complex FFT with the parameters defined in ff_fft_init(). The
00056  * input data must be permuted before with s->revtab table. No
00057  * 1.0/sqrt(n) normalization is done.
00058  * AltiVec-enabled
00059  * This code assumes that the 'z' pointer is 16 bytes-aligned
00060  * It also assumes all FFTComplex are 8 bytes-aligned pair of float
00061  * The code is exactly the same as the SSE version, except
00062  * that successive MUL + ADD/SUB have been merged into
00063  * fused multiply-add ('vec_madd' in altivec)
00064  */
00065 void ff_fft_calc_altivec(FFTContext *s, FFTComplex *z)
00066 {
00067 POWERPC_PERF_DECLARE(altivec_fft_num, s->nbits >= 6);
00068     register const vector float vczero = (const vector float)vec_splat_u32(0.);
00069 
00070     int ln = s->nbits;
00071     int j, np, np2;
00072     int nblocks, nloops;
00073     register FFTComplex *p, *q;
00074     FFTComplex *cptr, *cptr1;
00075     int k;
00076 
00077 POWERPC_PERF_START_COUNT(altivec_fft_num, s->nbits >= 6);
00078 
00079     np = 1 << ln;
00080 
00081     {
00082         vector float *r, a, b, a1, c1, c2;
00083 
00084         r = (vector float *)&z[0];
00085 
00086         c1 = vcii(p,p,n,n);
00087 
00088         if (s->inverse)
00089             {
00090                 c2 = vcii(p,p,n,p);
00091             }
00092         else
00093             {
00094                 c2 = vcii(p,p,p,n);
00095             }
00096 
00097         j = (np >> 2);
00098         do {
00099             a = vec_ld(0, r);
00100             a1 = vec_ld(sizeof(vector float), r);
00101 
00102             b = vec_perm(a,a,vcprmle(1,0,3,2));
00103             a = vec_madd(a,c1,b);
00104             /* do the pass 0 butterfly */
00105 
00106             b = vec_perm(a1,a1,vcprmle(1,0,3,2));
00107             b = vec_madd(a1,c1,b);
00108             /* do the pass 0 butterfly */
00109 
00110             /* multiply third by -i */
00111             b = vec_perm(b,b,vcprmle(2,3,1,0));
00112 
00113             /* do the pass 1 butterfly */
00114             vec_st(vec_madd(b,c2,a), 0, r);
00115             vec_st(vec_nmsub(b,c2,a), sizeof(vector float), r);
00116 
00117             r += 2;
00118         } while (--j != 0);
00119     }
00120     /* pass 2 .. ln-1 */
00121 
00122     nblocks = np >> 3;
00123     nloops = 1 << 2;
00124     np2 = np >> 1;
00125 
00126     cptr1 = s->exptab1;
00127     do {
00128         p = z;
00129         q = z + nloops;
00130         j = nblocks;
00131         do {
00132             cptr = cptr1;
00133             k = nloops >> 1;
00134             do {
00135                 vector float a,b,c,t1;
00136 
00137                 a = vec_ld(0, (float*)p);
00138                 b = vec_ld(0, (float*)q);
00139 
00140                 /* complex mul */
00141                 c = vec_ld(0, (float*)cptr);
00142                 /*  cre*re cim*re */
00143                 t1 = vec_madd(c, vec_perm(b,b,vcprmle(2,2,0,0)),vczero);
00144                 c = vec_ld(sizeof(vector float), (float*)cptr);
00145                 /*  -cim*im cre*im */
00146                 b = vec_madd(c, vec_perm(b,b,vcprmle(3,3,1,1)),t1);
00147 
00148                 /* butterfly */
00149                 vec_st(vec_add(a,b), 0, (float*)p);
00150                 vec_st(vec_sub(a,b), 0, (float*)q);
00151 
00152                 p += 2;
00153                 q += 2;
00154                 cptr += 4;
00155             } while (--k);
00156 
00157             p += nloops;
00158             q += nloops;
00159         } while (--j);
00160         cptr1 += nloops * 2;
00161         nblocks = nblocks >> 1;
00162         nloops = nloops << 1;
00163     } while (nblocks != 0);
00164 
00165 POWERPC_PERF_STOP_COUNT(altivec_fft_num, s->nbits >= 6);
00166 }

Generated on Fri Jan 9 12:44:29 2009 for libextractor by  doxygen 1.5.1