fft_3dn2.c

Go to the documentation of this file.
00001 /*
00002  * FFT/MDCT transform with Extended 3DNow! optimizations
00003  * Copyright (c) 2006 Zuxy MENG Jie, Loren Merritt
00004  * Based on fft_sse.c copyright (c) 2002 Fabrice Bellard.
00005  *
00006  * This file is part of FFmpeg.
00007  *
00008  * FFmpeg is free software; you can redistribute it and/or
00009  * modify it under the terms of the GNU Lesser General Public
00010  * License as published by the Free Software Foundation; either
00011  * version 2.1 of the License, or (at your option) any later version.
00012  *
00013  * FFmpeg is distributed in the hope that it will be useful,
00014  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00015  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00016  * Lesser General Public License for more details.
00017  *
00018  * You should have received a copy of the GNU Lesser General Public
00019  * License along with FFmpeg; if not, write to the Free Software
00020  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
00021  */
00022 
00023 #include "libavutil/x86_cpu.h"
00024 #include "libavcodec/dsputil.h"
00025 
/*
 * 64-bit sign masks, loaded into an MMX register with movq and applied with
 * pxor by ff_fft_calc_3dn2(). Each table sets only the IEEE-754 sign bit of
 * one of its two 32-bit lanes:
 *   p1m1: lane 0 untouched, lane 1 sign-flipped
 *   m1p1: lane 0 sign-flipped, lane 1 untouched
 * The bit is written as 1U << 31 because 1 << 31 overflows a 32-bit int.
 */
static const int p1m1[2] __attribute__((aligned(8))) =
    { 0, 1U << 31 };

static const int m1p1[2] __attribute__((aligned(8))) =
    { 1U << 31, 0 };
00031 
00032 void ff_fft_calc_3dn2(FFTContext *s, FFTComplex *z)
00033 {
00034     int ln = s->nbits;
00035     long j;
00036     x86_reg i;
00037     long nblocks, nloops;
00038     FFTComplex *p, *cptr;
00039 
00040     asm volatile(
00041         /* FEMMS is not a must here but recommended by AMD */
00042         "femms \n\t"
00043         "movq %0, %%mm7 \n\t"
00044         ::"m"(*(s->inverse ? m1p1 : p1m1))
00045     );
00046 
00047     i = 8 << ln;
00048     asm volatile(
00049         "1: \n\t"
00050         "sub $32, %0 \n\t"
00051         "movq    (%0,%1), %%mm0 \n\t"
00052         "movq  16(%0,%1), %%mm1 \n\t"
00053         "movq   8(%0,%1), %%mm2 \n\t"
00054         "movq  24(%0,%1), %%mm3 \n\t"
00055         "movq      %%mm0, %%mm4 \n\t"
00056         "movq      %%mm1, %%mm5 \n\t"
00057         "pfadd     %%mm2, %%mm0 \n\t"
00058         "pfadd     %%mm3, %%mm1 \n\t"
00059         "pfsub     %%mm2, %%mm4 \n\t"
00060         "pfsub     %%mm3, %%mm5 \n\t"
00061         "movq      %%mm0, %%mm2 \n\t"
00062         "pswapd    %%mm5, %%mm5 \n\t"
00063         "movq      %%mm4, %%mm3 \n\t"
00064         "pxor      %%mm7, %%mm5 \n\t"
00065         "pfadd     %%mm1, %%mm0 \n\t"
00066         "pfadd     %%mm5, %%mm4 \n\t"
00067         "pfsub     %%mm1, %%mm2 \n\t"
00068         "pfsub     %%mm5, %%mm3 \n\t"
00069         "movq      %%mm0,   (%0,%1) \n\t"
00070         "movq      %%mm4,  8(%0,%1) \n\t"
00071         "movq      %%mm2, 16(%0,%1) \n\t"
00072         "movq      %%mm3, 24(%0,%1) \n\t"
00073         "jg 1b \n\t"
00074         :"+r"(i)
00075         :"r"(z)
00076     );
00077     /* pass 2 .. ln-1 */
00078 
00079     nblocks = 1 << (ln-3);
00080     nloops = 1 << 2;
00081     cptr = s->exptab1;
00082     do {
00083         p = z;
00084         j = nblocks;
00085         do {
00086             i = nloops*8;
00087             asm volatile(
00088                 "1: \n\t"
00089                 "sub $16, %0 \n\t"
00090                 "movq    (%1,%0), %%mm0 \n\t"
00091                 "movq   8(%1,%0), %%mm1 \n\t"
00092                 "movq    (%2,%0), %%mm2 \n\t"
00093                 "movq   8(%2,%0), %%mm3 \n\t"
00094                 "movq  (%3,%0,2), %%mm4 \n\t"
00095                 "movq 8(%3,%0,2), %%mm5 \n\t"
00096                 "pswapd    %%mm4, %%mm6 \n\t" // no need for cptr[2] & cptr[3]
00097                 "pswapd    %%mm5, %%mm7 \n\t"
00098                 "pfmul     %%mm2, %%mm4 \n\t" // cre*re cim*im
00099                 "pfmul     %%mm3, %%mm5 \n\t"
00100                 "pfmul     %%mm2, %%mm6 \n\t" // cim*re cre*im
00101                 "pfmul     %%mm3, %%mm7 \n\t"
00102                 "pfpnacc   %%mm6, %%mm4 \n\t" // cre*re-cim*im cim*re+cre*im
00103                 "pfpnacc   %%mm7, %%mm5 \n\t"
00104                 "movq      %%mm0, %%mm2 \n\t"
00105                 "movq      %%mm1, %%mm3 \n\t"
00106                 "pfadd     %%mm4, %%mm0 \n\t"
00107                 "pfadd     %%mm5, %%mm1 \n\t"
00108                 "pfsub     %%mm4, %%mm2 \n\t"
00109                 "pfsub     %%mm5, %%mm3 \n\t"
00110                 "movq      %%mm0,  (%1,%0) \n\t"
00111                 "movq      %%mm1, 8(%1,%0) \n\t"
00112                 "movq      %%mm2,  (%2,%0) \n\t"
00113                 "movq      %%mm3, 8(%2,%0) \n\t"
00114                 "jg 1b \n\t"
00115                 :"+r"(i)
00116                 :"r"(p), "r"(p + nloops), "r"(cptr)
00117             );
00118             p += nloops*2;
00119         } while (--j);
00120         cptr += nloops*2;
00121         nblocks >>= 1;
00122         nloops <<= 1;
00123     } while (nblocks != 0);
00124     asm volatile("femms");
00125 }
00126 
00127 static void imdct_3dn2(MDCTContext *s, const FFTSample *input, FFTSample *tmp)
00128 {
00129     long n4, n2, n;
00130     x86_reg k;
00131     const uint16_t *revtab = s->fft.revtab;
00132     const FFTSample *tcos = s->tcos;
00133     const FFTSample *tsin = s->tsin;
00134     const FFTSample *in1, *in2;
00135     FFTComplex *z = (FFTComplex *)tmp;
00136 
00137     n = 1 << s->nbits;
00138     n2 = n >> 1;
00139     n4 = n >> 2;
00140 
00141     /* pre rotation */
00142     in1 = input;
00143     in2 = input + n2 - 1;
00144     for(k = 0; k < n4; k++) {
00145         // FIXME a single block is faster, but gcc 2.95 and 3.4.x on 32bit can't compile it
00146         asm volatile(
00147             "movd       %0, %%mm0 \n\t"
00148             "movd       %2, %%mm1 \n\t"
00149             "punpckldq  %1, %%mm0 \n\t"
00150             "punpckldq  %3, %%mm1 \n\t"
00151             "movq    %%mm0, %%mm2 \n\t"
00152             "pfmul   %%mm1, %%mm0 \n\t"
00153             "pswapd  %%mm1, %%mm1 \n\t"
00154             "pfmul   %%mm1, %%mm2 \n\t"
00155             "pfpnacc %%mm2, %%mm0 \n\t"
00156             ::"m"(in2[-2*k]), "m"(in1[2*k]),
00157               "m"(tcos[k]), "m"(tsin[k])
00158         );
00159         asm volatile(
00160             "movq    %%mm0, %0    \n\t"
00161             :"=m"(z[revtab[k]])
00162         );
00163     }
00164 
00165     ff_fft_calc(&s->fft, z);
00166 
00167     /* post rotation + reordering */
00168     for(k = 0; k < n4; k++) {
00169         asm volatile(
00170             "movq       %0, %%mm0 \n\t"
00171             "movd       %1, %%mm1 \n\t"
00172             "punpckldq  %2, %%mm1 \n\t"
00173             "movq    %%mm0, %%mm2 \n\t"
00174             "pfmul   %%mm1, %%mm0 \n\t"
00175             "pswapd  %%mm1, %%mm1 \n\t"
00176             "pfmul   %%mm1, %%mm2 \n\t"
00177             "pfpnacc %%mm2, %%mm0 \n\t"
00178             "movq    %%mm0, %0    \n\t"
00179             :"+m"(z[k])
00180             :"m"(tcos[k]), "m"(tsin[k])
00181         );
00182     }
00183 }
00184 
00185 void ff_imdct_calc_3dn2(MDCTContext *s, FFTSample *output,
00186                         const FFTSample *input, FFTSample *tmp)
00187 {
00188     x86_reg k;
00189     long n8, n2, n;
00190     FFTComplex *z = (FFTComplex *)tmp;
00191 
00192     n = 1 << s->nbits;
00193     n2 = n >> 1;
00194     n8 = n >> 3;
00195 
00196     imdct_3dn2(s, input, tmp);
00197 
00198     k = n-8;
00199     asm volatile("movd %0, %%mm7" ::"r"(1<<31));
00200     asm volatile(
00201         "1: \n\t"
00202         "movq    (%4,%0), %%mm0 \n\t" // z[n8+k]
00203         "neg %0 \n\t"
00204         "pswapd -8(%4,%0), %%mm1 \n\t" // z[n8-1-k]
00205         "movq      %%mm0, %%mm2 \n\t"
00206         "pxor      %%mm7, %%mm2 \n\t"
00207         "punpckldq %%mm1, %%mm2 \n\t"
00208         "pswapd    %%mm2, %%mm3 \n\t"
00209         "punpckhdq %%mm1, %%mm0 \n\t"
00210         "pswapd    %%mm0, %%mm4 \n\t"
00211         "pxor      %%mm7, %%mm0 \n\t"
00212         "pxor      %%mm7, %%mm4 \n\t"
00213         "movq      %%mm3, -8(%3,%0) \n\t" // output[n-2-2*k] = { z[n8-1-k].im, -z[n8+k].re }
00214         "movq      %%mm4, -8(%2,%0) \n\t" // output[n2-2-2*k]= { -z[n8-1-k].re, z[n8+k].im }
00215         "neg %0 \n\t"
00216         "movq      %%mm0, (%1,%0) \n\t"   // output[2*k]     = { -z[n8+k].im, z[n8-1-k].re }
00217         "movq      %%mm2, (%2,%0) \n\t"   // output[n2+2*k]  = { -z[n8+k].re, z[n8-1-k].im }
00218         "sub $8, %0 \n\t"
00219         "jge 1b \n\t"
00220         :"+r"(k)
00221         :"r"(output), "r"(output+n2), "r"(output+n), "r"(z+n8)
00222         :"memory"
00223     );
00224     asm volatile("femms");
00225 }
00226 
00227 void ff_imdct_half_3dn2(MDCTContext *s, FFTSample *output,
00228                         const FFTSample *input, FFTSample *tmp)
00229 {
00230     x86_reg j, k;
00231     long n8, n4, n;
00232     FFTComplex *z = (FFTComplex *)tmp;
00233 
00234     n = 1 << s->nbits;
00235     n4 = n >> 2;
00236     n8 = n >> 3;
00237 
00238     imdct_3dn2(s, input, tmp);
00239 
00240     j = -n;
00241     k = n-8;
00242     asm volatile("movd %0, %%mm7" ::"r"(1<<31));
00243     asm volatile(
00244         "1: \n\t"
00245         "movq    (%3,%1), %%mm0 \n\t" // z[n8+k]
00246         "pswapd  (%3,%0), %%mm1 \n\t" // z[n8-1-k]
00247         "movq      %%mm0, %%mm2 \n\t"
00248         "punpckldq %%mm1, %%mm0 \n\t"
00249         "punpckhdq %%mm2, %%mm1 \n\t"
00250         "pxor      %%mm7, %%mm0 \n\t"
00251         "pxor      %%mm7, %%mm1 \n\t"
00252         "movq      %%mm0, (%2,%1) \n\t" // output[n4+2*k]   = { -z[n8+k].re, z[n8-1-k].im }
00253         "movq      %%mm1, (%2,%0) \n\t" // output[n4-2-2*k] = { -z[n8-1-k].re, z[n8+k].im }
00254         "sub $8, %1 \n\t"
00255         "add $8, %0 \n\t"
00256         "jl 1b \n\t"
00257         :"+r"(j), "+r"(k)
00258         :"r"(output+n4), "r"(z+n8)
00259         :"memory"
00260     );
00261     asm volatile("femms");
00262 }
00263 

Generated on Fri Jan 9 15:44:29 2009 for libextractor by  doxygen 1.5.1