fft_sse.c

Go to the documentation of this file.
00001 /*
00002  * FFT/MDCT transform with SSE optimizations
00003  * Copyright (c) 2002 Fabrice Bellard.
00004  *
00005  * This file is part of FFmpeg.
00006  *
00007  * FFmpeg is free software; you can redistribute it and/or
00008  * modify it under the terms of the GNU Lesser General Public
00009  * License as published by the Free Software Foundation; either
00010  * version 2.1 of the License, or (at your option) any later version.
00011  *
00012  * FFmpeg is distributed in the hope that it will be useful,
00013  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00014  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00015  * Lesser General Public License for more details.
00016  *
00017  * You should have received a copy of the GNU Lesser General Public
00018  * License along with FFmpeg; if not, write to the Free Software
00019  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
00020  */
00021 
00022 #include "libavutil/x86_cpu.h"
00023 #include "libavcodec/dsputil.h"
00024 
00025 static const int p1p1p1m1[4] __attribute__((aligned(16))) =
00026     { 0, 0, 0, 1 << 31 };
00027 
00028 static const int p1p1m1p1[4] __attribute__((aligned(16))) =
00029     { 0, 0, 1 << 31, 0 };
00030 
00031 static const int p1p1m1m1[4] __attribute__((aligned(16))) =
00032     { 0, 0, 1 << 31, 1 << 31 };
00033 
00034 static const int p1m1p1m1[4] __attribute__((aligned(16))) =
00035     { 0, 1 << 31, 0, 1 << 31 };
00036 
00037 static const int m1m1m1m1[4] __attribute__((aligned(16))) =
00038     { 1 << 31, 1 << 31, 1 << 31, 1 << 31 };
00039 
00040 #if 0
00041 static void print_v4sf(const char *str, __m128 a)
00042 {
00043     float *p = (float *)&a;
00044     printf("%s: %f %f %f %f\n",
00045            str, p[0], p[1], p[2], p[3]);
00046 }
00047 #endif
00048 
00049 /* XXX: handle reverse case */
00050 void ff_fft_calc_sse(FFTContext *s, FFTComplex *z)
00051 {
00052     int ln = s->nbits;
00053     x86_reg i;
00054     long j;
00055     long nblocks, nloops;
00056     FFTComplex *p, *cptr;
00057 
00058     asm volatile(
00059         "movaps %0, %%xmm4 \n\t"
00060         "movaps %1, %%xmm5 \n\t"
00061         ::"m"(*p1p1m1m1),
00062           "m"(*(s->inverse ? p1p1m1p1 : p1p1p1m1))
00063     );
00064 
00065     i = 8 << ln;
00066     asm volatile(
00067         "1: \n\t"
00068         "sub $32, %0 \n\t"
00069         /* do the pass 0 butterfly */
00070         "movaps   (%0,%1), %%xmm0 \n\t"
00071         "movaps    %%xmm0, %%xmm1 \n\t"
00072         "shufps     $0x4E, %%xmm0, %%xmm0 \n\t"
00073         "xorps     %%xmm4, %%xmm1 \n\t"
00074         "addps     %%xmm1, %%xmm0 \n\t"
00075         "movaps 16(%0,%1), %%xmm2 \n\t"
00076         "movaps    %%xmm2, %%xmm3 \n\t"
00077         "shufps     $0x4E, %%xmm2, %%xmm2 \n\t"
00078         "xorps     %%xmm4, %%xmm3 \n\t"
00079         "addps     %%xmm3, %%xmm2 \n\t"
00080         /* multiply third by -i */
00081         /* by toggling the sign bit */
00082         "shufps     $0xB4, %%xmm2, %%xmm2 \n\t"
00083         "xorps     %%xmm5, %%xmm2 \n\t"
00084         /* do the pass 1 butterfly */
00085         "movaps    %%xmm0, %%xmm1 \n\t"
00086         "addps     %%xmm2, %%xmm0 \n\t"
00087         "subps     %%xmm2, %%xmm1 \n\t"
00088         "movaps    %%xmm0,   (%0,%1) \n\t"
00089         "movaps    %%xmm1, 16(%0,%1) \n\t"
00090         "jg 1b \n\t"
00091         :"+r"(i)
00092         :"r"(z)
00093     );
00094     /* pass 2 .. ln-1 */
00095 
00096     nblocks = 1 << (ln-3);
00097     nloops = 1 << 2;
00098     cptr = s->exptab1;
00099     do {
00100         p = z;
00101         j = nblocks;
00102         do {
00103             i = nloops*8;
00104             asm volatile(
00105                 "1: \n\t"
00106                 "sub $32, %0 \n\t"
00107                 "movaps    (%2,%0), %%xmm1 \n\t"
00108                 "movaps    (%1,%0), %%xmm0 \n\t"
00109                 "movaps  16(%2,%0), %%xmm5 \n\t"
00110                 "movaps  16(%1,%0), %%xmm4 \n\t"
00111                 "movaps     %%xmm1, %%xmm2 \n\t"
00112                 "movaps     %%xmm5, %%xmm6 \n\t"
00113                 "shufps      $0xA0, %%xmm1, %%xmm1 \n\t"
00114                 "shufps      $0xF5, %%xmm2, %%xmm2 \n\t"
00115                 "shufps      $0xA0, %%xmm5, %%xmm5 \n\t"
00116                 "shufps      $0xF5, %%xmm6, %%xmm6 \n\t"
00117                 "mulps   (%3,%0,2), %%xmm1 \n\t" //  cre*re cim*re
00118                 "mulps 16(%3,%0,2), %%xmm2 \n\t" // -cim*im cre*im
00119                 "mulps 32(%3,%0,2), %%xmm5 \n\t" //  cre*re cim*re
00120                 "mulps 48(%3,%0,2), %%xmm6 \n\t" // -cim*im cre*im
00121                 "addps      %%xmm2, %%xmm1 \n\t"
00122                 "addps      %%xmm6, %%xmm5 \n\t"
00123                 "movaps     %%xmm0, %%xmm3 \n\t"
00124                 "movaps     %%xmm4, %%xmm7 \n\t"
00125                 "addps      %%xmm1, %%xmm0 \n\t"
00126                 "subps      %%xmm1, %%xmm3 \n\t"
00127                 "addps      %%xmm5, %%xmm4 \n\t"
00128                 "subps      %%xmm5, %%xmm7 \n\t"
00129                 "movaps     %%xmm0, (%1,%0) \n\t"
00130                 "movaps     %%xmm3, (%2,%0) \n\t"
00131                 "movaps     %%xmm4, 16(%1,%0) \n\t"
00132                 "movaps     %%xmm7, 16(%2,%0) \n\t"
00133                 "jg 1b \n\t"
00134                 :"+r"(i)
00135                 :"r"(p), "r"(p + nloops), "r"(cptr)
00136             );
00137             p += nloops*2;
00138         } while (--j);
00139         cptr += nloops*2;
00140         nblocks >>= 1;
00141         nloops <<= 1;
00142     } while (nblocks != 0);
00143 }
00144 
00145 static void imdct_sse(MDCTContext *s, const FFTSample *input, FFTSample *tmp)
00146 {
00147     x86_reg k;
00148     long n4, n2, n;
00149     const uint16_t *revtab = s->fft.revtab;
00150     const FFTSample *tcos = s->tcos;
00151     const FFTSample *tsin = s->tsin;
00152     const FFTSample *in1, *in2;
00153     FFTComplex *z = (FFTComplex *)tmp;
00154 
00155     n = 1 << s->nbits;
00156     n2 = n >> 1;
00157     n4 = n >> 2;
00158 
00159 #ifdef ARCH_X86_64
00160     asm volatile ("movaps %0, %%xmm8\n\t"::"m"(*p1m1p1m1));
00161 #define P1M1P1M1 "%%xmm8"
00162 #else
00163 #define P1M1P1M1 "%4"
00164 #endif
00165 
00166     /* pre rotation */
00167     in1 = input;
00168     in2 = input + n2 - 4;
00169 
00170     /* Complex multiplication */
00171     for (k = 0; k < n4; k += 4) {
00172         asm volatile (
00173             "movaps          %0, %%xmm0 \n\t"   // xmm0 = r0 X  r1 X : in2
00174             "movaps          %1, %%xmm3 \n\t"   // xmm3 = X  i1 X  i0: in1
00175             "movaps    -16+1*%0, %%xmm4 \n\t"   // xmm4 = r0 X  r1 X : in2
00176             "movaps     16+1*%1, %%xmm7 \n\t"   // xmm7 = X  i1 X  i0: in1
00177             "movlps          %2, %%xmm1 \n\t"   // xmm1 = X  X  R1 R0: tcos
00178             "movlps          %3, %%xmm2 \n\t"   // xmm2 = X  X  I1 I0: tsin
00179             "movlps      8+1*%2, %%xmm5 \n\t"   // xmm5 = X  X  R1 R0: tcos
00180             "movlps      8+1*%3, %%xmm6 \n\t"   // xmm6 = X  X  I1 I0: tsin
00181             "shufps $95, %%xmm0, %%xmm0 \n\t"   // xmm0 = r1 r1 r0 r0
00182             "shufps $160,%%xmm3, %%xmm3 \n\t"   // xmm3 = i1 i1 i0 i0
00183             "shufps $95, %%xmm4, %%xmm4 \n\t"   // xmm4 = r1 r1 r0 r0
00184             "shufps $160,%%xmm7, %%xmm7 \n\t"   // xmm7 = i1 i1 i0 i0
00185             "unpcklps    %%xmm2, %%xmm1 \n\t"   // xmm1 = I1 R1 I0 R0
00186             "unpcklps    %%xmm6, %%xmm5 \n\t"   // xmm5 = I1 R1 I0 R0
00187             "movaps      %%xmm1, %%xmm2 \n\t"   // xmm2 = I1 R1 I0 R0
00188             "movaps      %%xmm5, %%xmm6 \n\t"   // xmm6 = I1 R1 I0 R0
00189             "xorps   "P1M1P1M1", %%xmm2 \n\t"   // xmm2 = -I1 R1 -I0 R0
00190             "xorps   "P1M1P1M1", %%xmm6 \n\t"   // xmm6 = -I1 R1 -I0 R0
00191             "mulps       %%xmm1, %%xmm0 \n\t"   // xmm0 = rI rR rI rR
00192             "mulps       %%xmm5, %%xmm4 \n\t"   // xmm4 = rI rR rI rR
00193             "shufps $177,%%xmm2, %%xmm2 \n\t"   // xmm2 = R1 -I1 R0 -I0
00194             "shufps $177,%%xmm6, %%xmm6 \n\t"   // xmm6 = R1 -I1 R0 -I0
00195             "mulps       %%xmm2, %%xmm3 \n\t"   // xmm3 = Ri -Ii Ri -Ii
00196             "mulps       %%xmm6, %%xmm7 \n\t"   // xmm7 = Ri -Ii Ri -Ii
00197             "addps       %%xmm3, %%xmm0 \n\t"   // xmm0 = result
00198             "addps       %%xmm7, %%xmm4 \n\t"   // xmm4 = result
00199             ::"m"(in2[-2*k]), "m"(in1[2*k]),
00200               "m"(tcos[k]), "m"(tsin[k])
00201 #ifndef ARCH_X86_64
00202               ,"m"(*p1m1p1m1)
00203 #endif
00204         );
00205         /* Should be in the same block, hack for gcc2.95 & gcc3 */
00206         asm (
00207             "movlps      %%xmm0, %0     \n\t"
00208             "movhps      %%xmm0, %1     \n\t"
00209             "movlps      %%xmm4, %2     \n\t"
00210             "movhps      %%xmm4, %3     \n\t"
00211             :"=m"(z[revtab[k]]), "=m"(z[revtab[k + 1]]),
00212              "=m"(z[revtab[k + 2]]), "=m"(z[revtab[k + 3]])
00213         );
00214     }
00215 
00216     ff_fft_calc_sse(&s->fft, z);
00217 
00218 #ifndef ARCH_X86_64
00219 #undef P1M1P1M1
00220 #define P1M1P1M1 "%3"
00221 #endif
00222 
00223     /* post rotation + reordering */
00224     for (k = 0; k < n4; k += 4) {
00225         asm (
00226             "movaps          %0, %%xmm0 \n\t"   // xmm0 = i1 r1 i0 r0: z
00227             "movaps     16+1*%0, %%xmm4 \n\t"   // xmm4 = i1 r1 i0 r0: z
00228             "movlps          %1, %%xmm1 \n\t"   // xmm1 = X  X  R1 R0: tcos
00229             "movlps      8+1*%1, %%xmm5 \n\t"   // xmm5 = X  X  R1 R0: tcos
00230             "movaps      %%xmm0, %%xmm3 \n\t"   // xmm3 = i1 r1 i0 r0
00231             "movaps      %%xmm4, %%xmm7 \n\t"   // xmm7 = i1 r1 i0 r0
00232             "movlps          %2, %%xmm2 \n\t"   // xmm2 = X  X  I1 I0: tsin
00233             "movlps      8+1*%2, %%xmm6 \n\t"   // xmm6 = X  X  I1 I0: tsin
00234             "shufps $160,%%xmm0, %%xmm0 \n\t"   // xmm0 = r1 r1 r0 r0
00235             "shufps $245,%%xmm3, %%xmm3 \n\t"   // xmm3 = i1 i1 i0 i0
00236             "shufps $160,%%xmm4, %%xmm4 \n\t"   // xmm4 = r1 r1 r0 r0
00237             "shufps $245,%%xmm7, %%xmm7 \n\t"   // xmm7 = i1 i1 i0 i0
00238             "unpcklps    %%xmm2, %%xmm1 \n\t"   // xmm1 = I1 R1 I0 R0
00239             "unpcklps    %%xmm6, %%xmm5 \n\t"   // xmm5 = I1 R1 I0 R0
00240             "movaps      %%xmm1, %%xmm2 \n\t"   // xmm2 = I1 R1 I0 R0
00241             "movaps      %%xmm5, %%xmm6 \n\t"   // xmm6 = I1 R1 I0 R0
00242             "xorps   "P1M1P1M1", %%xmm2 \n\t"   // xmm2 = -I1 R1 -I0 R0
00243             "mulps       %%xmm1, %%xmm0 \n\t"   // xmm0 = rI rR rI rR
00244             "xorps   "P1M1P1M1", %%xmm6 \n\t"   // xmm6 = -I1 R1 -I0 R0
00245             "mulps       %%xmm5, %%xmm4 \n\t"   // xmm4 = rI rR rI rR
00246             "shufps $177,%%xmm2, %%xmm2 \n\t"   // xmm2 = R1 -I1 R0 -I0
00247             "shufps $177,%%xmm6, %%xmm6 \n\t"   // xmm6 = R1 -I1 R0 -I0
00248             "mulps       %%xmm2, %%xmm3 \n\t"   // xmm3 = Ri -Ii Ri -Ii
00249             "mulps       %%xmm6, %%xmm7 \n\t"   // xmm7 = Ri -Ii Ri -Ii
00250             "addps       %%xmm3, %%xmm0 \n\t"   // xmm0 = result
00251             "addps       %%xmm7, %%xmm4 \n\t"   // xmm4 = result
00252             "movaps      %%xmm0, %0     \n\t"
00253             "movaps      %%xmm4, 16+1*%0\n\t"
00254             :"+m"(z[k])
00255             :"m"(tcos[k]), "m"(tsin[k])
00256 #ifndef ARCH_X86_64
00257              ,"m"(*p1m1p1m1)
00258 #endif
00259         );
00260     }
00261 }
00262 
00263 void ff_imdct_calc_sse(MDCTContext *s, FFTSample *output,
00264                        const FFTSample *input, FFTSample *tmp)
00265 {
00266     x86_reg k;
00267     long n8, n2, n;
00268     FFTComplex *z = (FFTComplex *)tmp;
00269 
00270     n = 1 << s->nbits;
00271     n2 = n >> 1;
00272     n8 = n >> 3;
00273 
00274     imdct_sse(s, input, tmp);
00275 
00276     /*
00277        Mnemonics:
00278        0 = z[k].re
00279        1 = z[k].im
00280        2 = z[k + 1].re
00281        3 = z[k + 1].im
00282        4 = z[-k - 2].re
00283        5 = z[-k - 2].im
00284        6 = z[-k - 1].re
00285        7 = z[-k - 1].im
00286     */
00287     k = 16-n;
00288     asm volatile("movaps %0, %%xmm7 \n\t"::"m"(*m1m1m1m1));
00289     asm volatile(
00290         "1: \n\t"
00291         "movaps  -16(%4,%0), %%xmm1 \n\t"   // xmm1 = 4 5 6 7 = z[-2-k]
00292         "neg %0 \n\t"
00293         "movaps     (%4,%0), %%xmm0 \n\t"   // xmm0 = 0 1 2 3 = z[k]
00294         "xorps       %%xmm7, %%xmm0 \n\t"   // xmm0 = -0 -1 -2 -3
00295         "movaps      %%xmm0, %%xmm2 \n\t"   // xmm2 = -0 -1 -2 -3
00296         "shufps $141,%%xmm1, %%xmm0 \n\t"   // xmm0 = -1 -3 4 6
00297         "shufps $216,%%xmm1, %%xmm2 \n\t"   // xmm2 = -0 -2 5 7
00298         "shufps $156,%%xmm0, %%xmm0 \n\t"   // xmm0 = -1 6 -3 4 !
00299         "shufps $156,%%xmm2, %%xmm2 \n\t"   // xmm2 = -0 7 -2 5 !
00300         "movaps      %%xmm0, (%1,%0) \n\t"  // output[2*k]
00301         "movaps      %%xmm2, (%2,%0) \n\t"  // output[n2+2*k]
00302         "neg %0 \n\t"
00303         "shufps $27, %%xmm0, %%xmm0 \n\t"   // xmm0 = 4 -3 6 -1
00304         "xorps       %%xmm7, %%xmm0 \n\t"   // xmm0 = -4 3 -6 1 !
00305         "shufps $27, %%xmm2, %%xmm2 \n\t"   // xmm2 = 5 -2 7 -0 !
00306         "movaps      %%xmm0, -16(%2,%0) \n\t" // output[n2-4-2*k]
00307         "movaps      %%xmm2, -16(%3,%0) \n\t" // output[n-4-2*k]
00308         "add $16, %0 \n\t"
00309         "jle 1b \n\t"
00310         :"+r"(k)
00311         :"r"(output), "r"(output+n2), "r"(output+n), "r"(z+n8)
00312         :"memory"
00313     );
00314 }
00315 
00316 void ff_imdct_half_sse(MDCTContext *s, FFTSample *output,
00317                        const FFTSample *input, FFTSample *tmp)
00318 {
00319     x86_reg j, k;
00320     long n8, n4, n;
00321     FFTComplex *z = (FFTComplex *)tmp;
00322 
00323     n = 1 << s->nbits;
00324     n4 = n >> 2;
00325     n8 = n >> 3;
00326 
00327     imdct_sse(s, input, tmp);
00328 
00329     j = -n;
00330     k = n-16;
00331     asm volatile("movaps %0, %%xmm7 \n\t"::"m"(*m1m1m1m1));
00332     asm volatile(
00333         "1: \n\t"
00334         "movaps     (%3,%1), %%xmm0 \n\t"
00335         "movaps     (%3,%0), %%xmm1 \n\t"
00336         "xorps       %%xmm7, %%xmm0 \n\t"
00337         "movaps      %%xmm0, %%xmm2 \n\t"
00338         "shufps $141,%%xmm1, %%xmm0 \n\t"
00339         "shufps $216,%%xmm1, %%xmm2 \n\t"
00340         "shufps $54, %%xmm0, %%xmm0 \n\t"
00341         "shufps $156,%%xmm2, %%xmm2 \n\t"
00342         "xorps       %%xmm7, %%xmm0 \n\t"
00343         "movaps      %%xmm2, (%2,%1) \n\t"
00344         "movaps      %%xmm0, (%2,%0) \n\t"
00345         "sub $16, %1 \n\t"
00346         "add $16, %0 \n\t"
00347         "jl 1b \n\t"
00348         :"+r"(j), "+r"(k)
00349         :"r"(output+n4), "r"(z+n8)
00350         :"memory"
00351     );
00352 }
00353 

Generated on Fri Jan 9 17:44:27 2009 for libextractor by  doxygen 1.5.1