00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022 #include "libavutil/x86_cpu.h"
00023 #include "libavcodec/dsputil.h"
00024
/*
 * Sign-bit masks for four packed single-precision floats.
 *
 * Each name has one two-character tag per lane, lowest lane first:
 * "p1" leaves the lane untouched and "m1" has the IEEE-754 sign bit set,
 * so XORing a vector with the mask (xorps) negates exactly the "m1" lanes.
 *
 * The sign bit is built from an unsigned shift: `1 << 31` on a 32-bit int
 * shifts into the sign bit, which is undefined behaviour in C.
 * The tables are 16-byte aligned because they are loaded with movaps.
 */
#define FFT_SSE_SIGN_BIT ((int)(1U << 31))

static const int p1p1p1m1[4] __attribute__((aligned(16))) =
    { 0, 0, 0, FFT_SSE_SIGN_BIT };

static const int p1p1m1p1[4] __attribute__((aligned(16))) =
    { 0, 0, FFT_SSE_SIGN_BIT, 0 };

static const int p1p1m1m1[4] __attribute__((aligned(16))) =
    { 0, 0, FFT_SSE_SIGN_BIT, FFT_SSE_SIGN_BIT };

static const int p1m1p1m1[4] __attribute__((aligned(16))) =
    { 0, FFT_SSE_SIGN_BIT, 0, FFT_SSE_SIGN_BIT };

static const int m1m1m1m1[4] __attribute__((aligned(16))) =
    { FFT_SSE_SIGN_BIT, FFT_SSE_SIGN_BIT, FFT_SSE_SIGN_BIT, FFT_SSE_SIGN_BIT };
00039
#if 0
/*
 * Debug helper, compiled out: prints the label `str` followed by the four
 * float lanes of the SSE vector `a`, lowest lane first.
 */
static void print_v4sf(const char *str, __m128 a)
{
    const float *lane = (const float *)&a;

    printf("%s: %f %f %f %f\n", str, lane[0], lane[1], lane[2], lane[3]);
}
#endif
00048
00049
00050 void ff_fft_calc_sse(FFTContext *s, FFTComplex *z)
00051 {
00052 int ln = s->nbits;
00053 x86_reg i;
00054 long j;
00055 long nblocks, nloops;
00056 FFTComplex *p, *cptr;
00057
00058 asm volatile(
00059 "movaps %0, %%xmm4 \n\t"
00060 "movaps %1, %%xmm5 \n\t"
00061 ::"m"(*p1p1m1m1),
00062 "m"(*(s->inverse ? p1p1m1p1 : p1p1p1m1))
00063 );
00064
00065 i = 8 << ln;
00066 asm volatile(
00067 "1: \n\t"
00068 "sub $32, %0 \n\t"
00069
00070 "movaps (%0,%1), %%xmm0 \n\t"
00071 "movaps %%xmm0, %%xmm1 \n\t"
00072 "shufps $0x4E, %%xmm0, %%xmm0 \n\t"
00073 "xorps %%xmm4, %%xmm1 \n\t"
00074 "addps %%xmm1, %%xmm0 \n\t"
00075 "movaps 16(%0,%1), %%xmm2 \n\t"
00076 "movaps %%xmm2, %%xmm3 \n\t"
00077 "shufps $0x4E, %%xmm2, %%xmm2 \n\t"
00078 "xorps %%xmm4, %%xmm3 \n\t"
00079 "addps %%xmm3, %%xmm2 \n\t"
00080
00081
00082 "shufps $0xB4, %%xmm2, %%xmm2 \n\t"
00083 "xorps %%xmm5, %%xmm2 \n\t"
00084
00085 "movaps %%xmm0, %%xmm1 \n\t"
00086 "addps %%xmm2, %%xmm0 \n\t"
00087 "subps %%xmm2, %%xmm1 \n\t"
00088 "movaps %%xmm0, (%0,%1) \n\t"
00089 "movaps %%xmm1, 16(%0,%1) \n\t"
00090 "jg 1b \n\t"
00091 :"+r"(i)
00092 :"r"(z)
00093 );
00094
00095
00096 nblocks = 1 << (ln-3);
00097 nloops = 1 << 2;
00098 cptr = s->exptab1;
00099 do {
00100 p = z;
00101 j = nblocks;
00102 do {
00103 i = nloops*8;
00104 asm volatile(
00105 "1: \n\t"
00106 "sub $32, %0 \n\t"
00107 "movaps (%2,%0), %%xmm1 \n\t"
00108 "movaps (%1,%0), %%xmm0 \n\t"
00109 "movaps 16(%2,%0), %%xmm5 \n\t"
00110 "movaps 16(%1,%0), %%xmm4 \n\t"
00111 "movaps %%xmm1, %%xmm2 \n\t"
00112 "movaps %%xmm5, %%xmm6 \n\t"
00113 "shufps $0xA0, %%xmm1, %%xmm1 \n\t"
00114 "shufps $0xF5, %%xmm2, %%xmm2 \n\t"
00115 "shufps $0xA0, %%xmm5, %%xmm5 \n\t"
00116 "shufps $0xF5, %%xmm6, %%xmm6 \n\t"
00117 "mulps (%3,%0,2), %%xmm1 \n\t"
00118 "mulps 16(%3,%0,2), %%xmm2 \n\t"
00119 "mulps 32(%3,%0,2), %%xmm5 \n\t"
00120 "mulps 48(%3,%0,2), %%xmm6 \n\t"
00121 "addps %%xmm2, %%xmm1 \n\t"
00122 "addps %%xmm6, %%xmm5 \n\t"
00123 "movaps %%xmm0, %%xmm3 \n\t"
00124 "movaps %%xmm4, %%xmm7 \n\t"
00125 "addps %%xmm1, %%xmm0 \n\t"
00126 "subps %%xmm1, %%xmm3 \n\t"
00127 "addps %%xmm5, %%xmm4 \n\t"
00128 "subps %%xmm5, %%xmm7 \n\t"
00129 "movaps %%xmm0, (%1,%0) \n\t"
00130 "movaps %%xmm3, (%2,%0) \n\t"
00131 "movaps %%xmm4, 16(%1,%0) \n\t"
00132 "movaps %%xmm7, 16(%2,%0) \n\t"
00133 "jg 1b \n\t"
00134 :"+r"(i)
00135 :"r"(p), "r"(p + nloops), "r"(cptr)
00136 );
00137 p += nloops*2;
00138 } while (--j);
00139 cptr += nloops*2;
00140 nblocks >>= 1;
00141 nloops <<= 1;
00142 } while (nblocks != 0);
00143 }
00144
00145 static void imdct_sse(MDCTContext *s, const FFTSample *input, FFTSample *tmp)
00146 {
00147 x86_reg k;
00148 long n4, n2, n;
00149 const uint16_t *revtab = s->fft.revtab;
00150 const FFTSample *tcos = s->tcos;
00151 const FFTSample *tsin = s->tsin;
00152 const FFTSample *in1, *in2;
00153 FFTComplex *z = (FFTComplex *)tmp;
00154
00155 n = 1 << s->nbits;
00156 n2 = n >> 1;
00157 n4 = n >> 2;
00158
00159 #ifdef ARCH_X86_64
00160 asm volatile ("movaps %0, %%xmm8\n\t"::"m"(*p1m1p1m1));
00161 #define P1M1P1M1 "%%xmm8"
00162 #else
00163 #define P1M1P1M1 "%4"
00164 #endif
00165
00166
00167 in1 = input;
00168 in2 = input + n2 - 4;
00169
00170
00171 for (k = 0; k < n4; k += 4) {
00172 asm volatile (
00173 "movaps %0, %%xmm0 \n\t"
00174 "movaps %1, %%xmm3 \n\t"
00175 "movaps -16+1*%0, %%xmm4 \n\t"
00176 "movaps 16+1*%1, %%xmm7 \n\t"
00177 "movlps %2, %%xmm1 \n\t"
00178 "movlps %3, %%xmm2 \n\t"
00179 "movlps 8+1*%2, %%xmm5 \n\t"
00180 "movlps 8+1*%3, %%xmm6 \n\t"
00181 "shufps $95, %%xmm0, %%xmm0 \n\t"
00182 "shufps $160,%%xmm3, %%xmm3 \n\t"
00183 "shufps $95, %%xmm4, %%xmm4 \n\t"
00184 "shufps $160,%%xmm7, %%xmm7 \n\t"
00185 "unpcklps %%xmm2, %%xmm1 \n\t"
00186 "unpcklps %%xmm6, %%xmm5 \n\t"
00187 "movaps %%xmm1, %%xmm2 \n\t"
00188 "movaps %%xmm5, %%xmm6 \n\t"
00189 "xorps "P1M1P1M1", %%xmm2 \n\t"
00190 "xorps "P1M1P1M1", %%xmm6 \n\t"
00191 "mulps %%xmm1, %%xmm0 \n\t"
00192 "mulps %%xmm5, %%xmm4 \n\t"
00193 "shufps $177,%%xmm2, %%xmm2 \n\t"
00194 "shufps $177,%%xmm6, %%xmm6 \n\t"
00195 "mulps %%xmm2, %%xmm3 \n\t"
00196 "mulps %%xmm6, %%xmm7 \n\t"
00197 "addps %%xmm3, %%xmm0 \n\t"
00198 "addps %%xmm7, %%xmm4 \n\t"
00199 ::"m"(in2[-2*k]), "m"(in1[2*k]),
00200 "m"(tcos[k]), "m"(tsin[k])
00201 #ifndef ARCH_X86_64
00202 ,"m"(*p1m1p1m1)
00203 #endif
00204 );
00205
00206 asm (
00207 "movlps %%xmm0, %0 \n\t"
00208 "movhps %%xmm0, %1 \n\t"
00209 "movlps %%xmm4, %2 \n\t"
00210 "movhps %%xmm4, %3 \n\t"
00211 :"=m"(z[revtab[k]]), "=m"(z[revtab[k + 1]]),
00212 "=m"(z[revtab[k + 2]]), "=m"(z[revtab[k + 3]])
00213 );
00214 }
00215
00216 ff_fft_calc_sse(&s->fft, z);
00217
00218 #ifndef ARCH_X86_64
00219 #undef P1M1P1M1
00220 #define P1M1P1M1 "%3"
00221 #endif
00222
00223
00224 for (k = 0; k < n4; k += 4) {
00225 asm (
00226 "movaps %0, %%xmm0 \n\t"
00227 "movaps 16+1*%0, %%xmm4 \n\t"
00228 "movlps %1, %%xmm1 \n\t"
00229 "movlps 8+1*%1, %%xmm5 \n\t"
00230 "movaps %%xmm0, %%xmm3 \n\t"
00231 "movaps %%xmm4, %%xmm7 \n\t"
00232 "movlps %2, %%xmm2 \n\t"
00233 "movlps 8+1*%2, %%xmm6 \n\t"
00234 "shufps $160,%%xmm0, %%xmm0 \n\t"
00235 "shufps $245,%%xmm3, %%xmm3 \n\t"
00236 "shufps $160,%%xmm4, %%xmm4 \n\t"
00237 "shufps $245,%%xmm7, %%xmm7 \n\t"
00238 "unpcklps %%xmm2, %%xmm1 \n\t"
00239 "unpcklps %%xmm6, %%xmm5 \n\t"
00240 "movaps %%xmm1, %%xmm2 \n\t"
00241 "movaps %%xmm5, %%xmm6 \n\t"
00242 "xorps "P1M1P1M1", %%xmm2 \n\t"
00243 "mulps %%xmm1, %%xmm0 \n\t"
00244 "xorps "P1M1P1M1", %%xmm6 \n\t"
00245 "mulps %%xmm5, %%xmm4 \n\t"
00246 "shufps $177,%%xmm2, %%xmm2 \n\t"
00247 "shufps $177,%%xmm6, %%xmm6 \n\t"
00248 "mulps %%xmm2, %%xmm3 \n\t"
00249 "mulps %%xmm6, %%xmm7 \n\t"
00250 "addps %%xmm3, %%xmm0 \n\t"
00251 "addps %%xmm7, %%xmm4 \n\t"
00252 "movaps %%xmm0, %0 \n\t"
00253 "movaps %%xmm4, 16+1*%0\n\t"
00254 :"+m"(z[k])
00255 :"m"(tcos[k]), "m"(tsin[k])
00256 #ifndef ARCH_X86_64
00257 ,"m"(*p1m1p1m1)
00258 #endif
00259 );
00260 }
00261 }
00262
00263 void ff_imdct_calc_sse(MDCTContext *s, FFTSample *output,
00264 const FFTSample *input, FFTSample *tmp)
00265 {
00266 x86_reg k;
00267 long n8, n2, n;
00268 FFTComplex *z = (FFTComplex *)tmp;
00269
00270 n = 1 << s->nbits;
00271 n2 = n >> 1;
00272 n8 = n >> 3;
00273
00274 imdct_sse(s, input, tmp);
00275
00276
00277
00278
00279
00280
00281
00282
00283
00284
00285
00286
00287 k = 16-n;
00288 asm volatile("movaps %0, %%xmm7 \n\t"::"m"(*m1m1m1m1));
00289 asm volatile(
00290 "1: \n\t"
00291 "movaps -16(%4,%0), %%xmm1 \n\t"
00292 "neg %0 \n\t"
00293 "movaps (%4,%0), %%xmm0 \n\t"
00294 "xorps %%xmm7, %%xmm0 \n\t"
00295 "movaps %%xmm0, %%xmm2 \n\t"
00296 "shufps $141,%%xmm1, %%xmm0 \n\t"
00297 "shufps $216,%%xmm1, %%xmm2 \n\t"
00298 "shufps $156,%%xmm0, %%xmm0 \n\t"
00299 "shufps $156,%%xmm2, %%xmm2 \n\t"
00300 "movaps %%xmm0, (%1,%0) \n\t"
00301 "movaps %%xmm2, (%2,%0) \n\t"
00302 "neg %0 \n\t"
00303 "shufps $27, %%xmm0, %%xmm0 \n\t"
00304 "xorps %%xmm7, %%xmm0 \n\t"
00305 "shufps $27, %%xmm2, %%xmm2 \n\t"
00306 "movaps %%xmm0, -16(%2,%0) \n\t"
00307 "movaps %%xmm2, -16(%3,%0) \n\t"
00308 "add $16, %0 \n\t"
00309 "jle 1b \n\t"
00310 :"+r"(k)
00311 :"r"(output), "r"(output+n2), "r"(output+n), "r"(z+n8)
00312 :"memory"
00313 );
00314 }
00315
00316 void ff_imdct_half_sse(MDCTContext *s, FFTSample *output,
00317 const FFTSample *input, FFTSample *tmp)
00318 {
00319 x86_reg j, k;
00320 long n8, n4, n;
00321 FFTComplex *z = (FFTComplex *)tmp;
00322
00323 n = 1 << s->nbits;
00324 n4 = n >> 2;
00325 n8 = n >> 3;
00326
00327 imdct_sse(s, input, tmp);
00328
00329 j = -n;
00330 k = n-16;
00331 asm volatile("movaps %0, %%xmm7 \n\t"::"m"(*m1m1m1m1));
00332 asm volatile(
00333 "1: \n\t"
00334 "movaps (%3,%1), %%xmm0 \n\t"
00335 "movaps (%3,%0), %%xmm1 \n\t"
00336 "xorps %%xmm7, %%xmm0 \n\t"
00337 "movaps %%xmm0, %%xmm2 \n\t"
00338 "shufps $141,%%xmm1, %%xmm0 \n\t"
00339 "shufps $216,%%xmm1, %%xmm2 \n\t"
00340 "shufps $54, %%xmm0, %%xmm0 \n\t"
00341 "shufps $156,%%xmm2, %%xmm2 \n\t"
00342 "xorps %%xmm7, %%xmm0 \n\t"
00343 "movaps %%xmm2, (%2,%1) \n\t"
00344 "movaps %%xmm0, (%2,%0) \n\t"
00345 "sub $16, %1 \n\t"
00346 "add $16, %0 \n\t"
00347 "jl 1b \n\t"
00348 :"+r"(j), "+r"(k)
00349 :"r"(output+n4), "r"(z+n8)
00350 :"memory"
00351 );
00352 }
00353