00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023 #include "libavutil/x86_cpu.h"
00024 #include "libavcodec/dsputil.h"
00025
/*
 * 64-bit sign-flip masks, loaded into mm7 by ff_fft_calc_3dn2() and applied
 * with pxor to a pair of packed floats.
 *   p1m1: leaves the low float untouched, flips the sign of the high float.
 *   m1p1: flips the sign of the low float, leaves the high float untouched.
 *
 * The masks are unsigned and built with 1U << 31: shifting a signed 1 into
 * the sign bit of a 32-bit int is undefined behaviour in C.
 * 8-byte alignment matches the 8-byte movq load.
 */
static const unsigned int p1m1[2] __attribute__((aligned(8))) =
    { 0, 1U << 31 };

static const unsigned int m1p1[2] __attribute__((aligned(8))) =
    { 1U << 31, 0 };
00031
00032 void ff_fft_calc_3dn2(FFTContext *s, FFTComplex *z)
00033 {
00034 int ln = s->nbits;
00035 long j;
00036 x86_reg i;
00037 long nblocks, nloops;
00038 FFTComplex *p, *cptr;
00039
00040 asm volatile(
00041
00042 "femms \n\t"
00043 "movq %0, %%mm7 \n\t"
00044 ::"m"(*(s->inverse ? m1p1 : p1m1))
00045 );
00046
00047 i = 8 << ln;
00048 asm volatile(
00049 "1: \n\t"
00050 "sub $32, %0 \n\t"
00051 "movq (%0,%1), %%mm0 \n\t"
00052 "movq 16(%0,%1), %%mm1 \n\t"
00053 "movq 8(%0,%1), %%mm2 \n\t"
00054 "movq 24(%0,%1), %%mm3 \n\t"
00055 "movq %%mm0, %%mm4 \n\t"
00056 "movq %%mm1, %%mm5 \n\t"
00057 "pfadd %%mm2, %%mm0 \n\t"
00058 "pfadd %%mm3, %%mm1 \n\t"
00059 "pfsub %%mm2, %%mm4 \n\t"
00060 "pfsub %%mm3, %%mm5 \n\t"
00061 "movq %%mm0, %%mm2 \n\t"
00062 "pswapd %%mm5, %%mm5 \n\t"
00063 "movq %%mm4, %%mm3 \n\t"
00064 "pxor %%mm7, %%mm5 \n\t"
00065 "pfadd %%mm1, %%mm0 \n\t"
00066 "pfadd %%mm5, %%mm4 \n\t"
00067 "pfsub %%mm1, %%mm2 \n\t"
00068 "pfsub %%mm5, %%mm3 \n\t"
00069 "movq %%mm0, (%0,%1) \n\t"
00070 "movq %%mm4, 8(%0,%1) \n\t"
00071 "movq %%mm2, 16(%0,%1) \n\t"
00072 "movq %%mm3, 24(%0,%1) \n\t"
00073 "jg 1b \n\t"
00074 :"+r"(i)
00075 :"r"(z)
00076 );
00077
00078
00079 nblocks = 1 << (ln-3);
00080 nloops = 1 << 2;
00081 cptr = s->exptab1;
00082 do {
00083 p = z;
00084 j = nblocks;
00085 do {
00086 i = nloops*8;
00087 asm volatile(
00088 "1: \n\t"
00089 "sub $16, %0 \n\t"
00090 "movq (%1,%0), %%mm0 \n\t"
00091 "movq 8(%1,%0), %%mm1 \n\t"
00092 "movq (%2,%0), %%mm2 \n\t"
00093 "movq 8(%2,%0), %%mm3 \n\t"
00094 "movq (%3,%0,2), %%mm4 \n\t"
00095 "movq 8(%3,%0,2), %%mm5 \n\t"
00096 "pswapd %%mm4, %%mm6 \n\t"
00097 "pswapd %%mm5, %%mm7 \n\t"
00098 "pfmul %%mm2, %%mm4 \n\t"
00099 "pfmul %%mm3, %%mm5 \n\t"
00100 "pfmul %%mm2, %%mm6 \n\t"
00101 "pfmul %%mm3, %%mm7 \n\t"
00102 "pfpnacc %%mm6, %%mm4 \n\t"
00103 "pfpnacc %%mm7, %%mm5 \n\t"
00104 "movq %%mm0, %%mm2 \n\t"
00105 "movq %%mm1, %%mm3 \n\t"
00106 "pfadd %%mm4, %%mm0 \n\t"
00107 "pfadd %%mm5, %%mm1 \n\t"
00108 "pfsub %%mm4, %%mm2 \n\t"
00109 "pfsub %%mm5, %%mm3 \n\t"
00110 "movq %%mm0, (%1,%0) \n\t"
00111 "movq %%mm1, 8(%1,%0) \n\t"
00112 "movq %%mm2, (%2,%0) \n\t"
00113 "movq %%mm3, 8(%2,%0) \n\t"
00114 "jg 1b \n\t"
00115 :"+r"(i)
00116 :"r"(p), "r"(p + nloops), "r"(cptr)
00117 );
00118 p += nloops*2;
00119 } while (--j);
00120 cptr += nloops*2;
00121 nblocks >>= 1;
00122 nloops <<= 1;
00123 } while (nblocks != 0);
00124 asm volatile("femms");
00125 }
00126
00127 static void imdct_3dn2(MDCTContext *s, const FFTSample *input, FFTSample *tmp)
00128 {
00129 long n4, n2, n;
00130 x86_reg k;
00131 const uint16_t *revtab = s->fft.revtab;
00132 const FFTSample *tcos = s->tcos;
00133 const FFTSample *tsin = s->tsin;
00134 const FFTSample *in1, *in2;
00135 FFTComplex *z = (FFTComplex *)tmp;
00136
00137 n = 1 << s->nbits;
00138 n2 = n >> 1;
00139 n4 = n >> 2;
00140
00141
00142 in1 = input;
00143 in2 = input + n2 - 1;
00144 for(k = 0; k < n4; k++) {
00145
00146 asm volatile(
00147 "movd %0, %%mm0 \n\t"
00148 "movd %2, %%mm1 \n\t"
00149 "punpckldq %1, %%mm0 \n\t"
00150 "punpckldq %3, %%mm1 \n\t"
00151 "movq %%mm0, %%mm2 \n\t"
00152 "pfmul %%mm1, %%mm0 \n\t"
00153 "pswapd %%mm1, %%mm1 \n\t"
00154 "pfmul %%mm1, %%mm2 \n\t"
00155 "pfpnacc %%mm2, %%mm0 \n\t"
00156 ::"m"(in2[-2*k]), "m"(in1[2*k]),
00157 "m"(tcos[k]), "m"(tsin[k])
00158 );
00159 asm volatile(
00160 "movq %%mm0, %0 \n\t"
00161 :"=m"(z[revtab[k]])
00162 );
00163 }
00164
00165 ff_fft_calc(&s->fft, z);
00166
00167
00168 for(k = 0; k < n4; k++) {
00169 asm volatile(
00170 "movq %0, %%mm0 \n\t"
00171 "movd %1, %%mm1 \n\t"
00172 "punpckldq %2, %%mm1 \n\t"
00173 "movq %%mm0, %%mm2 \n\t"
00174 "pfmul %%mm1, %%mm0 \n\t"
00175 "pswapd %%mm1, %%mm1 \n\t"
00176 "pfmul %%mm1, %%mm2 \n\t"
00177 "pfpnacc %%mm2, %%mm0 \n\t"
00178 "movq %%mm0, %0 \n\t"
00179 :"+m"(z[k])
00180 :"m"(tcos[k]), "m"(tsin[k])
00181 );
00182 }
00183 }
00184
00185 void ff_imdct_calc_3dn2(MDCTContext *s, FFTSample *output,
00186 const FFTSample *input, FFTSample *tmp)
00187 {
00188 x86_reg k;
00189 long n8, n2, n;
00190 FFTComplex *z = (FFTComplex *)tmp;
00191
00192 n = 1 << s->nbits;
00193 n2 = n >> 1;
00194 n8 = n >> 3;
00195
00196 imdct_3dn2(s, input, tmp);
00197
00198 k = n-8;
00199 asm volatile("movd %0, %%mm7" ::"r"(1<<31));
00200 asm volatile(
00201 "1: \n\t"
00202 "movq (%4,%0), %%mm0 \n\t"
00203 "neg %0 \n\t"
00204 "pswapd -8(%4,%0), %%mm1 \n\t"
00205 "movq %%mm0, %%mm2 \n\t"
00206 "pxor %%mm7, %%mm2 \n\t"
00207 "punpckldq %%mm1, %%mm2 \n\t"
00208 "pswapd %%mm2, %%mm3 \n\t"
00209 "punpckhdq %%mm1, %%mm0 \n\t"
00210 "pswapd %%mm0, %%mm4 \n\t"
00211 "pxor %%mm7, %%mm0 \n\t"
00212 "pxor %%mm7, %%mm4 \n\t"
00213 "movq %%mm3, -8(%3,%0) \n\t"
00214 "movq %%mm4, -8(%2,%0) \n\t"
00215 "neg %0 \n\t"
00216 "movq %%mm0, (%1,%0) \n\t"
00217 "movq %%mm2, (%2,%0) \n\t"
00218 "sub $8, %0 \n\t"
00219 "jge 1b \n\t"
00220 :"+r"(k)
00221 :"r"(output), "r"(output+n2), "r"(output+n), "r"(z+n8)
00222 :"memory"
00223 );
00224 asm volatile("femms");
00225 }
00226
00227 void ff_imdct_half_3dn2(MDCTContext *s, FFTSample *output,
00228 const FFTSample *input, FFTSample *tmp)
00229 {
00230 x86_reg j, k;
00231 long n8, n4, n;
00232 FFTComplex *z = (FFTComplex *)tmp;
00233
00234 n = 1 << s->nbits;
00235 n4 = n >> 2;
00236 n8 = n >> 3;
00237
00238 imdct_3dn2(s, input, tmp);
00239
00240 j = -n;
00241 k = n-8;
00242 asm volatile("movd %0, %%mm7" ::"r"(1<<31));
00243 asm volatile(
00244 "1: \n\t"
00245 "movq (%3,%1), %%mm0 \n\t"
00246 "pswapd (%3,%0), %%mm1 \n\t"
00247 "movq %%mm0, %%mm2 \n\t"
00248 "punpckldq %%mm1, %%mm0 \n\t"
00249 "punpckhdq %%mm2, %%mm1 \n\t"
00250 "pxor %%mm7, %%mm0 \n\t"
00251 "pxor %%mm7, %%mm1 \n\t"
00252 "movq %%mm0, (%2,%1) \n\t"
00253 "movq %%mm1, (%2,%0) \n\t"
00254 "sub $8, %1 \n\t"
00255 "add $8, %0 \n\t"
00256 "jl 1b \n\t"
00257 :"+r"(j), "+r"(k)
00258 :"r"(output+n4), "r"(z+n8)
00259 :"memory"
00260 );
00261 asm volatile("femms");
00262 }
00263