00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025 #include "libavutil/x86_cpu.h"
00026 #include "libavcodec/dsputil.h"
00027 #include "libavcodec/h263.h"
00028 #include "libavcodec/mpegvideo.h"
00029 #include "libavcodec/simple_idct.h"
00030 #include "dsputil_mmx.h"
00031 #include "mmx.h"
00032 #include "vp3dsp_mmx.h"
00033 #include "vp3dsp_sse2.h"
00034 #include "idct_xvid.h"
00035
00036
00037
00038
/* NOTE(review): presumably the detected CPU-capability flag word; it is only
 * defined here and its users are outside the visible part of the file - confirm. */
00039 int mm_flags;
00040
00041
/* 64-bit constants loaded by the non-PIC MOVQ_BONE / MOVQ_WTWO macros:
 * ff_bone has every byte = 1, ff_wtwo has every 16-bit word = 2. */
00042 DECLARE_ALIGNED_8 (const uint64_t, ff_bone) = 0x0101010101010101ULL;
00043 DECLARE_ALIGNED_8 (const uint64_t, ff_wtwo) = 0x0002000200020002ULL;
00044
/* 128 bits holding the 32-bit pattern 0x80000000 four times. */
00045 DECLARE_ALIGNED_16(const uint64_t, ff_pdw_80000000[2]) =
00046 {0x8000000080000000ULL, 0x8000000080000000ULL};
00047
/* ff_pw_N: the 16-bit word N repeated across the constant
 * (four words for the uint64_t ones, eight words for the xmm_t ones). */
00048 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_3 ) = 0x0003000300030003ULL;
00049 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_4 ) = 0x0004000400040004ULL;
00050 DECLARE_ALIGNED_16(const xmm_t, ff_pw_5 ) = {0x0005000500050005ULL, 0x0005000500050005ULL};
00051 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_8 ) = 0x0008000800080008ULL;
00052 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_15 ) = 0x000F000F000F000FULL;
00053 DECLARE_ALIGNED_16(const xmm_t, ff_pw_16 ) = {0x0010001000100010ULL, 0x0010001000100010ULL};
00054 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_20 ) = 0x0014001400140014ULL;
00055 DECLARE_ALIGNED_16(const xmm_t, ff_pw_28 ) = {0x001C001C001C001CULL, 0x001C001C001C001CULL};
00056 DECLARE_ALIGNED_16(const xmm_t, ff_pw_32 ) = {0x0020002000200020ULL, 0x0020002000200020ULL};
00057 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_42 ) = 0x002A002A002A002AULL;
00058 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_64 ) = 0x0040004000400040ULL;
00059 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_96 ) = 0x0060006000600060ULL;
00060 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_128) = 0x0080008000800080ULL;
00061 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL;
00062
/* ff_pb_XX: the byte 0xXX repeated eight times. */
00063 DECLARE_ALIGNED_8 (const uint64_t, ff_pb_1 ) = 0x0101010101010101ULL;
00064 DECLARE_ALIGNED_8 (const uint64_t, ff_pb_3 ) = 0x0303030303030303ULL;
00065 DECLARE_ALIGNED_8 (const uint64_t, ff_pb_7 ) = 0x0707070707070707ULL;
00066 DECLARE_ALIGNED_8 (const uint64_t, ff_pb_3F ) = 0x3F3F3F3F3F3F3F3FULL;
00067 DECLARE_ALIGNED_8 (const uint64_t, ff_pb_A1 ) = 0xA1A1A1A1A1A1A1A1ULL;
00068 DECLARE_ALIGNED_8 (const uint64_t, ff_pb_FC ) = 0xFCFCFCFCFCFCFCFCULL;
00069
/* ff_pd_N: two doubles equal to N, 16-byte aligned. */
00070 DECLARE_ALIGNED_16(const double, ff_pd_1[2]) = { 1.0, 1.0 };
00071 DECLARE_ALIGNED_16(const double, ff_pd_2[2]) = { 2.0, 2.0 };
00072
/* Emit ASMALIGN(3) on its own (ASMALIGN is defined elsewhere; presumably an
 * assembler alignment directive - confirm). */
00073 #define JUMPALIGN() asm volatile (ASMALIGN(3)::)
/* Clear MMX register regd by xor-ing it with itself. */
00074 #define MOVQ_ZERO(regd) asm volatile ("pxor %%" #regd ", %%" #regd ::)
00075
/* Set every byte of regd to 0xFE: pcmpeqd of a register with itself gives all
 * ones, and paddb then doubles each 0xFF byte (wrapping) to 0xFE. */
00076 #define MOVQ_BFE(regd) \
00077 asm volatile ( \
00078 "pcmpeqd %%" #regd ", %%" #regd " \n\t"\
00079 "paddb %%" #regd ", %%" #regd " \n\t" ::)
00080
/* MOVQ_BONE(regd): every byte of regd = 1.
 * MOVQ_WTWO(regd): every 16-bit word of regd = 2.
 * Without PIC the value is loaded from ff_bone / ff_wtwo; with PIC it is built
 * inside the register so that no memory constant has to be addressed. */
00081 #ifndef PIC
00082 #define MOVQ_BONE(regd) asm volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_bone))
00083 #define MOVQ_WTWO(regd) asm volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_wtwo))
00084 #else
00085
00086
/* all ones -> each word >> 15 gives 1 -> packed down to bytes, each byte = 1 */
00087 #define MOVQ_BONE(regd) \
00088 asm volatile ( \
00089 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
00090 "psrlw $15, %%" #regd " \n\t" \
00091 "packuswb %%" #regd ", %%" #regd " \n\t" ::)
00092
/* all ones -> each word >> 15 gives 1 -> << 1 gives 2 */
00093 #define MOVQ_WTWO(regd) \
00094 asm volatile ( \
00095 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
00096 "psrlw $15, %%" #regd " \n\t" \
00097 "psllw $1, %%" #regd " \n\t"::)
00098
00099 #endif
00100
00101
00102
00103
/* Byte-wise average without rounding, as an asm text fragment:
 *   regr = (rega & regb) + (((rega ^ regb) & regfe) >> 1)
 * regfe has to clear the low bit of every byte before the 64-bit psrlq so that
 * no bit crosses a byte boundary (MOVQ_BFE builds such a 0xFE mask).
 * regb is overwritten; rega is left unchanged. */
00104 #define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
00105 "movq " #rega ", " #regr " \n\t"\
00106 "pand " #regb ", " #regr " \n\t"\
00107 "pxor " #rega ", " #regb " \n\t"\
00108 "pand " #regfe "," #regb " \n\t"\
00109 "psrlq $1, " #regb " \n\t"\
00110 "paddb " #regb ", " #regr " \n\t"
00111
/* Byte-wise average rounding up, as an asm text fragment:
 *   regr = (rega | regb) - (((rega ^ regb) & regfe) >> 1)
 * Same register usage as PAVGB_MMX_NO_RND: regb is overwritten. */
00112 #define PAVGB_MMX(rega, regb, regr, regfe) \
00113 "movq " #rega ", " #regr " \n\t"\
00114 "por " #regb ", " #regr " \n\t"\
00115 "pxor " #rega ", " #regb " \n\t"\
00116 "pand " #regfe "," #regb " \n\t"\
00117 "psrlq $1, " #regb " \n\t"\
00118 "psubb " #regb ", " #regr " \n\t"
00119
00120
/* Two non-rounding averages interleaved: (rega, regb) -> regr and
 * (regc, regd) -> regp. The mask is not a parameter here: it is read from
 * %%mm6, which the caller must have loaded beforehand. regb and regd are
 * overwritten. */
00121 #define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
00122 "movq " #rega ", " #regr " \n\t"\
00123 "movq " #regc ", " #regp " \n\t"\
00124 "pand " #regb ", " #regr " \n\t"\
00125 "pand " #regd ", " #regp " \n\t"\
00126 "pxor " #rega ", " #regb " \n\t"\
00127 "pxor " #regc ", " #regd " \n\t"\
00128 "pand %%mm6, " #regb " \n\t"\
00129 "pand %%mm6, " #regd " \n\t"\
00130 "psrlq $1, " #regb " \n\t"\
00131 "psrlq $1, " #regd " \n\t"\
00132 "paddb " #regb ", " #regr " \n\t"\
00133 "paddb " #regd ", " #regp " \n\t"
00134
/* Two rounding-up averages interleaved: (rega, regb) -> regr and
 * (regc, regd) -> regp, with the mask read from %%mm6. regb and regd are
 * overwritten. */
00135 #define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
00136 "movq " #rega ", " #regr " \n\t"\
00137 "movq " #regc ", " #regp " \n\t"\
00138 "por " #regb ", " #regr " \n\t"\
00139 "por " #regd ", " #regp " \n\t"\
00140 "pxor " #rega ", " #regb " \n\t"\
00141 "pxor " #regc ", " #regd " \n\t"\
00142 "pand %%mm6, " #regb " \n\t"\
00143 "pand %%mm6, " #regd " \n\t"\
00144 "psrlq $1, " #regd " \n\t"\
00145 "psrlq $1, " #regb " \n\t"\
00146 "psubb " #regb ", " #regr " \n\t"\
00147 "psubb " #regd ", " #regp " \n\t"
00148
00149
00150
/* Template instantiation. The same header is included several times with
 * different DEF / SET_RND / PAVGB / PAVGBP definitions; the functions it
 * produces are not visible here (presumably the put/avg pixel variants whose
 * names DEF() builds - confirm in the included headers). */

/* Pass 1: names of the form <x>_no_rnd_<y>_mmx, using the non-rounding
 * averages. MOVQ_WONE is not defined in this part of the file. */
00151 #define DEF(x, y) x ## _no_rnd_ ## y ##_mmx
00152 #define SET_RND MOVQ_WONE
00153 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
00154 #define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)
00155
00156 #include "dsputil_mmx_rnd.h"
00157
00158 #undef DEF
00159 #undef SET_RND
00160 #undef PAVGBP
00161 #undef PAVGB
00162
00163
00164
/* Pass 2: names of the form <x>_<y>_mmx, using the rounding averages. */
00165 #define DEF(x, y) x ## _ ## y ##_mmx
00166 #define SET_RND MOVQ_WTWO
00167 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
00168 #define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)
00169
00170 #include "dsputil_mmx_rnd.h"
00171
00172 #undef DEF
00173 #undef SET_RND
00174 #undef PAVGBP
00175 #undef PAVGB
00176
00177
00178
00179
/* Pass 3: names of the form <x>_3dnow; PAVGB is the instruction mnemonic "pavgusb". */
00180 #define DEF(x) x ## _3dnow
00181 #define PAVGB "pavgusb"
00182
00183 #include "dsputil_mmx_avg.h"
00184
00185 #undef DEF
00186 #undef PAVGB
00187
00188
00189
00190
/* Pass 4: names of the form <x>_mmx2; PAVGB is the instruction mnemonic "pavgb". */
00191 #define DEF(x) x ## _mmx2
00192
00193
00194 #define PAVGB "pavgb"
00195
00196 #include "dsputil_mmx_avg.h"
00197
00198 #undef DEF
00199 #undef PAVGB
00200
/* Name aliases: a plain copy needs no averaging, so the no_rnd, mmx2 and 3dnow
 * spellings of the whole-pixel put functions all resolve to the MMX copies. */
00201 #define put_no_rnd_pixels16_mmx put_pixels16_mmx
00202 #define put_no_rnd_pixels8_mmx put_pixels8_mmx
00203 #define put_pixels16_mmx2 put_pixels16_mmx
00204 #define put_pixels8_mmx2 put_pixels8_mmx
00205 #define put_pixels4_mmx2 put_pixels4_mmx
00206 #define put_no_rnd_pixels16_mmx2 put_no_rnd_pixels16_mmx
00207 #define put_no_rnd_pixels8_mmx2 put_no_rnd_pixels8_mmx
00208 #define put_pixels16_3dnow put_pixels16_mmx
00209 #define put_pixels8_3dnow put_pixels8_mmx
00210 #define put_pixels4_3dnow put_pixels4_mmx
00211 #define put_no_rnd_pixels16_3dnow put_no_rnd_pixels16_mmx
00212 #define put_no_rnd_pixels8_3dnow put_no_rnd_pixels8_mmx
00213
00214
00215
00216
00217 void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
00218 {
00219 const DCTELEM *p;
00220 uint8_t *pix;
00221
00222
00223 p = block;
00224 pix = pixels;
00225
00226 asm volatile(
00227 "movq %3, %%mm0 \n\t"
00228 "movq 8%3, %%mm1 \n\t"
00229 "movq 16%3, %%mm2 \n\t"
00230 "movq 24%3, %%mm3 \n\t"
00231 "movq 32%3, %%mm4 \n\t"
00232 "movq 40%3, %%mm5 \n\t"
00233 "movq 48%3, %%mm6 \n\t"
00234 "movq 56%3, %%mm7 \n\t"
00235 "packuswb %%mm1, %%mm0 \n\t"
00236 "packuswb %%mm3, %%mm2 \n\t"
00237 "packuswb %%mm5, %%mm4 \n\t"
00238 "packuswb %%mm7, %%mm6 \n\t"
00239 "movq %%mm0, (%0) \n\t"
00240 "movq %%mm2, (%0, %1) \n\t"
00241 "movq %%mm4, (%0, %1, 2) \n\t"
00242 "movq %%mm6, (%0, %2) \n\t"
00243 ::"r" (pix), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3), "m"(*p)
00244 :"memory");
00245 pix += line_size*4;
00246 p += 32;
00247
00248
00249
00250
00251 asm volatile(
00252 "movq (%3), %%mm0 \n\t"
00253 "movq 8(%3), %%mm1 \n\t"
00254 "movq 16(%3), %%mm2 \n\t"
00255 "movq 24(%3), %%mm3 \n\t"
00256 "movq 32(%3), %%mm4 \n\t"
00257 "movq 40(%3), %%mm5 \n\t"
00258 "movq 48(%3), %%mm6 \n\t"
00259 "movq 56(%3), %%mm7 \n\t"
00260 "packuswb %%mm1, %%mm0 \n\t"
00261 "packuswb %%mm3, %%mm2 \n\t"
00262 "packuswb %%mm5, %%mm4 \n\t"
00263 "packuswb %%mm7, %%mm6 \n\t"
00264 "movq %%mm0, (%0) \n\t"
00265 "movq %%mm2, (%0, %1) \n\t"
00266 "movq %%mm4, (%0, %1, 2) \n\t"
00267 "movq %%mm6, (%0, %2) \n\t"
00268 ::"r" (pix), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3), "r"(p)
00269 :"memory");
00270 }
00271
00272 static DECLARE_ALIGNED_8(const unsigned char, vector128[8]) =
00273 { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };
00274
00275 void put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
00276 {
00277 int i;
00278
00279 movq_m2r(*vector128, mm1);
00280 for (i = 0; i < 8; i++) {
00281 movq_m2r(*(block), mm0);
00282 packsswb_m2r(*(block + 4), mm0);
00283 block += 8;
00284 paddb_r2r(mm1, mm0);
00285 movq_r2m(mm0, *pixels);
00286 pixels += line_size;
00287 }
00288 }
00289
00290 void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
00291 {
00292 const DCTELEM *p;
00293 uint8_t *pix;
00294 int i;
00295
00296
00297 p = block;
00298 pix = pixels;
00299 MOVQ_ZERO(mm7);
00300 i = 4;
00301 do {
00302 asm volatile(
00303 "movq (%2), %%mm0 \n\t"
00304 "movq 8(%2), %%mm1 \n\t"
00305 "movq 16(%2), %%mm2 \n\t"
00306 "movq 24(%2), %%mm3 \n\t"
00307 "movq %0, %%mm4 \n\t"
00308 "movq %1, %%mm6 \n\t"
00309 "movq %%mm4, %%mm5 \n\t"
00310 "punpcklbw %%mm7, %%mm4 \n\t"
00311 "punpckhbw %%mm7, %%mm5 \n\t"
00312 "paddsw %%mm4, %%mm0 \n\t"
00313 "paddsw %%mm5, %%mm1 \n\t"
00314 "movq %%mm6, %%mm5 \n\t"
00315 "punpcklbw %%mm7, %%mm6 \n\t"
00316 "punpckhbw %%mm7, %%mm5 \n\t"
00317 "paddsw %%mm6, %%mm2 \n\t"
00318 "paddsw %%mm5, %%mm3 \n\t"
00319 "packuswb %%mm1, %%mm0 \n\t"
00320 "packuswb %%mm3, %%mm2 \n\t"
00321 "movq %%mm0, %0 \n\t"
00322 "movq %%mm2, %1 \n\t"
00323 :"+m"(*pix), "+m"(*(pix+line_size))
00324 :"r"(p)
00325 :"memory");
00326 pix += line_size*2;
00327 p += 16;
00328 } while (--i);
00329 }
00330
/**
 * Copy a 4-byte-wide column of h rows from pixels to block.
 * Both pointers advance by line_size bytes per row. Four rows are copied per
 * loop pass and the loop ends only when h, decremented by 4 each pass, reaches
 * exactly zero, so h has to be a positive multiple of 4.
 */
00331 static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
00332 {
00333 asm volatile(
/* REG_a = 2*line_size, the step applied after each pair of rows */
00334 "lea (%3, %3), %%"REG_a" \n\t"
00335 ASMALIGN(3)
00336 "1: \n\t"
00337 "movd (%1), %%mm0 \n\t"
00338 "movd (%1, %3), %%mm1 \n\t"
00339 "movd %%mm0, (%2) \n\t"
00340 "movd %%mm1, (%2, %3) \n\t"
00341 "add %%"REG_a", %1 \n\t"
00342 "add %%"REG_a", %2 \n\t"
00343 "movd (%1), %%mm0 \n\t"
00344 "movd (%1, %3), %%mm1 \n\t"
00345 "movd %%mm0, (%2) \n\t"
00346 "movd %%mm1, (%2, %3) \n\t"
00347 "add %%"REG_a", %1 \n\t"
00348 "add %%"REG_a", %2 \n\t"
00349 "subl $4, %0 \n\t"
00350 "jnz 1b \n\t"
00351 : "+g"(h), "+r" (pixels), "+r" (block)
00352 : "r"((x86_reg)line_size)
00353 : "%"REG_a, "memory"
00354 );
00355 }
00356
/**
 * Copy an 8-byte-wide column of h rows from pixels to block.
 * Both pointers advance by line_size bytes per row. Four rows are copied per
 * loop pass and the loop ends only when h, decremented by 4 each pass, reaches
 * exactly zero, so h has to be a positive multiple of 4.
 */
00357 static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
00358 {
00359 asm volatile(
/* REG_a = 2*line_size, the step applied after each pair of rows */
00360 "lea (%3, %3), %%"REG_a" \n\t"
00361 ASMALIGN(3)
00362 "1: \n\t"
00363 "movq (%1), %%mm0 \n\t"
00364 "movq (%1, %3), %%mm1 \n\t"
00365 "movq %%mm0, (%2) \n\t"
00366 "movq %%mm1, (%2, %3) \n\t"
00367 "add %%"REG_a", %1 \n\t"
00368 "add %%"REG_a", %2 \n\t"
00369 "movq (%1), %%mm0 \n\t"
00370 "movq (%1, %3), %%mm1 \n\t"
00371 "movq %%mm0, (%2) \n\t"
00372 "movq %%mm1, (%2, %3) \n\t"
00373 "add %%"REG_a", %1 \n\t"
00374 "add %%"REG_a", %2 \n\t"
00375 "subl $4, %0 \n\t"
00376 "jnz 1b \n\t"
00377 : "+g"(h), "+r" (pixels), "+r" (block)
00378 : "r"((x86_reg)line_size)
00379 : "%"REG_a, "memory"
00380 );
00381 }
00382
/**
 * Copy a 16-byte-wide column of h rows from pixels to block, as two 8-byte
 * moves per row. Both pointers advance by line_size bytes per row. Four rows
 * are copied per loop pass and the loop ends only when h, decremented by 4
 * each pass, reaches exactly zero, so h has to be a positive multiple of 4.
 */
00383 static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
00384 {
00385 asm volatile(
/* REG_a = 2*line_size, the step applied after each pair of rows */
00386 "lea (%3, %3), %%"REG_a" \n\t"
00387 ASMALIGN(3)
00388 "1: \n\t"
00389 "movq (%1), %%mm0 \n\t"
00390 "movq 8(%1), %%mm4 \n\t"
00391 "movq (%1, %3), %%mm1 \n\t"
00392 "movq 8(%1, %3), %%mm5 \n\t"
00393 "movq %%mm0, (%2) \n\t"
00394 "movq %%mm4, 8(%2) \n\t"
00395 "movq %%mm1, (%2, %3) \n\t"
00396 "movq %%mm5, 8(%2, %3) \n\t"
00397 "add %%"REG_a", %1 \n\t"
00398 "add %%"REG_a", %2 \n\t"
00399 "movq (%1), %%mm0 \n\t"
00400 "movq 8(%1), %%mm4 \n\t"
00401 "movq (%1, %3), %%mm1 \n\t"
00402 "movq 8(%1, %3), %%mm5 \n\t"
00403 "movq %%mm0, (%2) \n\t"
00404 "movq %%mm4, 8(%2) \n\t"
00405 "movq %%mm1, (%2, %3) \n\t"
00406 "movq %%mm5, 8(%2, %3) \n\t"
00407 "add %%"REG_a", %1 \n\t"
00408 "add %%"REG_a", %2 \n\t"
00409 "subl $4, %0 \n\t"
00410 "jnz 1b \n\t"
00411 : "+g"(h), "+r" (pixels), "+r" (block)
00412 : "r"((x86_reg)line_size)
00413 : "%"REG_a, "memory"
00414 );
00415 }
00416
/**
 * Copy a 16-byte-wide column of h rows from pixels to block, four rows per
 * loop pass. Loads use movdqu (no alignment requirement on pixels); stores use
 * movdqa, so every destination row has to be 16-byte aligned. The loop ends
 * only when h, decremented by 4 each pass, reaches exactly zero.
 */
00417 static void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
00418 {
00419 asm volatile(
00420 "1: \n\t"
00421 "movdqu (%1), %%xmm0 \n\t"
00422 "movdqu (%1,%3), %%xmm1 \n\t"
00423 "movdqu (%1,%3,2), %%xmm2 \n\t"
00424 "movdqu (%1,%4), %%xmm3 \n\t"
00425 "movdqa %%xmm0, (%2) \n\t"
00426 "movdqa %%xmm1, (%2,%3) \n\t"
00427 "movdqa %%xmm2, (%2,%3,2) \n\t"
00428 "movdqa %%xmm3, (%2,%4) \n\t"
/* the flags set by subl are still intact at the jnz: lea does not modify them */
00429 "subl $4, %0 \n\t"
00430 "lea (%1,%3,4), %1 \n\t"
00431 "lea (%2,%3,4), %2 \n\t"
00432 "jnz 1b \n\t"
00433 : "+g"(h), "+r" (pixels), "+r" (block)
00434 : "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size)
00435 : "memory"
00436 );
00437 }
00438
/**
 * Average a 16-byte-wide column of h rows from pixels into block:
 * each destination byte becomes pavgb(source byte, destination byte).
 * Four rows are handled per loop pass. The source is loaded with movdqu; the
 * destination is accessed as a pavgb memory operand and with movdqa, so every
 * destination row has to be 16-byte aligned. The loop ends only when h,
 * decremented by 4 each pass, reaches exactly zero.
 */
00439 static void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
00440 {
00441 asm volatile(
00442 "1: \n\t"
00443 "movdqu (%1), %%xmm0 \n\t"
00444 "movdqu (%1,%3), %%xmm1 \n\t"
00445 "movdqu (%1,%3,2), %%xmm2 \n\t"
00446 "movdqu (%1,%4), %%xmm3 \n\t"
00447 "pavgb (%2), %%xmm0 \n\t"
00448 "pavgb (%2,%3), %%xmm1 \n\t"
00449 "pavgb (%2,%3,2), %%xmm2 \n\t"
00450 "pavgb (%2,%4), %%xmm3 \n\t"
00451 "movdqa %%xmm0, (%2) \n\t"
00452 "movdqa %%xmm1, (%2,%3) \n\t"
00453 "movdqa %%xmm2, (%2,%3,2) \n\t"
00454 "movdqa %%xmm3, (%2,%4) \n\t"
/* the flags set by subl are still intact at the jnz: lea does not modify them */
00455 "subl $4, %0 \n\t"
00456 "lea (%1,%3,4), %1 \n\t"
00457 "lea (%2,%3,4), %2 \n\t"
00458 "jnz 1b \n\t"
00459 : "+g"(h), "+r" (pixels), "+r" (block)
00460 : "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size)
00461 : "memory"
00462 );
00463 }
00464
00465 static void clear_blocks_mmx(DCTELEM *blocks)
00466 {
00467 asm volatile(
00468 "pxor %%mm7, %%mm7 \n\t"
00469 "mov $-128*6, %%"REG_a" \n\t"
00470 "1: \n\t"
00471 "movq %%mm7, (%0, %%"REG_a") \n\t"
00472 "movq %%mm7, 8(%0, %%"REG_a") \n\t"
00473 "movq %%mm7, 16(%0, %%"REG_a") \n\t"
00474 "movq %%mm7, 24(%0, %%"REG_a") \n\t"
00475 "add $32, %%"REG_a" \n\t"
00476 " js 1b \n\t"
00477 : : "r" (((uint8_t *)blocks)+128*6)
00478 : "%"REG_a
00479 );
00480 }
00481
00482 static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
00483 x86_reg i=0;
00484 asm volatile(
00485 "jmp 2f \n\t"
00486 "1: \n\t"
00487 "movq (%1, %0), %%mm0 \n\t"
00488 "movq (%2, %0), %%mm1 \n\t"
00489 "paddb %%mm0, %%mm1 \n\t"
00490 "movq %%mm1, (%2, %0) \n\t"
00491 "movq 8(%1, %0), %%mm0 \n\t"
00492 "movq 8(%2, %0), %%mm1 \n\t"
00493 "paddb %%mm0, %%mm1 \n\t"
00494 "movq %%mm1, 8(%2, %0) \n\t"
00495 "add $16, %0 \n\t"
00496 "2: \n\t"
00497 "cmp %3, %0 \n\t"
00498 " js 1b \n\t"
00499 : "+r" (i)
00500 : "r"(src), "r"(dst), "r"((x86_reg)w-15)
00501 );
00502 for(; i<w; i++)
00503 dst[i+0] += src[i+0];
00504 }
00505
00506 static void add_bytes_l2_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
00507 x86_reg i=0;
00508 asm volatile(
00509 "jmp 2f \n\t"
00510 "1: \n\t"
00511 "movq (%2, %0), %%mm0 \n\t"
00512 "movq 8(%2, %0), %%mm1 \n\t"
00513 "paddb (%3, %0), %%mm0 \n\t"
00514 "paddb 8(%3, %0), %%mm1 \n\t"
00515 "movq %%mm0, (%1, %0) \n\t"
00516 "movq %%mm1, 8(%1, %0) \n\t"
00517 "add $16, %0 \n\t"
00518 "2: \n\t"
00519 "cmp %4, %0 \n\t"
00520 " js 1b \n\t"
00521 : "+r" (i)
00522 : "r"(dst), "r"(src1), "r"(src2), "r"((x86_reg)w-15)
00523 );
00524 for(; i<w; i++)
00525 dst[i] = src1[i] + src2[i];
00526 }
00527
/* Asm text fragment shared by the two H.263 loop filters below.
 *
 * Operands expected from the enclosing asm statement:
 *   %0 .. %3  four consecutive 8-byte lines across the edge (memory)
 *   %4        value loaded with movd and replicated into every byte
 *             (both users pass 2*strength)
 *   %5        byte mask applied before the ">> 2" (both users pass ff_pb_FC)
 *
 * The fragment itself stores nothing. On exit the filtered lines are held in
 *   mm5 -> line %0,  mm3 -> line %1,  mm4 -> line %2,  mm6 -> line %3
 * and all of mm0..mm7 have been used as scratch. */
00528 #define H263_LOOP_FILTER \
00529 "pxor %%mm7, %%mm7 \n\t"\
00530 "movq %0, %%mm0 \n\t"\
00531 "movq %0, %%mm1 \n\t"\
00532 "movq %3, %%mm2 \n\t"\
00533 "movq %3, %%mm3 \n\t"\
00534 "punpcklbw %%mm7, %%mm0 \n\t"\
00535 "punpckhbw %%mm7, %%mm1 \n\t"\
00536 "punpcklbw %%mm7, %%mm2 \n\t"\
00537 "punpckhbw %%mm7, %%mm3 \n\t"\
00538 "psubw %%mm2, %%mm0 \n\t"\
00539 "psubw %%mm3, %%mm1 \n\t"\
00540 "movq %1, %%mm2 \n\t"\
00541 "movq %1, %%mm3 \n\t"\
00542 "movq %2, %%mm4 \n\t"\
00543 "movq %2, %%mm5 \n\t"\
00544 "punpcklbw %%mm7, %%mm2 \n\t"\
00545 "punpckhbw %%mm7, %%mm3 \n\t"\
00546 "punpcklbw %%mm7, %%mm4 \n\t"\
00547 "punpckhbw %%mm7, %%mm5 \n\t"\
00548 "psubw %%mm2, %%mm4 \n\t"\
00549 "psubw %%mm3, %%mm5 \n\t"\
00550 "psllw $2, %%mm4 \n\t"\
00551 "psllw $2, %%mm5 \n\t"\
00552 "paddw %%mm0, %%mm4 \n\t"\
00553 "paddw %%mm1, %%mm5 \n\t"\
00554 "pxor %%mm6, %%mm6 \n\t"\
00555 "pcmpgtw %%mm4, %%mm6 \n\t"\
00556 "pcmpgtw %%mm5, %%mm7 \n\t"\
00557 "pxor %%mm6, %%mm4 \n\t"\
00558 "pxor %%mm7, %%mm5 \n\t"\
00559 "psubw %%mm6, %%mm4 \n\t"\
00560 "psubw %%mm7, %%mm5 \n\t"\
00561 "psrlw $3, %%mm4 \n\t"\
00562 "psrlw $3, %%mm5 \n\t"\
00563 "packuswb %%mm5, %%mm4 \n\t"\
00564 "packsswb %%mm7, %%mm6 \n\t"\
00565 "pxor %%mm7, %%mm7 \n\t"\
00566 "movd %4, %%mm2 \n\t"\
00567 "punpcklbw %%mm2, %%mm2 \n\t"\
00568 "punpcklbw %%mm2, %%mm2 \n\t"\
00569 "punpcklbw %%mm2, %%mm2 \n\t"\
00570 "psubusb %%mm4, %%mm2 \n\t"\
00571 "movq %%mm2, %%mm3 \n\t"\
00572 "psubusb %%mm4, %%mm3 \n\t"\
00573 "psubb %%mm3, %%mm2 \n\t"\
00574 "movq %1, %%mm3 \n\t"\
00575 "movq %2, %%mm4 \n\t"\
00576 "pxor %%mm6, %%mm3 \n\t"\
00577 "pxor %%mm6, %%mm4 \n\t"\
00578 "paddusb %%mm2, %%mm3 \n\t"\
00579 "psubusb %%mm2, %%mm4 \n\t"\
00580 "pxor %%mm6, %%mm3 \n\t"\
00581 "pxor %%mm6, %%mm4 \n\t"\
00582 "paddusb %%mm2, %%mm2 \n\t"\
00583 "packsswb %%mm1, %%mm0 \n\t"\
00584 "pcmpgtb %%mm0, %%mm7 \n\t"\
00585 "pxor %%mm7, %%mm0 \n\t"\
00586 "psubb %%mm7, %%mm0 \n\t"\
00587 "movq %%mm0, %%mm1 \n\t"\
00588 "psubusb %%mm2, %%mm0 \n\t"\
00589 "psubb %%mm0, %%mm1 \n\t"\
00590 "pand %5, %%mm1 \n\t"\
00591 "psrlw $2, %%mm1 \n\t"\
00592 "pxor %%mm7, %%mm1 \n\t"\
00593 "psubb %%mm7, %%mm1 \n\t"\
00594 "movq %0, %%mm5 \n\t"\
00595 "movq %3, %%mm6 \n\t"\
00596 "psubb %%mm1, %%mm5 \n\t"\
00597 "paddb %%mm1, %%mm6 \n\t"
00598
00599 static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){
00600 if(ENABLE_ANY_H263) {
00601 const int strength= ff_h263_loop_filter_strength[qscale];
00602
00603 asm volatile(
00604
00605 H263_LOOP_FILTER
00606
00607 "movq %%mm3, %1 \n\t"
00608 "movq %%mm4, %2 \n\t"
00609 "movq %%mm5, %0 \n\t"
00610 "movq %%mm6, %3 \n\t"
00611 : "+m" (*(uint64_t*)(src - 2*stride)),
00612 "+m" (*(uint64_t*)(src - 1*stride)),
00613 "+m" (*(uint64_t*)(src + 0*stride)),
00614 "+m" (*(uint64_t*)(src + 1*stride))
00615 : "g" (2*strength), "m"(ff_pb_FC)
00616 );
00617 }
00618 }
00619
/**
 * Transpose a 4x4 block of bytes: dst[r*dst_stride + c] = src[c*src_stride + r]
 * for r, c in 0..3.
 *
 * @param dst        destination block, four rows of 4 bytes
 * @param src        source block, four rows of 4 bytes
 * @param dst_stride distance in bytes between destination rows
 * @param src_stride distance in bytes between source rows
 */
static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){
    uint32_t *const out0 = (uint32_t*)(dst + 0*dst_stride);
    uint32_t *const out1 = (uint32_t*)(dst + 1*dst_stride);
    uint32_t *const out2 = (uint32_t*)(dst + 2*dst_stride);
    uint32_t *const out3 = (uint32_t*)(dst + 3*dst_stride);
    uint32_t *const in0  = (uint32_t*)(src + 0*src_stride);
    uint32_t *const in1  = (uint32_t*)(src + 1*src_stride);
    uint32_t *const in2  = (uint32_t*)(src + 2*src_stride);
    uint32_t *const in3  = (uint32_t*)(src + 3*src_stride);

    __asm__ volatile(
        "movd %4, %%mm0 \n\t"
        "movd %5, %%mm1 \n\t"
        "movd %6, %%mm2 \n\t"
        "movd %7, %%mm3 \n\t"
        /* interleave rows 0/1 and rows 2/3 byte by byte ... */
        "punpcklbw %%mm1, %%mm0 \n\t"
        "punpcklbw %%mm3, %%mm2 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        /* ... then word by word: mm0 = columns 0,1  mm1 = columns 2,3 */
        "punpcklwd %%mm2, %%mm0 \n\t"
        "punpckhwd %%mm2, %%mm1 \n\t"
        "movd %%mm0, %0 \n\t"
        "punpckhdq %%mm0, %%mm0 \n\t"
        "movd %%mm0, %1 \n\t"
        "movd %%mm1, %2 \n\t"
        "punpckhdq %%mm1, %%mm1 \n\t"
        "movd %%mm1, %3 \n\t"

        : "=m" (*out0), "=m" (*out1), "=m" (*out2), "=m" (*out3)
        : "m" (*in0), "m" (*in1), "m" (*in2), "m" (*in3)
    );
}
00648
/**
 * H.263 loop filter across a vertical edge: for eight rows starting at src,
 * the four pixels src[-2] .. src[1] of each row are filtered in place.
 *
 * The 8x4 pixel area is first transposed into temp (stride 8), so that
 * temp[k] holds pixel column k of all eight rows and H263_LOOP_FILTER can be
 * applied exactly as in the vertical-direction filter. The results are then
 * transposed back and written out 4 bytes per row.
 *
 * @param src    pixel just right of the edge in the first row
 * @param stride distance in bytes between rows
 * @param qscale index into ff_h263_loop_filter_strength
 */
00649 static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){
00650 if(ENABLE_ANY_H263) {
00651 const int strength= ff_h263_loop_filter_strength[qscale];
00652 DECLARE_ALIGNED(8, uint64_t, temp[4]);
00653 uint8_t *btemp= (uint8_t*)temp;
00654
/* from here on src points at the leftmost of the four filtered pixels */
00655 src -= 2;
00656
/* rows 0..3 fill bytes 0..3 of every temp line, rows 4..7 fill bytes 4..7 */
00657 transpose4x4(btemp , src , 8, stride);
00658 transpose4x4(btemp+4, src + 4*stride, 8, stride);
/* The macro stores nothing: the filtered columns stay in mm5, mm3, mm4, mm6
 * (for temp[0], temp[1], temp[2], temp[3] respectively). */
00659 asm volatile(
00660 H263_LOOP_FILTER
00661
00662 : "+m" (temp[0]),
00663 "+m" (temp[1]),
00664 "+m" (temp[2]),
00665 "+m" (temp[3])
00666 : "g" (2*strength), "m"(ff_pb_FC)
00667 );
00668
/* Transpose the four result registers back and store 4 bytes to each row.
 * NOTE(review): this statement relies on mm3..mm6 still holding the values
 * left by the previous asm statement, and it writes through %0 / %1 without
 * declaring memory outputs or a "memory" clobber - keep the two statements
 * adjacent and in this order. */
00669 asm volatile(
00670 "movq %%mm5, %%mm1 \n\t"
00671 "movq %%mm4, %%mm0 \n\t"
00672 "punpcklbw %%mm3, %%mm5 \n\t"
00673 "punpcklbw %%mm6, %%mm4 \n\t"
00674 "punpckhbw %%mm3, %%mm1 \n\t"
00675 "punpckhbw %%mm6, %%mm0 \n\t"
00676 "movq %%mm5, %%mm3 \n\t"
00677 "movq %%mm1, %%mm6 \n\t"
00678 "punpcklwd %%mm4, %%mm5 \n\t"
00679 "punpcklwd %%mm0, %%mm1 \n\t"
00680 "punpckhwd %%mm4, %%mm3 \n\t"
00681 "punpckhwd %%mm0, %%mm6 \n\t"
/* rows 0..3 through %0 */
00682 "movd %%mm5, (%0) \n\t"
00683 "punpckhdq %%mm5, %%mm5 \n\t"
00684 "movd %%mm5, (%0,%2) \n\t"
00685 "movd %%mm3, (%0,%2,2) \n\t"
00686 "punpckhdq %%mm3, %%mm3 \n\t"
00687 "movd %%mm3, (%0,%3) \n\t"
/* rows 4..7 through %1 */
00688 "movd %%mm1, (%1) \n\t"
00689 "punpckhdq %%mm1, %%mm1 \n\t"
00690 "movd %%mm1, (%1,%2) \n\t"
00691 "movd %%mm6, (%1,%2,2) \n\t"
00692 "punpckhdq %%mm6, %%mm6 \n\t"
00693 "movd %%mm6, (%1,%3) \n\t"
00694 :: "r" (src),
00695 "r" (src + 4*stride),
00696 "r" ((x86_reg) stride ),
00697 "r" ((x86_reg)(3*stride))
00698 );
00699 }
00700 }
00701
00702
00703
00704 static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w)
00705 {
00706 uint8_t *ptr, *last_line;
00707 int i;
00708
00709 last_line = buf + (height - 1) * wrap;
00710
00711 ptr = buf;
00712 if(w==8)
00713 {
00714 asm volatile(
00715 "1: \n\t"
00716 "movd (%0), %%mm0 \n\t"
00717 "punpcklbw %%mm0, %%mm0 \n\t"
00718 "punpcklwd %%mm0, %%mm0 \n\t"
00719 "punpckldq %%mm0, %%mm0 \n\t"
00720 "movq %%mm0, -8(%0) \n\t"
00721 "movq -8(%0, %2), %%mm1 \n\t"
00722 "punpckhbw %%mm1, %%mm1 \n\t"
00723 "punpckhwd %%mm1, %%mm1 \n\t"
00724 "punpckhdq %%mm1, %%mm1 \n\t"
00725 "movq %%mm1, (%0, %2) \n\t"
00726 "add %1, %0 \n\t"
00727 "cmp %3, %0 \n\t"
00728 " jb 1b \n\t"
00729 : "+r" (ptr)
00730 : "r" ((x86_reg)wrap), "r" ((x86_reg)width), "r" (ptr + wrap*height)
00731 );
00732 }
00733 else
00734 {
00735 asm volatile(
00736 "1: \n\t"
00737 "movd (%0), %%mm0 \n\t"
00738 "punpcklbw %%mm0, %%mm0 \n\t"
00739 "punpcklwd %%mm0, %%mm0 \n\t"
00740 "punpckldq %%mm0, %%mm0 \n\t"
00741 "movq %%mm0, -8(%0) \n\t"
00742 "movq %%mm0, -16(%0) \n\t"
00743 "movq -8(%0, %2), %%mm1 \n\t"
00744 "punpckhbw %%mm1, %%mm1 \n\t"
00745 "punpckhwd %%mm1, %%mm1 \n\t"
00746 "punpckhdq %%mm1, %%mm1 \n\t"
00747 "movq %%mm1, (%0, %2) \n\t"
00748 "movq %%mm1, 8(%0, %2) \n\t"
00749 "add %1, %0 \n\t"
00750 "cmp %3, %0 \n\t"
00751 " jb 1b \n\t"
00752 : "+r" (ptr)
00753 : "r" ((x86_reg)wrap), "r" ((x86_reg)width), "r" (ptr + wrap*height)
00754 );
00755 }
00756
00757 for(i=0;i<w;i+=4) {
00758
00759 ptr= buf - (i + 1) * wrap - w;
00760 asm volatile(
00761 "1: \n\t"
00762 "movq (%1, %0), %%mm0 \n\t"
00763 "movq %%mm0, (%0) \n\t"
00764 "movq %%mm0, (%0, %2) \n\t"
00765 "movq %%mm0, (%0, %2, 2) \n\t"
00766 "movq %%mm0, (%0, %3) \n\t"
00767 "add $8, %0 \n\t"
00768 "cmp %4, %0 \n\t"
00769 " jb 1b \n\t"
00770 : "+r" (ptr)
00771 : "r" ((x86_reg)buf - (x86_reg)ptr - w), "r" ((x86_reg)-wrap), "r" ((x86_reg)-wrap*3), "r" (ptr+width+2*w)
00772 );
00773 ptr= last_line + (i + 1) * wrap - w;
00774 asm volatile(
00775 "1: \n\t"
00776 "movq (%1, %0), %%mm0 \n\t"
00777 "movq %%mm0, (%0) \n\t"
00778 "movq %%mm0, (%0, %2) \n\t"
00779 "movq %%mm0, (%0, %2, 2) \n\t"
00780 "movq %%mm0, (%0, %3) \n\t"
00781 "add $8, %0 \n\t"
00782 "cmp %4, %0 \n\t"
00783 " jb 1b \n\t"
00784 : "+r" (ptr)
00785 : "r" ((x86_reg)last_line - (x86_reg)ptr - w), "r" ((x86_reg)wrap), "r" ((x86_reg)wrap*3), "r" (ptr+width+2*w)
00786 );
00787 }
00788 }
00789
/* PAETH(cpu, abs3) defines add_png_paeth_prediction_<cpu>(dst, src, top, w, bpp).
 *
 * What the generated function does, as far as the asm shows:
 *  - the index starts at -bpp, so dst[-bpp .. ] and top[-bpp .. ] are read
 *    before the loop; the index then advances by bpp per pass;
 *  - each pass loads 4 bytes from top and src at the current index, widens
 *    them to 16-bit words, selects per lane one of three candidates (the
 *    previous result kept in mm0, the current top value, the previous top
 *    value) from the three absolute differences produced by abs3, adds the
 *    selected value to the src bytes, masks the sum with ff_pw_255 and stores
 *    4 bytes to dst at the current index;
 *  - the loop repeats while the index is <= w-3.
 * abs3 is an asm text fragment that must replace mm3, mm4 and mm5 by their
 * absolute values and leave mm7 zero (see ABS3_MMX2 / ABS3_SSSE3).
 * NOTE(review): the name indicates the PNG Paeth predictor; the exact mapping
 * of the candidates to "left / up / upper-left" should be confirmed against
 * the C reference implementation. */
00790 #define PAETH(cpu, abs3)\
00791 void add_png_paeth_prediction_##cpu(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp)\
00792 {\
00793 x86_reg i = -bpp;\
00794 x86_reg end = w-3;\
00795 asm volatile(\
00796 "pxor %%mm7, %%mm7 \n"\
00797 "movd (%1,%0), %%mm0 \n"\
00798 "movd (%2,%0), %%mm1 \n"\
00799 "punpcklbw %%mm7, %%mm0 \n"\
00800 "punpcklbw %%mm7, %%mm1 \n"\
00801 "add %4, %0 \n"\
00802 "1: \n"\
00803 "movq %%mm1, %%mm2 \n"\
00804 "movd (%2,%0), %%mm1 \n"\
00805 "movq %%mm2, %%mm3 \n"\
00806 "punpcklbw %%mm7, %%mm1 \n"\
00807 "movq %%mm2, %%mm4 \n"\
00808 "psubw %%mm1, %%mm3 \n"\
00809 "psubw %%mm0, %%mm4 \n"\
00810 "movq %%mm3, %%mm5 \n"\
00811 "paddw %%mm4, %%mm5 \n"\
00812 abs3\
00813 "movq %%mm4, %%mm6 \n"\
00814 "pminsw %%mm5, %%mm6 \n"\
00815 "pcmpgtw %%mm6, %%mm3 \n"\
00816 "pcmpgtw %%mm5, %%mm4 \n"\
00817 "movq %%mm4, %%mm6 \n"\
00818 "pand %%mm3, %%mm4 \n"\
00819 "pandn %%mm3, %%mm6 \n"\
00820 "pandn %%mm0, %%mm3 \n"\
00821 "movd (%3,%0), %%mm0 \n"\
00822 "pand %%mm1, %%mm6 \n"\
00823 "pand %%mm4, %%mm2 \n"\
00824 "punpcklbw %%mm7, %%mm0 \n"\
00825 "movq %6, %%mm5 \n"\
00826 "paddw %%mm6, %%mm0 \n"\
00827 "paddw %%mm2, %%mm3 \n"\
00828 "paddw %%mm3, %%mm0 \n"\
00829 "pand %%mm5, %%mm0 \n"\
00830 "movq %%mm0, %%mm3 \n"\
00831 "packuswb %%mm3, %%mm3 \n"\
00832 "movd %%mm3, (%1,%0) \n"\
00833 "add %4, %0 \n"\
00834 "cmp %5, %0 \n"\
00835 "jle 1b \n"\
00836 :"+r"(i)\
00837 :"r"(dst), "r"(top), "r"(src), "r"((x86_reg)bpp), "g"(end),\
00838 "m"(ff_pw_255)\
00839 :"memory"\
00840 );\
00841 }
00842
/* abs3 fragment for PAETH without pabsw: |x| is computed as max(x, 0 - x) for
 * mm5, mm3 and mm4. It expects mm7 == 0 on entry (used for the first
 * negation), uses mm6 and mm7 as scratch and clears mm7 again at the end. */
00843 #define ABS3_MMX2\
00844 "psubw %%mm5, %%mm7 \n"\
00845 "pmaxsw %%mm7, %%mm5 \n"\
00846 "pxor %%mm6, %%mm6 \n"\
00847 "pxor %%mm7, %%mm7 \n"\
00848 "psubw %%mm3, %%mm6 \n"\
00849 "psubw %%mm4, %%mm7 \n"\
00850 "pmaxsw %%mm6, %%mm3 \n"\
00851 "pmaxsw %%mm7, %%mm4 \n"\
00852 "pxor %%mm7, %%mm7 \n"
00853
/* abs3 fragment for PAETH using pabsw directly; mm6 and mm7 are not touched. */
00854 #define ABS3_SSSE3\
00855 "pabsw %%mm3, %%mm3 \n"\
00856 "pabsw %%mm4, %%mm4 \n"\
00857 "pabsw %%mm5, %%mm5 \n"
00858
/* Instantiate add_png_paeth_prediction_mmx2 and, when HAVE_SSSE3 is defined,
 * add_png_paeth_prediction_ssse3. */
00859 PAETH(mmx2, ABS3_MMX2)
00860 #ifdef HAVE_SSSE3
00861 PAETH(ssse3, ABS3_SSSE3)
00862 #endif
00863
/* Asm text fragment producing one group of lowpass-filtered output pixels from
 * 16-bit word lanes:
 *
 *   out = OP( pack( ( 20*(m3 + m4) - 6*(in2 + m5) + 3*(in1 + m6)
 *                     - (in0 + in7) + rnd ) >> 5 ) )
 *
 * m3..m6 are registers holding four input lines; in0, in1, in2 and in7 are
 * operands (memory or register) for the remaining ones. The sum is shifted
 * arithmetically and packed to bytes with unsigned saturation before OP()
 * writes it to out, with %%mm7 passed as OP's third argument and d as its
 * size suffix.
 *
 * Side effects: m3 is replaced by in7; mm4, mm5 and mm6 are used as scratch
 * (so the fragment assumes the m* arguments are mapped onto those registers in
 * a compatible way by the caller). The pw_20 and pw_3 parameters are not used
 * by the body: the constants are always taken from MANGLE(ff_pw_20) and
 * MANGLE(ff_pw_3). */
00864 #define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\
00865 "paddw " #m4 ", " #m3 " \n\t" \
00866 "movq "MANGLE(ff_pw_20)", %%mm4 \n\t" \
00867 "pmullw " #m3 ", %%mm4 \n\t" \
00868 "movq "#in7", " #m3 " \n\t" \
00869 "movq "#in0", %%mm5 \n\t" \
00870 "paddw " #m3 ", %%mm5 \n\t" \
00871 "psubw %%mm5, %%mm4 \n\t" \
00872 "movq "#in1", %%mm5 \n\t" \
00873 "movq "#in2", %%mm6 \n\t" \
00874 "paddw " #m6 ", %%mm5 \n\t" \
00875 "paddw " #m5 ", %%mm6 \n\t" \
00876 "paddw %%mm6, %%mm6 \n\t" \
00877 "psubw %%mm6, %%mm5 \n\t" \
00878 "pmullw "MANGLE(ff_pw_3)", %%mm5 \n\t" \
00879 "paddw " #rnd ", %%mm4 \n\t" \
00880 "paddw %%mm4, %%mm5 \n\t" \
00881 "psraw $5, %%mm5 \n\t"\
00882 "packuswb %%mm5, %%mm5 \n\t"\
00883 OP(%%mm5, out, %%mm7, d)
00884
00885 #define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\
00886 static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
00887 uint64_t temp;\
00888 \
00889 asm volatile(\
00890 "pxor %%mm7, %%mm7 \n\t"\
00891 "1: \n\t"\
00892 "movq (%0), %%mm0 \n\t" \
00893 "movq %%mm0, %%mm1 \n\t" \
00894 "movq %%mm0, %%mm2 \n\t" \
00895 "punpcklbw %%mm7, %%mm0 \n\t" \
00896 "punpckhbw %%mm7, %%mm1 \n\t" \
00897 "pshufw $0x90, %%mm0, %%mm5 \n\t" \
00898 "pshufw $0x41, %%mm0, %%mm6 \n\t" \
00899 "movq %%mm2, %%mm3 \n\t" \
00900 "movq %%mm2, %%mm4 \n\t" \
00901 "psllq $8, %%mm2 \n\t" \
00902 "psllq $16, %%mm3 \n\t" \
00903 "psllq $24, %%mm4 \n\t" \
00904 "punpckhbw %%mm7, %%mm2 \n\t" \
00905 "punpckhbw %%mm7, %%mm3 \n\t" \
00906 "punpckhbw %%mm7, %%mm4 \n\t" \
00907 "paddw %%mm3, %%mm5 \n\t" \
00908 "paddw %%mm2, %%mm6 \n\t" \
00909 "paddw %%mm5, %%mm5 \n\t" \
00910 "psubw %%mm5, %%mm6 \n\t" \
00911 "pshufw $0x06, %%mm0, %%mm5 \n\t" \
00912 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" \
00913 "paddw %%mm4, %%mm0 \n\t" \
00914 "paddw %%mm1, %%mm5 \n\t" \
00915 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" \
00916 "psubw %%mm5, %%mm0 \n\t" \
00917 "paddw %6, %%mm6 \n\t"\
00918 "paddw %%mm6, %%mm0 \n\t" \
00919 "psraw $5, %%mm0 \n\t"\
00920 "movq %%mm0, %5 \n\t"\
00921 \
00922 \
00923 "movq 5(%0), %%mm0 \n\t" \
00924 "movq %%mm0, %%mm5 \n\t" \
00925 "movq %%mm0, %%mm6 \n\t" \
00926 "psrlq $8, %%mm0 \n\t" \
00927 "psrlq $16, %%mm5 \n\t" \
00928 "punpcklbw %%mm7, %%mm0 \n\t" \
00929 "punpcklbw %%mm7, %%mm5 \n\t" \
00930 "paddw %%mm0, %%mm2 \n\t" \
00931 "paddw %%mm5, %%mm3 \n\t" \
00932 "paddw %%mm2, %%mm2 \n\t" \
00933 "psubw %%mm2, %%mm3 \n\t" \
00934 "movq %%mm6, %%mm2 \n\t" \
00935 "psrlq $24, %%mm6 \n\t" \
00936 "punpcklbw %%mm7, %%mm2 \n\t" \
00937 "punpcklbw %%mm7, %%mm6 \n\t" \
00938 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" \
00939 "paddw %%mm2, %%mm1 \n\t" \
00940 "paddw %%mm6, %%mm4 \n\t" \
00941 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" \
00942 "psubw %%mm4, %%mm3 \n\t" \
00943 "paddw %6, %%mm1 \n\t"\
00944 "paddw %%mm1, %%mm3 \n\t" \
00945 "psraw $5, %%mm3 \n\t"\
00946 "movq %5, %%mm1 \n\t"\
00947 "packuswb %%mm3, %%mm1 \n\t"\
00948 OP_MMX2(%%mm1, (%1),%%mm4, q)\
00949 \
00950 \
00951 "movq 9(%0), %%mm1 \n\t" \
00952 "movq %%mm1, %%mm4 \n\t" \
00953 "movq %%mm1, %%mm3 \n\t" \
00954 "psrlq $8, %%mm1 \n\t" \
00955 "psrlq $16, %%mm4 \n\t" \
00956 "punpcklbw %%mm7, %%mm1 \n\t" \
00957 "punpcklbw %%mm7, %%mm4 \n\t" \
00958 "paddw %%mm1, %%mm5 \n\t" \
00959 "paddw %%mm4, %%mm0 \n\t" \
00960 "paddw %%mm5, %%mm5 \n\t" \
00961 "psubw %%mm5, %%mm0 \n\t" \
00962 "movq %%mm3, %%mm5 \n\t" \
00963 "psrlq $24, %%mm3 \n\t" \
00964 "pmullw "MANGLE(ff_pw_3)", %%mm0 \n\t" \
00965 "punpcklbw %%mm7, %%mm3 \n\t" \
00966 "paddw %%mm3, %%mm2 \n\t" \
00967 "psubw %%mm2, %%mm0 \n\t" \
00968 "movq %%mm5, %%mm2 \n\t" \
00969 "punpcklbw %%mm7, %%mm2 \n\t" \
00970 "punpckhbw %%mm7, %%mm5 \n\t" \
00971 "paddw %%mm2, %%mm6 \n\t" \
00972 "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" \
00973 "paddw %6, %%mm0 \n\t"\
00974 "paddw %%mm6, %%mm0 \n\t" \
00975 "psraw $5, %%mm0 \n\t"\
00976 \
00977 \
00978 "paddw %%mm5, %%mm3 \n\t" \
00979 "pshufw $0xF9, %%mm5, %%mm6 \n\t" \
00980 "paddw %%mm4, %%mm6 \n\t" \
00981 "pshufw $0xBE, %%mm5, %%mm4 \n\t" \
00982 "pshufw $0x6F, %%mm5, %%mm5 \n\t" \
00983 "paddw %%mm1, %%mm4 \n\t" \
00984 "paddw %%mm2, %%mm5 \n\t" \
00985 "paddw %%mm6, %%mm6 \n\t" \
00986 "psubw %%mm6, %%mm4 \n\t" \
00987 "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" \
00988 "pmullw "MANGLE(ff_pw_3)", %%mm4 \n\t" \
00989 "psubw %%mm5, %%mm3 \n\t" \
00990 "paddw %6, %%mm4 \n\t"\
00991 "paddw %%mm3, %%mm4 \n\t" \
00992 "psraw $5, %%mm4 \n\t"\
00993 "packuswb %%mm4, %%mm0 \n\t"\
00994 OP_MMX2(%%mm0, 8(%1), %%mm4, q)\
00995 \
00996 "add %3, %0 \n\t"\
00997 "add %4, %1 \n\t"\
00998 "decl %2 \n\t"\
00999 " jnz 1b \n\t"\
01000 : "+a"(src), "+c"(dst), "+D"(h)\
01001 : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), "m"(temp), "m"(ROUNDER)\
01002 : "memory"\
01003 );\
01004 }\
01005 \
/* Horizontal lowpass filter for a 16-pixel-wide block, processed one row at a time for h rows. */\
/* For each output k in 0..15 the scalar code computes */\
/*   (src[k]+src[k+1])*20 - (src[k-1]+src[k+2])*6 + (src[k-2]+src[k+3])*3 - (src[k-3]+src[k+4]) */\
/* into temp[k]; the asm then adds ROUNDER, shifts right by 5 (the taps sum to 32), */\
/* saturates to bytes and hands the result to OP_3DNOW together with the dst address. */\
/* Taps that would fall outside src[0..16] are reflected back into the row, so 17 source */\
/* bytes are read per row. dst and src advance by dstStride and srcStride after every row. */\
/* NOTE(review): OPNAME, ROUNDER and OP_3DNOW are not defined in this block; presumably they */\
/* are parameters of the enclosing macro -- confirm against the macro header. */\
01006 static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
01007 int i;\
/* 16-bit filter sums for one row, before rounding and shifting. */\
01008 int16_t temp[16];\
01009 \
01010 for(i=0; i<h; i++)\
01011 {\
/* Left edge (temp[0..2]): taps at src[-1], src[-2], src[-3] are replaced by src[0], src[1], src[2]. */\
01012 temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
01013 temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
01014 temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
/* Interior (temp[3..12]): all eight taps lie inside src[0..16]. */\
01015 temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
01016 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
01017 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]);\
01018 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]);\
01019 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]);\
01020 temp[ 8]= (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]);\
01021 temp[ 9]= (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]);\
01022 temp[10]= (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]);\
01023 temp[11]= (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]);\
01024 temp[12]= (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]);\
/* Right edge (temp[13..15]): taps at src[17], src[18], src[19] are replaced by src[16], src[15], src[14]. */\
01025 temp[13]= (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]);\
01026 temp[14]= (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]);\
01027 temp[15]= (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]);\
/* Round, shift and pack the 16 sums. Operands: %0 = temp, %1 = dst, %2 = ROUNDER (memory). */\
01028 asm volatile(\
01029 "movq (%0), %%mm0 \n\t" /* mm0 = temp[0..3] */\
01030 "movq 8(%0), %%mm1 \n\t" /* mm1 = temp[4..7] */\
01031 "paddw %2, %%mm0 \n\t" /* add ROUNDER to every 16-bit lane */\
01032 "paddw %2, %%mm1 \n\t"\
01033 "psraw $5, %%mm0 \n\t" /* arithmetic shift right by 5 */\
01034 "psraw $5, %%mm1 \n\t"\
01035 "packuswb %%mm1, %%mm0 \n\t" /* mm0 = 8 bytes for pixels 0..7, unsigned-saturated */\
/* Pixels 0..7 go to dst[0..7] through OP_3DNOW; the exact write operation is set by the */\
/* macro argument and is not visible here (mm1 is presumably passed as a scratch register). */\
01036 OP_3DNOW(%%mm0, (%1), %%mm1, q)\
01037 "movq 16(%0), %%mm0 \n\t" /* mm0 = temp[8..11] */\
01038 "movq 24(%0), %%mm1 \n\t" /* mm1 = temp[12..15] */\
01039 "paddw %2, %%mm0 \n\t"\
01040 "paddw %2, %%mm1 \n\t"\
01041 "psraw $5, %%mm0 \n\t"\
01042 "psraw $5, %%mm1 \n\t"\
01043 "packuswb %%mm1, %%mm0 \n\t" /* mm0 = 8 bytes for pixels 8..15 */\
/* Pixels 8..15 go to dst[8..15] through the same OP_3DNOW operation. */\
01044 OP_3DNOW(%%mm0, 8(%1), %%mm1, q)\
/* No outputs are declared; dst is written through the pointer, hence the memory clobber. */\
/* NOTE(review): mm0 and mm1 are modified but are not named in the clobber list. */\
01045 :: "r"(temp), "r"(dst), "m"(ROUNDER)\
01046 : "memory"\
01047 );\
/* Step both pointers to the next row. */\
01048 dst+=dstStride;\
01049 src+=srcStride;\
01050 }\
01051 }\
01052 \
/* Horizontal lowpass filter for an 8-pixel-wide block, written as one inline-asm loop over h rows. */\
/* Per output k in 0..7 the instructions below evaluate */\
/*   ((src[k]+src[k+1])*20 - (src[k-1]+src[k+2])*6 + (src[k-2]+src[k+3])*3 */\
/*    - (src[k-3]+src[k+4]) + ROUNDER) >> 5 */\
/* with taps outside src[0..8] reflected (-1,-2,-3 -> 0,1,2 and 9,10,11 -> 8,7,6). */\
/* Each row reads src[0..8] and hands 8 saturated bytes to OP_MMX2 together with the dst address. */\
/* Lane notation in the comments: A..H = src[0..7], I = src[8]; lanes are listed low to high. */\
/* Operands: %0 = src, %1 = dst, %2 = h (row counter), %3 = srcStride, %4 = dstStride, */\
/* %5 = ROUNDER (memory operand). */\
/* The row counter is tested only after a row has been processed, so this presumably */\
/* requires h >= 1 -- confirm against callers. */\
/* NOTE(review): OPNAME, ROUNDER and OP_MMX2 are not defined in this block; presumably they */\
/* are parameters of the enclosing macro -- confirm against the macro header. */\
01053 static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
01054 asm volatile(\
01055 "pxor %%mm7, %%mm7 \n\t" /* mm7 = 0, used to zero-extend bytes to words */\
01056 "1: \n\t" /* start of the per-row loop */\
01057 "movq (%0), %%mm0 \n\t" /* mm0 = A B C D E F G H (bytes) */ \
01058 "movq %%mm0, %%mm1 \n\t" /* mm1 = copy */ \
01059 "movq %%mm0, %%mm2 \n\t" /* mm2 = copy */ \
01060 "punpcklbw %%mm7, %%mm0 \n\t" /* mm0 = A B C D (words): src[k], k=0..3 */ \
01061 "punpckhbw %%mm7, %%mm1 \n\t" /* mm1 = E F G H (words): src[k+4] */ \
01062 "pshufw $0x90, %%mm0, %%mm5 \n\t" /* mm5 = A A B C: src[k-1], left edge reflected */ \
01063 "pshufw $0x41, %%mm0, %%mm6 \n\t" /* mm6 = B A A B: src[k-2], left edge reflected */ \
01064 "movq %%mm2, %%mm3 \n\t" /* mm3 = A..H (bytes) */ \
01065 "movq %%mm2, %%mm4 \n\t" /* mm4 = A..H (bytes) */ \
01066 "psllq $8, %%mm2 \n\t" /* mm2 = 0 A B C D E F G (bytes) */ \
01067 "psllq $16, %%mm3 \n\t" /* mm3 = 0 0 A B C D E F (bytes) */ \
01068 "psllq $24, %%mm4 \n\t" /* mm4 = 0 0 0 A B C D E (bytes) */ \
01069 "punpckhbw %%mm7, %%mm2 \n\t" /* mm2 = D E F G (words): src[k+3] */ \
01070 "punpckhbw %%mm7, %%mm3 \n\t" /* mm3 = C D E F (words): src[k+2] */ \
01071 "punpckhbw %%mm7, %%mm4 \n\t" /* mm4 = B C D E (words): src[k+1] */ \
01072 "paddw %%mm3, %%mm5 \n\t" /* mm5 = src[k-1]+src[k+2] */ \
01073 "paddw %%mm2, %%mm6 \n\t" /* mm6 = src[k-2]+src[k+3] */ \
01074 "paddw %%mm5, %%mm5 \n\t" /* mm5 = 2*(src[k-1]+src[k+2]) */ \
01075 "psubw %%mm5, %%mm6 \n\t" /* mm6 = (src[k-2]+src[k+3]) - 2*(src[k-1]+src[k+2]) */ \
01076 "pshufw $0x06, %%mm0, %%mm5 \n\t" /* mm5 = C B A A: src[k-3], left edge reflected */ \
01077 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* mm6 = 3*(src[k-2]+src[k+3]) - 6*(src[k-1]+src[k+2]) */ \
01078 "paddw %%mm4, %%mm0 \n\t" /* mm0 = src[k]+src[k+1] */ \
01079 "paddw %%mm1, %%mm5 \n\t" /* mm5 = src[k-3]+src[k+4] */ \
01080 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* mm0 = 20*(src[k]+src[k+1]) */ \
01081 "psubw %%mm5, %%mm0 \n\t" /* mm0 -= src[k-3]+src[k+4] */ \
01082 "paddw %5, %%mm6 \n\t" /* add ROUNDER */\
01083 "paddw %%mm6, %%mm0 \n\t" /* mm0 = rounded filter sum for outputs 0..3 */ \
01084 "psraw $5, %%mm0 \n\t" /* arithmetic shift right by 5 (taps sum to 32) */\
01085 \
01086 \
/* Second half: outputs 4..7. mm1..mm4 still hold src[k+4], src[k+3], src[k+2], src[k+1] */\
/* for k=0..3, i.e. src[k], src[k-1], src[k-2], src[k-3] for k=4..7. */\
01087 "movd 5(%0), %%mm5 \n\t" /* mm5 = F G H I (bytes src[5..8]) */ \
01088 "punpcklbw %%mm7, %%mm5 \n\t" /* mm5 = F G H I (words): src[k+1], k=4..7 */ \
01089 "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* mm6 = G H I I: src[k+2], right edge reflected */ \
01090 "paddw %%mm5, %%mm1 \n\t" /* mm1 = E+F F+G G+H H+I: src[k]+src[k+1] */ \
01091 "paddw %%mm6, %%mm2 \n\t" /* mm2 = D+G E+H F+I G+I: src[k-1]+src[k+2] */ \
01092 "pshufw $0xBE, %%mm5, %%mm6 \n\t" /* mm6 = H I I H: src[k+3], right edge reflected */ \
01093 "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* mm5 = I I H G: src[k+4], right edge reflected */ \
01094 "paddw %%mm6, %%mm3 \n\t" /* mm3 = C+H D+I E+I F+H: src[k-2]+src[k+3] */ \
01095 "paddw %%mm5, %%mm4 \n\t" /* mm4 = B+I C+I D+H E+G: src[k-3]+src[k+4] */ \
01096 "paddw %%mm2, %%mm2 \n\t" /* mm2 = 2*(src[k-1]+src[k+2]) */ \
01097 "psubw %%mm2, %%mm3 \n\t" /* mm3 = (src[k-2]+src[k+3]) - 2*(src[k-1]+src[k+2]) */ \
01098 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* mm1 = 20*(src[k]+src[k+1]) */ \
01099 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* mm3 = 3*(src[k-2]+src[k+3]) - 6*(src[k-1]+src[k+2]) */ \
01100 "psubw %%mm4, %%mm3 \n\t" /* mm3 -= src[k-3]+src[k+4] */ \
01101 "paddw %5, %%mm1 \n\t" /* add ROUNDER */\
01102 "paddw %%mm1, %%mm3 \n\t" /* mm3 = rounded filter sum for outputs 4..7 */ \
01103 "psraw $5, %%mm3 \n\t" /* arithmetic shift right by 5 */\
01104 "packuswb %%mm3, %%mm0 \n\t" /* mm0 = 8 result bytes, unsigned-saturated */\
/* The 8 bytes go to dst[0..7] through OP_MMX2; the exact write operation is set by the */\
/* macro argument and is not visible here (mm4 is presumably passed as a scratch register). */\
01105 OP_MMX2(%%mm0, (%1), %%mm4, q)\
01106 \
01107 "add %3, %0 \n\t" /* src += srcStride */\
01108 "add %4, %1 \n\t" /* dst += dstStride */\
01109 "decl %2 \n\t" /* h-- */\
01110 " jnz 1b \n\t" /* next row while h != 0 */\
/* src, dst and h are read-write operands pinned to the a, c and d registers. */\
01111 : "+a"(src), "+c"(dst), "+d"(h)\
/* Strides are widened to x86_reg so they can be added to the pointer registers. */\
01112 : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ROUNDER)\
01113 : "memory"\
01114 );\
01115 }\
01116 \
01117 static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
01118 int i;\
01119 int16_t temp[8];\
01120 \
01121 for(i=0; i<h; i++)\
01122 {\
01123 temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
01124 temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
01125 temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
01126 temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
01127 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
01128 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]);\
01129 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\
01130 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\
01131 asm volatile(\
01132 "movq (%0), %%mm0 \n\t"\
01133 "movq 8(%0), %%mm1 \n\t"\
01134 "paddw %2, %%mm0 \n\t"\
01135 "paddw %2, %%mm1 \n\t"\
01136 "psraw $5, %%mm0