dsputil_mmx.c

Go to the documentation of this file.
00001 /*
00002  * MMX optimized DSP utils
00003  * Copyright (c) 2000, 2001 Fabrice Bellard.
00004  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
00005  *
00006  * This file is part of FFmpeg.
00007  *
00008  * FFmpeg is free software; you can redistribute it and/or
00009  * modify it under the terms of the GNU Lesser General Public
00010  * License as published by the Free Software Foundation; either
00011  * version 2.1 of the License, or (at your option) any later version.
00012  *
00013  * FFmpeg is distributed in the hope that it will be useful,
00014  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00015  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00016  * Lesser General Public License for more details.
00017  *
00018  * You should have received a copy of the GNU Lesser General Public
00019  * License along with FFmpeg; if not, write to the Free Software
00020  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
00021  *
00022  * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
00023  */
00024 
00025 #include "libavutil/x86_cpu.h"
00026 #include "libavcodec/dsputil.h"
00027 #include "libavcodec/h263.h"
00028 #include "libavcodec/mpegvideo.h"
00029 #include "libavcodec/simple_idct.h"
00030 #include "dsputil_mmx.h"
00031 #include "mmx.h"
00032 #include "vp3dsp_mmx.h"
00033 #include "vp3dsp_sse2.h"
00034 #include "idct_xvid.h"
00035 
00036 //#undef NDEBUG
00037 //#include <assert.h>
00038 
00039 int mm_flags; /* multimedia extension flags */
00040 
00041 /* pixel operations */
/* Packed constants used as memory operands by the MMX/SSE code.
 * As the initializers show:
 *   ff_bone         - 0x01 in every byte
 *   ff_wtwo         - 0x0002 in every 16-bit word
 *   ff_pw_N         - the decimal value N in every 16-bit word
 *   ff_pb_N / _XX   - the value (decimal N or hex XX) in every byte
 *   ff_pdw_80000000 - 0x80000000 in every 32-bit dword
 *   ff_pd_N         - the double N.0 in both halves of a 128-bit value
 * The 8-byte variants are declared 8-byte aligned, the 16-byte ones
 * 16-byte aligned. */
00042 DECLARE_ALIGNED_8 (const uint64_t, ff_bone) = 0x0101010101010101ULL;
00043 DECLARE_ALIGNED_8 (const uint64_t, ff_wtwo) = 0x0002000200020002ULL;
00044 
00045 DECLARE_ALIGNED_16(const uint64_t, ff_pdw_80000000[2]) =
00046 {0x8000000080000000ULL, 0x8000000080000000ULL};
00047 
00048 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_3  ) = 0x0003000300030003ULL;
00049 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_4  ) = 0x0004000400040004ULL;
00050 DECLARE_ALIGNED_16(const xmm_t,    ff_pw_5  ) = {0x0005000500050005ULL, 0x0005000500050005ULL};
00051 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_8  ) = 0x0008000800080008ULL;
00052 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_15 ) = 0x000F000F000F000FULL;
00053 DECLARE_ALIGNED_16(const xmm_t,    ff_pw_16 ) = {0x0010001000100010ULL, 0x0010001000100010ULL};
00054 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_20 ) = 0x0014001400140014ULL;
00055 DECLARE_ALIGNED_16(const xmm_t,    ff_pw_28 ) = {0x001C001C001C001CULL, 0x001C001C001C001CULL};
00056 DECLARE_ALIGNED_16(const xmm_t,    ff_pw_32 ) = {0x0020002000200020ULL, 0x0020002000200020ULL};
00057 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_42 ) = 0x002A002A002A002AULL;
00058 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_64 ) = 0x0040004000400040ULL;
00059 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_96 ) = 0x0060006000600060ULL;
00060 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_128) = 0x0080008000800080ULL;
00061 DECLARE_ALIGNED_8 (const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL;
00062 
00063 DECLARE_ALIGNED_8 (const uint64_t, ff_pb_1  ) = 0x0101010101010101ULL;
00064 DECLARE_ALIGNED_8 (const uint64_t, ff_pb_3  ) = 0x0303030303030303ULL;
00065 DECLARE_ALIGNED_8 (const uint64_t, ff_pb_7  ) = 0x0707070707070707ULL;
00066 DECLARE_ALIGNED_8 (const uint64_t, ff_pb_3F ) = 0x3F3F3F3F3F3F3F3FULL;
00067 DECLARE_ALIGNED_8 (const uint64_t, ff_pb_A1 ) = 0xA1A1A1A1A1A1A1A1ULL;
00068 DECLARE_ALIGNED_8 (const uint64_t, ff_pb_FC ) = 0xFCFCFCFCFCFCFCFCULL;
00069 
00070 DECLARE_ALIGNED_16(const double, ff_pd_1[2]) = { 1.0, 1.0 };
00071 DECLARE_ALIGNED_16(const double, ff_pd_2[2]) = { 2.0, 2.0 };
00072 
/* Standalone asm statement containing only ASMALIGN(3)
 * (presumably an assembler alignment directive -- ASMALIGN is defined elsewhere). */
00073 #define JUMPALIGN() asm volatile (ASMALIGN(3)::)
/* Clears MMX register regd: pxor of a register with itself yields zero. */
00074 #define MOVQ_ZERO(regd)  asm volatile ("pxor %%" #regd ", %%" #regd ::)
00075 
/* Sets every byte of regd to 0xFE: pcmpeqd of a register with itself sets all
 * bits, and the byte-wise self-addition then gives 0xFF + 0xFF = 0xFE. */
00076 #define MOVQ_BFE(regd) \
00077     asm volatile ( \
00078     "pcmpeqd %%" #regd ", %%" #regd " \n\t"\
00079     "paddb %%" #regd ", %%" #regd " \n\t" ::)
00080 
/* MOVQ_BONE(regd): regd = 0x01 in every byte.
 * MOVQ_WTWO(regd): regd = 0x0002 in every 16-bit word.
 * Without PIC the values are loaded from the ff_bone / ff_wtwo constants. */
00081 #ifndef PIC
00082 #define MOVQ_BONE(regd)  asm volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_bone))
00083 #define MOVQ_WTWO(regd)  asm volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_wtwo))
00084 #else
00085 // for shared library it's better to use this way for accessing constants
00086 // pcmpeqd -> -1
// all-ones, then each word >> 15 gives 0x0001, then packing the words to
// bytes gives 0x01 in every byte
00087 #define MOVQ_BONE(regd) \
00088     asm volatile ( \
00089     "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
00090     "psrlw $15, %%" #regd " \n\t" \
00091     "packuswb %%" #regd ", %%" #regd " \n\t" ::)
00092 
// all-ones, then each word >> 15 gives 0x0001, then << 1 gives 0x0002
00093 #define MOVQ_WTWO(regd) \
00094     asm volatile ( \
00095     "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
00096     "psrlw $15, %%" #regd " \n\t" \
00097     "psllw $1, %%" #regd " \n\t"::)
00098 
00099 #endif
00100 
00101 // using regr as temporary and for the output result
00102 // first argument is unmodified and second is trashed
00103 // regfe is supposed to contain 0xfefefefefefefefe
// Byte-wise average rounded down:
//   regr = (rega & regb) + (((rega ^ regb) & regfe) >> 1)
// The shift is a 64-bit psrlq, so the 0xFE mask is applied first to stop the
// low bit of each byte from leaking into the neighbouring byte.
00104 #define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
00105     "movq " #rega ", " #regr "  \n\t"\
00106     "pand " #regb ", " #regr "  \n\t"\
00107     "pxor " #rega ", " #regb "  \n\t"\
00108     "pand " #regfe "," #regb "  \n\t"\
00109     "psrlq $1, " #regb "        \n\t"\
00110     "paddb " #regb ", " #regr " \n\t"
00111 
// Byte-wise average rounded up:
//   regr = (rega | regb) - (((rega ^ regb) & regfe) >> 1)
// Same operand roles as PAVGB_MMX_NO_RND: rega is preserved, regb is trashed,
// regfe must hold 0xFE in every byte.
00112 #define PAVGB_MMX(rega, regb, regr, regfe) \
00113     "movq " #rega ", " #regr "  \n\t"\
00114     "por  " #regb ", " #regr "  \n\t"\
00115     "pxor " #rega ", " #regb "  \n\t"\
00116     "pand " #regfe "," #regb "  \n\t"\
00117     "psrlq $1, " #regb "        \n\t"\
00118     "psubb " #regb ", " #regr " \n\t"
00119 
00120 // mm6 is supposed to contain 0xfefefefefefefefe
// Two interleaved byte-wise averages, both rounded down:
//   regr = avg(rega, regb)   and   regp = avg(regc, regd)
// using the same formula as PAVGB_MMX_NO_RND. rega and regc are preserved,
// regb and regd are trashed.
00121 #define PAVGBP_MMX_NO_RND(rega, regb, regr,  regc, regd, regp) \
00122     "movq " #rega ", " #regr "  \n\t"\
00123     "movq " #regc ", " #regp "  \n\t"\
00124     "pand " #regb ", " #regr "  \n\t"\
00125     "pand " #regd ", " #regp "  \n\t"\
00126     "pxor " #rega ", " #regb "  \n\t"\
00127     "pxor " #regc ", " #regd "  \n\t"\
00128     "pand %%mm6, " #regb "      \n\t"\
00129     "pand %%mm6, " #regd "      \n\t"\
00130     "psrlq $1, " #regb "        \n\t"\
00131     "psrlq $1, " #regd "        \n\t"\
00132     "paddb " #regb ", " #regr " \n\t"\
00133     "paddb " #regd ", " #regp " \n\t"
00134 
// Two interleaved byte-wise averages, both rounded up (formula of PAVGB_MMX):
//   regr = avg(rega, regb)   and   regp = avg(regc, regd)
// mm6 must hold 0xFE in every byte; regb and regd are trashed.
00135 #define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
00136     "movq " #rega ", " #regr "  \n\t"\
00137     "movq " #regc ", " #regp "  \n\t"\
00138     "por  " #regb ", " #regr "  \n\t"\
00139     "por  " #regd ", " #regp "  \n\t"\
00140     "pxor " #rega ", " #regb "  \n\t"\
00141     "pxor " #regc ", " #regd "  \n\t"\
00142     "pand %%mm6, " #regb "      \n\t"\
00143     "pand %%mm6, " #regd "      \n\t"\
00144     "psrlq $1, " #regd "        \n\t"\
00145     "psrlq $1, " #regb "        \n\t"\
00146     "psubb " #regb ", " #regr " \n\t"\
00147     "psubb " #regd ", " #regp " \n\t"
00148 
00149 /***********************************/
00150 /* MMX no rounding */
/* First inclusion of dsputil_mmx_rnd.h: DEF(x, y) pastes together names of the
 * form x_no_rnd_y_mmx and the averaging macros map to the round-down variants.
 * NOTE(review): MOVQ_WONE is not defined in the visible part of this file;
 * presumably it is provided elsewhere -- confirm. */
00151 #define DEF(x, y) x ## _no_rnd_ ## y ##_mmx
00152 #define SET_RND  MOVQ_WONE
00153 #define PAVGBP(a, b, c, d, e, f)        PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
00154 #define PAVGB(a, b, c, e)               PAVGB_MMX_NO_RND(a, b, c, e)
00155 
00156 #include "dsputil_mmx_rnd.h"
00157 
00158 #undef DEF
00159 #undef SET_RND
00160 #undef PAVGBP
00161 #undef PAVGB
00162 /***********************************/
00163 /* MMX rounding */
00164 
/* Second inclusion of the same header: DEF(x, y) now produces x_y_mmx, SET_RND
 * loads 2 into every word (MOVQ_WTWO) and the averaging macros map to the
 * round-up variants. */
00165 #define DEF(x, y) x ## _ ## y ##_mmx
00166 #define SET_RND  MOVQ_WTWO
00167 #define PAVGBP(a, b, c, d, e, f)        PAVGBP_MMX(a, b, c, d, e, f)
00168 #define PAVGB(a, b, c, e)               PAVGB_MMX(a, b, c, e)
00169 
00170 #include "dsputil_mmx_rnd.h"
00171 
00172 #undef DEF
00173 #undef SET_RND
00174 #undef PAVGBP
00175 #undef PAVGB
00176 
00177 /***********************************/
00178 /* 3Dnow specific */
00179 
/* dsputil_mmx_avg.h is included twice. Here DEF(x) appends the _3dnow suffix
 * and PAVGB expands to the instruction mnemonic string "pavgusb". */
00180 #define DEF(x) x ## _3dnow
00181 #define PAVGB "pavgusb"
00182 
00183 #include "dsputil_mmx_avg.h"
00184 
00185 #undef DEF
00186 #undef PAVGB
00187 
00188 /***********************************/
00189 /* MMX2 specific */
00190 
/* Second inclusion: DEF(x) appends _mmx2 and PAVGB expands to "pavgb". */
00191 #define DEF(x) x ## _mmx2
00192 
00193 /* Introduced only in MMX2 set */
00194 #define PAVGB "pavgb"
00195 
00196 #include "dsputil_mmx_avg.h"
00197 
00198 #undef DEF
00199 #undef PAVGB
00200 
/* Name aliases: the no_rnd, mmx2 and 3dnow spellings of the plain (unshifted)
 * put_pixels functions all resolve to the put_pixels{4,8,16}_mmx copy routines
 * defined later in this file. Note that the *_no_rnd_*_mmx2 / _3dnow aliases
 * expand in two steps through the put_no_rnd_pixels*_mmx aliases. */
00201 #define put_no_rnd_pixels16_mmx put_pixels16_mmx
00202 #define put_no_rnd_pixels8_mmx put_pixels8_mmx
00203 #define put_pixels16_mmx2 put_pixels16_mmx
00204 #define put_pixels8_mmx2 put_pixels8_mmx
00205 #define put_pixels4_mmx2 put_pixels4_mmx
00206 #define put_no_rnd_pixels16_mmx2 put_no_rnd_pixels16_mmx
00207 #define put_no_rnd_pixels8_mmx2 put_no_rnd_pixels8_mmx
00208 #define put_pixels16_3dnow put_pixels16_mmx
00209 #define put_pixels8_3dnow put_pixels8_mmx
00210 #define put_pixels4_3dnow put_pixels4_mmx
00211 #define put_no_rnd_pixels16_3dnow put_no_rnd_pixels16_mmx
00212 #define put_no_rnd_pixels8_3dnow put_no_rnd_pixels8_mmx
00213 
00214 /***********************************/
00215 /* standard MMX */
00216 
/**
 * Store a block of coefficients as unsigned 8-bit pixels.
 *
 * Each of the two asm statements reads 64 bytes of coefficients, packs them
 * with packuswb (signed 16-bit words saturated to the unsigned byte range
 * 0..255) and writes four rows of 8 pixels; together they produce 8 rows.
 *
 * @param block     source coefficients (128 bytes are read in total)
 * @param pixels    destination of the first row
 * @param line_size distance in bytes between destination rows
 *
 * The coefficients are addressed through a register operand with explicit
 * displacements in both halves. Pasting a displacement in front of an "m"
 * operand (as in "8%3") only yields a valid address when the operand happens
 * to be printed as "(%reg)"; an operand such as "16(%esp)" would silently
 * become "816(%esp)".
 */
00217 void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
00218 {
00219     const DCTELEM *p;
00220     uint8_t *pix;
00221 
00222     /* read the pixels */
00223     p = block;
00224     pix = pixels;
00225     /* unrolled loop: rows 0..3 */
00226         asm volatile(
00227                 "movq   (%3), %%mm0             \n\t"
00228                 "movq   8(%3), %%mm1            \n\t"
00229                 "movq   16(%3), %%mm2           \n\t"
00230                 "movq   24(%3), %%mm3           \n\t"
00231                 "movq   32(%3), %%mm4           \n\t"
00232                 "movq   40(%3), %%mm5           \n\t"
00233                 "movq   48(%3), %%mm6           \n\t"
00234                 "movq   56(%3), %%mm7           \n\t"
00235                 "packuswb %%mm1, %%mm0          \n\t"
00236                 "packuswb %%mm3, %%mm2          \n\t"
00237                 "packuswb %%mm5, %%mm4          \n\t"
00238                 "packuswb %%mm7, %%mm6          \n\t"
00239                 "movq   %%mm0, (%0)             \n\t"
00240                 "movq   %%mm2, (%0, %1)         \n\t"
00241                 "movq   %%mm4, (%0, %1, 2)      \n\t"
00242                 "movq   %%mm6, (%0, %2)         \n\t"
00243                 ::"r" (pix), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3), "r"(p)
00244                 :"memory");
00245         pix += line_size*4;
00246         p += 32;
00247 
00248     // rows 4..7: same sequence as above, applied to the advanced
00249     // pix and p pointers; the coefficients are again addressed
00250     // through the "r" operand
00251     asm volatile(
00252             "movq       (%3), %%mm0             \n\t"
00253             "movq       8(%3), %%mm1            \n\t"
00254             "movq       16(%3), %%mm2           \n\t"
00255             "movq       24(%3), %%mm3           \n\t"
00256             "movq       32(%3), %%mm4           \n\t"
00257             "movq       40(%3), %%mm5           \n\t"
00258             "movq       48(%3), %%mm6           \n\t"
00259             "movq       56(%3), %%mm7           \n\t"
00260             "packuswb %%mm1, %%mm0              \n\t"
00261             "packuswb %%mm3, %%mm2              \n\t"
00262             "packuswb %%mm5, %%mm4              \n\t"
00263             "packuswb %%mm7, %%mm6              \n\t"
00264             "movq       %%mm0, (%0)             \n\t"
00265             "movq       %%mm2, (%0, %1)         \n\t"
00266             "movq       %%mm4, (%0, %1, 2)      \n\t"
00267             "movq       %%mm6, (%0, %2)         \n\t"
00268             ::"r" (pix), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3), "r"(p)
00269             :"memory");
00270 }
00271 
/* Eight bytes of 0x80: the bias added to every packed byte in
 * put_signed_pixels_clamped_mmx(). */
00272 static DECLARE_ALIGNED_8(const unsigned char, vector128[8]) =
00273   { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };
00274 
/**
 * Store signed coefficients as pixels with a +128 bias.
 *
 * For each of 8 rows, the coefficients at block[0..3] and block[4..7] are
 * packed into 8 bytes with packsswb, 0x80 is added to every byte (wrapping
 * byte add) and the result is written to the destination row.
 * The operations go through the *_m2r / *_r2r / *_r2m helpers, which are
 * defined outside this file (presumably thin wrappers emitting the
 * same-named MMX instructions -- confirm in mmx.h).
 *
 * @param block     source coefficients, 8 consumed per row
 * @param pixels    destination of the first row
 * @param line_size distance in bytes between destination rows
 */
00275 void put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
00276 {
00277     int i;
00278 
    /* mm1 keeps the 0x80 bias for the whole loop */
00279     movq_m2r(*vector128, mm1);
00280     for (i = 0; i < 8; i++) {
00281         movq_m2r(*(block), mm0);
00282         packsswb_m2r(*(block + 4), mm0);
00283         block += 8;
00284         paddb_r2r(mm1, mm0);
00285         movq_r2m(mm0, *pixels);
00286         pixels += line_size;
00287     }
00288 }
00289 
/**
 * Add a block of coefficients to existing pixels, saturating to 0..255.
 *
 * Each of the 4 loop passes handles two rows of 8 pixels: it loads 32 bytes
 * of coefficients, widens the 8 pixels of each row to 16-bit words (using the
 * zero in mm7), adds with signed saturation (paddsw), packs back to unsigned
 * bytes (packuswb) and stores the two rows.
 *
 * @param block     coefficients to add, 16 consumed per pass
 * @param pixels    first row of the pixels to update in place
 * @param line_size distance in bytes between rows
 *
 * NOTE(review): mm7 is zeroed by a separate asm statement (MOVQ_ZERO) before
 * the loop; the code relies on the compiler not touching MMX registers
 * between the asm statements.
 */
00290 void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
00291 {
00292     const DCTELEM *p;
00293     uint8_t *pix;
00294     int i;
00295 
00296     /* read the pixels */
00297     p = block;
00298     pix = pixels;
00299     MOVQ_ZERO(mm7);
00300     i = 4;
00301     do {
00302         asm volatile(
                /* mm0..mm3: 16 coefficients; mm4 / mm6: the two pixel rows */
00303                 "movq   (%2), %%mm0     \n\t"
00304                 "movq   8(%2), %%mm1    \n\t"
00305                 "movq   16(%2), %%mm2   \n\t"
00306                 "movq   24(%2), %%mm3   \n\t"
00307                 "movq   %0, %%mm4       \n\t"
00308                 "movq   %1, %%mm6       \n\t"
00309                 "movq   %%mm4, %%mm5    \n\t"
00310                 "punpcklbw %%mm7, %%mm4 \n\t"
00311                 "punpckhbw %%mm7, %%mm5 \n\t"
00312                 "paddsw %%mm4, %%mm0    \n\t"
00313                 "paddsw %%mm5, %%mm1    \n\t"
00314                 "movq   %%mm6, %%mm5    \n\t"
00315                 "punpcklbw %%mm7, %%mm6 \n\t"
00316                 "punpckhbw %%mm7, %%mm5 \n\t"
00317                 "paddsw %%mm6, %%mm2    \n\t"
00318                 "paddsw %%mm5, %%mm3    \n\t"
00319                 "packuswb %%mm1, %%mm0  \n\t"
00320                 "packuswb %%mm3, %%mm2  \n\t"
00321                 "movq   %%mm0, %0       \n\t"
00322                 "movq   %%mm2, %1       \n\t"
00323                 :"+m"(*pix), "+m"(*(pix+line_size))
00324                 :"r"(p)
00325                 :"memory");
00326         pix += line_size*2;
00327         p += 16;
00328     } while (--i);
00329 }
00330 
/**
 * Copy h rows of 4 bytes from pixels to block with 32-bit MMX moves.
 *
 * @param block     destination
 * @param pixels    source
 * @param line_size row stride in bytes, used for both buffers
 * @param h         number of rows; four rows are copied per loop pass and the
 *                  loop ends only when the counter reaches exactly zero
 *                  (subl $4 / jnz), so a positive multiple of 4 is expected
 */
00331 static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
00332 {
00333     asm volatile(
         /* REG_a = 2 * line_size: the step after each pair of rows */
00334          "lea (%3, %3), %%"REG_a"       \n\t"
00335          ASMALIGN(3)
00336          "1:                            \n\t"
00337          "movd (%1), %%mm0              \n\t"
00338          "movd (%1, %3), %%mm1          \n\t"
00339          "movd %%mm0, (%2)              \n\t"
00340          "movd %%mm1, (%2, %3)          \n\t"
00341          "add %%"REG_a", %1             \n\t"
00342          "add %%"REG_a", %2             \n\t"
00343          "movd (%1), %%mm0              \n\t"
00344          "movd (%1, %3), %%mm1          \n\t"
00345          "movd %%mm0, (%2)              \n\t"
00346          "movd %%mm1, (%2, %3)          \n\t"
00347          "add %%"REG_a", %1             \n\t"
00348          "add %%"REG_a", %2             \n\t"
00349          "subl $4, %0                   \n\t"
00350          "jnz 1b                        \n\t"
00351          : "+g"(h), "+r" (pixels),  "+r" (block)
00352          : "r"((x86_reg)line_size)
00353          : "%"REG_a, "memory"
00354         );
00355 }
00356 
/**
 * Copy h rows of 8 bytes from pixels to block with 64-bit MMX moves.
 *
 * @param block     destination
 * @param pixels    source
 * @param line_size row stride in bytes, used for both buffers
 * @param h         number of rows; four rows are copied per loop pass and the
 *                  loop ends only when the counter reaches exactly zero
 *                  (subl $4 / jnz), so a positive multiple of 4 is expected
 */
00357 static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
00358 {
00359     asm volatile(
         /* REG_a = 2 * line_size: the step after each pair of rows */
00360          "lea (%3, %3), %%"REG_a"       \n\t"
00361          ASMALIGN(3)
00362          "1:                            \n\t"
00363          "movq (%1), %%mm0              \n\t"
00364          "movq (%1, %3), %%mm1          \n\t"
00365          "movq %%mm0, (%2)              \n\t"
00366          "movq %%mm1, (%2, %3)          \n\t"
00367          "add %%"REG_a", %1             \n\t"
00368          "add %%"REG_a", %2             \n\t"
00369          "movq (%1), %%mm0              \n\t"
00370          "movq (%1, %3), %%mm1          \n\t"
00371          "movq %%mm0, (%2)              \n\t"
00372          "movq %%mm1, (%2, %3)          \n\t"
00373          "add %%"REG_a", %1             \n\t"
00374          "add %%"REG_a", %2             \n\t"
00375          "subl $4, %0                   \n\t"
00376          "jnz 1b                        \n\t"
00377          : "+g"(h), "+r" (pixels),  "+r" (block)
00378          : "r"((x86_reg)line_size)
00379          : "%"REG_a, "memory"
00380         );
00381 }
00382 
/**
 * Copy h rows of 16 bytes from pixels to block, two 64-bit MMX moves per row.
 *
 * @param block     destination
 * @param pixels    source
 * @param line_size row stride in bytes, used for both buffers
 * @param h         number of rows; four rows are copied per loop pass and the
 *                  loop ends only when the counter reaches exactly zero
 *                  (subl $4 / jnz), so a positive multiple of 4 is expected
 */
00383 static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
00384 {
00385     asm volatile(
         /* REG_a = 2 * line_size: the step after each pair of rows */
00386          "lea (%3, %3), %%"REG_a"       \n\t"
00387          ASMALIGN(3)
00388          "1:                            \n\t"
00389          "movq (%1), %%mm0              \n\t"
00390          "movq 8(%1), %%mm4             \n\t"
00391          "movq (%1, %3), %%mm1          \n\t"
00392          "movq 8(%1, %3), %%mm5         \n\t"
00393          "movq %%mm0, (%2)              \n\t"
00394          "movq %%mm4, 8(%2)             \n\t"
00395          "movq %%mm1, (%2, %3)          \n\t"
00396          "movq %%mm5, 8(%2, %3)         \n\t"
00397          "add %%"REG_a", %1             \n\t"
00398          "add %%"REG_a", %2             \n\t"
00399          "movq (%1), %%mm0              \n\t"
00400          "movq 8(%1), %%mm4             \n\t"
00401          "movq (%1, %3), %%mm1          \n\t"
00402          "movq 8(%1, %3), %%mm5         \n\t"
00403          "movq %%mm0, (%2)              \n\t"
00404          "movq %%mm4, 8(%2)             \n\t"
00405          "movq %%mm1, (%2, %3)          \n\t"
00406          "movq %%mm5, 8(%2, %3)         \n\t"
00407          "add %%"REG_a", %1             \n\t"
00408          "add %%"REG_a", %2             \n\t"
00409          "subl $4, %0                   \n\t"
00410          "jnz 1b                        \n\t"
00411          : "+g"(h), "+r" (pixels),  "+r" (block)
00412          : "r"((x86_reg)line_size)
00413          : "%"REG_a, "memory"
00414         );
00415 }
00416 
/**
 * Copy h rows of 16 bytes from pixels to block using SSE2.
 *
 * The source is read with movdqu (no alignment requirement) but the
 * destination is written with movdqa, so every destination row must be
 * 16-byte aligned.
 *
 * @param block     destination (16-byte aligned rows)
 * @param pixels    source
 * @param line_size row stride in bytes, used for both buffers
 * @param h         number of rows; four rows per pass, loop exits only when
 *                  the counter reaches exactly zero (subl $4 / jnz)
 */
00417 static void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
00418 {
00419     asm volatile(
00420          "1:                            \n\t"
00421          "movdqu (%1), %%xmm0           \n\t"
00422          "movdqu (%1,%3), %%xmm1        \n\t"
00423          "movdqu (%1,%3,2), %%xmm2      \n\t"
00424          "movdqu (%1,%4), %%xmm3        \n\t"
00425          "movdqa %%xmm0, (%2)           \n\t"
00426          "movdqa %%xmm1, (%2,%3)        \n\t"
00427          "movdqa %%xmm2, (%2,%3,2)      \n\t"
00428          "movdqa %%xmm3, (%2,%4)        \n\t"
         /* lea leaves the flags alone, so jnz still tests the result of subl */
00429          "subl $4, %0                   \n\t"
00430          "lea (%1,%3,4), %1             \n\t"
00431          "lea (%2,%3,4), %2             \n\t"
00432          "jnz 1b                        \n\t"
00433          : "+g"(h), "+r" (pixels),  "+r" (block)
00434          : "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size)
00435          : "memory"
00436         );
00437 }
00438 
/**
 * Average h rows of 16 bytes from pixels into block using SSE2.
 *
 * Each destination byte becomes the pavgb (rounded-up) average of itself and
 * the corresponding source byte. The source is read with movdqu; the
 * destination is used as a pavgb memory operand and written with movdqa, so
 * every destination row must be 16-byte aligned.
 *
 * @param block     destination, updated in place (16-byte aligned rows)
 * @param pixels    source
 * @param line_size row stride in bytes, used for both buffers
 * @param h         number of rows; four rows per pass, loop exits only when
 *                  the counter reaches exactly zero (subl $4 / jnz)
 */
00439 static void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
00440 {
00441     asm volatile(
00442          "1:                            \n\t"
00443          "movdqu (%1), %%xmm0           \n\t"
00444          "movdqu (%1,%3), %%xmm1        \n\t"
00445          "movdqu (%1,%3,2), %%xmm2      \n\t"
00446          "movdqu (%1,%4), %%xmm3        \n\t"
00447          "pavgb  (%2), %%xmm0           \n\t"
00448          "pavgb  (%2,%3), %%xmm1        \n\t"
00449          "pavgb  (%2,%3,2), %%xmm2      \n\t"
00450          "pavgb  (%2,%4), %%xmm3        \n\t"
00451          "movdqa %%xmm0, (%2)           \n\t"
00452          "movdqa %%xmm1, (%2,%3)        \n\t"
00453          "movdqa %%xmm2, (%2,%3,2)      \n\t"
00454          "movdqa %%xmm3, (%2,%4)        \n\t"
         /* lea leaves the flags alone, so jnz still tests the result of subl */
00455          "subl $4, %0                   \n\t"
00456          "lea (%1,%3,4), %1             \n\t"
00457          "lea (%2,%3,4), %2             \n\t"
00458          "jnz 1b                        \n\t"
00459          : "+g"(h), "+r" (pixels),  "+r" (block)
00460          : "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size)
00461          : "memory"
00462         );
00463 }
00464 
/**
 * Zero 128*6 = 768 bytes starting at blocks.
 *
 * The asm is handed a pointer to the end of the region and runs an index in
 * REG_a from -768 up to 0 in steps of 32, storing four zeroed quadwords per
 * pass; the loop continues while the index is still negative.
 *
 * @param blocks start of the region to clear (presumably six blocks of
 *               64 16-bit coefficients -- confirm against DCTELEM)
 *
 * The stores go through a register operand, so the asm declares a "memory"
 * clobber; without it the compiler is not told that *blocks is modified.
 */
00465 static void clear_blocks_mmx(DCTELEM *blocks)
00466 {
00467     asm volatile(
00468                 "pxor %%mm7, %%mm7              \n\t"
00469                 "mov $-128*6, %%"REG_a"         \n\t"
00470                 "1:                             \n\t"
00471                 "movq %%mm7, (%0, %%"REG_a")    \n\t"
00472                 "movq %%mm7, 8(%0, %%"REG_a")   \n\t"
00473                 "movq %%mm7, 16(%0, %%"REG_a")  \n\t"
00474                 "movq %%mm7, 24(%0, %%"REG_a")  \n\t"
00475                 "add $32, %%"REG_a"             \n\t"
00476                 " js 1b                         \n\t"
00477                 : : "r" (((uint8_t *)blocks)+128*6)
00478                 : "%"REG_a, "memory"
00479         );
00480 }
00481 
/**
 * dst[i] += src[i] for every i in [0, w), with wrapping byte arithmetic.
 *
 * The MMX loop handles 16 bytes per pass for as long as i < w - 15; the
 * remaining bytes are handled by the scalar loop that follows.
 *
 * @param dst buffer updated in place
 * @param src bytes to add
 * @param w   number of bytes
 *
 * The asm writes dst through a register operand, so it declares a "memory"
 * clobber to tell the compiler that memory contents change.
 */
00482 static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
00483     x86_reg i=0;
00484     asm volatile(
00485         "jmp 2f                         \n\t"
00486         "1:                             \n\t"
00487         "movq  (%1, %0), %%mm0          \n\t"
00488         "movq  (%2, %0), %%mm1          \n\t"
00489         "paddb %%mm0, %%mm1             \n\t"
00490         "movq %%mm1, (%2, %0)           \n\t"
00491         "movq 8(%1, %0), %%mm0          \n\t"
00492         "movq 8(%2, %0), %%mm1          \n\t"
00493         "paddb %%mm0, %%mm1             \n\t"
00494         "movq %%mm1, 8(%2, %0)          \n\t"
00495         "add $16, %0                    \n\t"
00496         "2:                             \n\t"
        /* continue while i - (w - 15) is negative */
00497         "cmp %3, %0                     \n\t"
00498         " js 1b                         \n\t"
00499         : "+r" (i)
00500         : "r"(src), "r"(dst), "r"((x86_reg)w-15) : "memory"
00501     );
    /* scalar tail: fewer than 16 bytes left */
00502     for(; i<w; i++)
00503         dst[i+0] += src[i+0];
00504 }
00505 
/**
 * dst[i] = src1[i] + src2[i] for every i in [0, w), with wrapping byte
 * arithmetic.
 *
 * The MMX loop handles 16 bytes per pass for as long as i < w - 15; the
 * remaining bytes are handled by the scalar loop that follows.
 *
 * @param dst  destination buffer
 * @param src1 first addend
 * @param src2 second addend
 * @param w    number of bytes
 *
 * The asm writes dst through a register operand, so it declares a "memory"
 * clobber to tell the compiler that memory contents change.
 */
00506 static void add_bytes_l2_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
00507     x86_reg i=0;
00508     asm volatile(
00509         "jmp 2f                         \n\t"
00510         "1:                             \n\t"
00511         "movq   (%2, %0), %%mm0         \n\t"
00512         "movq  8(%2, %0), %%mm1         \n\t"
00513         "paddb  (%3, %0), %%mm0         \n\t"
00514         "paddb 8(%3, %0), %%mm1         \n\t"
00515         "movq %%mm0,  (%1, %0)          \n\t"
00516         "movq %%mm1, 8(%1, %0)          \n\t"
00517         "add $16, %0                    \n\t"
00518         "2:                             \n\t"
        /* continue while i - (w - 15) is negative */
00519         "cmp %4, %0                     \n\t"
00520         " js 1b                         \n\t"
00521         : "+r" (i)
00522         : "r"(dst), "r"(src1), "r"(src2), "r"((x86_reg)w-15) : "memory"
00523     );
    /* scalar tail: fewer than 16 bytes left */
00524     for(; i<w; i++)
00525         dst[i] = src1[i] + src2[i];
00526 }
00527 
/* Shared instruction sequence of the H.263 loop filters below.
 *
 * Operands expected from the enclosing asm statement:
 *   %0..%3  four 8-byte memory operands, the four lines across the edge
 *   %4      2*strength; its low byte is replicated into all 8 bytes of mm2
 *           by the three punpcklbw steps
 *   %5      a memory operand holding 0xFC in every byte (ff_pb_FC)
 *
 * The sequence starts by forming (%0 - %3) + 4*(%2 - %1) in 16-bit lanes,
 * takes the absolute value and shifts it right by 3; the remaining
 * instructions derive the per-byte corrections from that value and the
 * replicated strength.
 *
 * Results are left in registers and are NOT stored by the macro:
 *   mm5 = new value for %0,  mm3 = new value for %1,
 *   mm4 = new value for %2,  mm6 = new value for %3.
 * All of mm0..mm7 are overwritten. */
00528 #define H263_LOOP_FILTER \
00529         "pxor %%mm7, %%mm7              \n\t"\
00530         "movq  %0, %%mm0                \n\t"\
00531         "movq  %0, %%mm1                \n\t"\
00532         "movq  %3, %%mm2                \n\t"\
00533         "movq  %3, %%mm3                \n\t"\
00534         "punpcklbw %%mm7, %%mm0         \n\t"\
00535         "punpckhbw %%mm7, %%mm1         \n\t"\
00536         "punpcklbw %%mm7, %%mm2         \n\t"\
00537         "punpckhbw %%mm7, %%mm3         \n\t"\
00538         "psubw %%mm2, %%mm0             \n\t"\
00539         "psubw %%mm3, %%mm1             \n\t"\
00540         "movq  %1, %%mm2                \n\t"\
00541         "movq  %1, %%mm3                \n\t"\
00542         "movq  %2, %%mm4                \n\t"\
00543         "movq  %2, %%mm5                \n\t"\
00544         "punpcklbw %%mm7, %%mm2         \n\t"\
00545         "punpckhbw %%mm7, %%mm3         \n\t"\
00546         "punpcklbw %%mm7, %%mm4         \n\t"\
00547         "punpckhbw %%mm7, %%mm5         \n\t"\
00548         "psubw %%mm2, %%mm4             \n\t"\
00549         "psubw %%mm3, %%mm5             \n\t"\
00550         "psllw $2, %%mm4                \n\t"\
00551         "psllw $2, %%mm5                \n\t"\
00552         "paddw %%mm0, %%mm4             \n\t"\
00553         "paddw %%mm1, %%mm5             \n\t"\
00554         "pxor %%mm6, %%mm6              \n\t"\
00555         "pcmpgtw %%mm4, %%mm6           \n\t"\
00556         "pcmpgtw %%mm5, %%mm7           \n\t"\
00557         "pxor %%mm6, %%mm4              \n\t"\
00558         "pxor %%mm7, %%mm5              \n\t"\
00559         "psubw %%mm6, %%mm4             \n\t"\
00560         "psubw %%mm7, %%mm5             \n\t"\
00561         "psrlw $3, %%mm4                \n\t"\
00562         "psrlw $3, %%mm5                \n\t"\
00563         "packuswb %%mm5, %%mm4          \n\t"\
00564         "packsswb %%mm7, %%mm6          \n\t"\
00565         "pxor %%mm7, %%mm7              \n\t"\
00566         "movd %4, %%mm2                 \n\t"\
00567         "punpcklbw %%mm2, %%mm2         \n\t"\
00568         "punpcklbw %%mm2, %%mm2         \n\t"\
00569         "punpcklbw %%mm2, %%mm2         \n\t"\
00570         "psubusb %%mm4, %%mm2           \n\t"\
00571         "movq %%mm2, %%mm3              \n\t"\
00572         "psubusb %%mm4, %%mm3           \n\t"\
00573         "psubb %%mm3, %%mm2             \n\t"\
00574         "movq %1, %%mm3                 \n\t"\
00575         "movq %2, %%mm4                 \n\t"\
00576         "pxor %%mm6, %%mm3              \n\t"\
00577         "pxor %%mm6, %%mm4              \n\t"\
00578         "paddusb %%mm2, %%mm3           \n\t"\
00579         "psubusb %%mm2, %%mm4           \n\t"\
00580         "pxor %%mm6, %%mm3              \n\t"\
00581         "pxor %%mm6, %%mm4              \n\t"\
00582         "paddusb %%mm2, %%mm2           \n\t"\
00583         "packsswb %%mm1, %%mm0          \n\t"\
00584         "pcmpgtb %%mm0, %%mm7           \n\t"\
00585         "pxor %%mm7, %%mm0              \n\t"\
00586         "psubb %%mm7, %%mm0             \n\t"\
00587         "movq %%mm0, %%mm1              \n\t"\
00588         "psubusb %%mm2, %%mm0           \n\t"\
00589         "psubb %%mm0, %%mm1             \n\t"\
00590         "pand %5, %%mm1                 \n\t"\
00591         "psrlw $2, %%mm1                \n\t"\
00592         "pxor %%mm7, %%mm1              \n\t"\
00593         "psubb %%mm7, %%mm1             \n\t"\
00594         "movq %0, %%mm5                 \n\t"\
00595         "movq %3, %%mm6                 \n\t"\
00596         "psubb %%mm1, %%mm5             \n\t"\
00597         "paddb %%mm1, %%mm6             \n\t"
00598 
/**
 * H.263 loop filter across a horizontal edge, 8 pixels wide.
 *
 * Runs H263_LOOP_FILTER on the four rows src-2*stride .. src+1*stride
 * (8 bytes each) and writes the four filtered rows back in place.
 * The body is only active when ENABLE_ANY_H263 is non-zero.
 *
 * @param src    first pixel of the row just below the edge
 * @param stride distance in bytes between rows
 * @param qscale index into ff_h263_loop_filter_strength (not range-checked
 *               here)
 */
00599 static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){
00600     if(ENABLE_ANY_H263) {
00601     const int strength= ff_h263_loop_filter_strength[qscale];
00602 
00603     asm volatile(
00604 
00605         H263_LOOP_FILTER
00606 
        /* the macro leaves the filtered rows in mm5, mm3, mm4, mm6 */
00607         "movq %%mm3, %1                 \n\t"
00608         "movq %%mm4, %2                 \n\t"
00609         "movq %%mm5, %0                 \n\t"
00610         "movq %%mm6, %3                 \n\t"
00611         : "+m" (*(uint64_t*)(src - 2*stride)),
00612           "+m" (*(uint64_t*)(src - 1*stride)),
00613           "+m" (*(uint64_t*)(src + 0*stride)),
00614           "+m" (*(uint64_t*)(src + 1*stride))
00615         : "g" (2*strength), "m"(ff_pb_FC)
00616     );
00617     }
00618 }
00619 
/**
 * Transpose a 4x4 block of bytes.
 *
 * Reads 4 bytes from each of four source rows and writes 4 bytes to each of
 * four destination rows, so that destination row k holds byte k of source
 * rows 0..3.
 *
 * @param dst        destination of the first output row
 * @param src        first input row
 * @param dst_stride distance in bytes between destination rows
 * @param src_stride distance in bytes between source rows
 */
00620 static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){
00621     asm volatile( //FIXME could save 1 instruction if done as 8x4 ...
00622         "movd  %4, %%mm0                \n\t"
00623         "movd  %5, %%mm1                \n\t"
00624         "movd  %6, %%mm2                \n\t"
00625         "movd  %7, %%mm3                \n\t"
        /* interleave rows 0/1 and 2/3 byte-wise, then the pairs word-wise */
00626         "punpcklbw %%mm1, %%mm0         \n\t"
00627         "punpcklbw %%mm3, %%mm2         \n\t"
00628         "movq %%mm0, %%mm1              \n\t"
00629         "punpcklwd %%mm2, %%mm0         \n\t"
00630         "punpckhwd %%mm2, %%mm1         \n\t"
        /* mm0 = output rows 0 and 1, mm1 = output rows 2 and 3 */
00631         "movd  %%mm0, %0                \n\t"
00632         "punpckhdq %%mm0, %%mm0         \n\t"
00633         "movd  %%mm0, %1                \n\t"
00634         "movd  %%mm1, %2                \n\t"
00635         "punpckhdq %%mm1, %%mm1         \n\t"
00636         "movd  %%mm1, %3                \n\t"
00637 
00638         : "=m" (*(uint32_t*)(dst + 0*dst_stride)),
00639           "=m" (*(uint32_t*)(dst + 1*dst_stride)),
00640           "=m" (*(uint32_t*)(dst + 2*dst_stride)),
00641           "=m" (*(uint32_t*)(dst + 3*dst_stride))
00642         :  "m" (*(uint32_t*)(src + 0*src_stride)),
00643            "m" (*(uint32_t*)(src + 1*src_stride)),
00644            "m" (*(uint32_t*)(src + 2*src_stride)),
00645            "m" (*(uint32_t*)(src + 3*src_stride))
00646     );
00647 }
00648 
/**
 * H.263 loop filter across a vertical edge, 8 pixels high.
 *
 * The four pixels around the edge (src-2 .. src+1) of 8 consecutive rows are
 * transposed into a local 4 x 8-byte buffer with two transpose4x4() calls,
 * H263_LOOP_FILTER is run on that buffer, and the last asm statement
 * transposes the filtered registers back and stores 4 bytes to each of the
 * 8 rows. The body is only active when ENABLE_ANY_H263 is non-zero.
 *
 * @param src    pixel just right of the edge in the first row
 * @param stride distance in bytes between rows
 * @param qscale index into ff_h263_loop_filter_strength (not range-checked
 *               here)
 *
 * The final asm stores to src through register operands, so it declares a
 * "memory" clobber to tell the compiler that the picture is modified.
 *
 * NOTE(review): the filtered values are passed from the second asm statement
 * to the third in mm3..mm6; this relies on the compiler not touching MMX
 * registers between the two statements.
 */
00649 static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){
00650     if(ENABLE_ANY_H263) {
00651     const int strength= ff_h263_loop_filter_strength[qscale];
00652     DECLARE_ALIGNED(8, uint64_t, temp[4]);
00653     uint8_t *btemp= (uint8_t*)temp;
00654 
00655     src -= 2;
00656 
    /* rows 0..3 fill the low half of each temp[k], rows 4..7 the high half */
00657     transpose4x4(btemp  , src           , 8, stride);
00658     transpose4x4(btemp+4, src + 4*stride, 8, stride);
00659     asm volatile(
00660         H263_LOOP_FILTER // 5 3 4 6
00661 
00662         : "+m" (temp[0]),
00663           "+m" (temp[1]),
00664           "+m" (temp[2]),
00665           "+m" (temp[3])
00666         : "g" (2*strength), "m"(ff_pb_FC)
00667     );
00668 
00669     asm volatile(
00670         "movq %%mm5, %%mm1              \n\t"
00671         "movq %%mm4, %%mm0              \n\t"
00672         "punpcklbw %%mm3, %%mm5         \n\t"
00673         "punpcklbw %%mm6, %%mm4         \n\t"
00674         "punpckhbw %%mm3, %%mm1         \n\t"
00675         "punpckhbw %%mm6, %%mm0         \n\t"
00676         "movq %%mm5, %%mm3              \n\t"
00677         "movq %%mm1, %%mm6              \n\t"
00678         "punpcklwd %%mm4, %%mm5         \n\t"
00679         "punpcklwd %%mm0, %%mm1         \n\t"
00680         "punpckhwd %%mm4, %%mm3         \n\t"
00681         "punpckhwd %%mm0, %%mm6         \n\t"
        /* %0 addresses rows 0..3, %1 rows 4..7 */
00682         "movd %%mm5, (%0)               \n\t"
00683         "punpckhdq %%mm5, %%mm5         \n\t"
00684         "movd %%mm5, (%0,%2)            \n\t"
00685         "movd %%mm3, (%0,%2,2)          \n\t"
00686         "punpckhdq %%mm3, %%mm3         \n\t"
00687         "movd %%mm3, (%0,%3)            \n\t"
00688         "movd %%mm1, (%1)               \n\t"
00689         "punpckhdq %%mm1, %%mm1         \n\t"
00690         "movd %%mm1, (%1,%2)            \n\t"
00691         "movd %%mm6, (%1,%2,2)          \n\t"
00692         "punpckhdq %%mm6, %%mm6         \n\t"
00693         "movd %%mm6, (%1,%3)            \n\t"
00694         :: "r" (src),
00695            "r" (src + 4*stride),
00696            "r" ((x86_reg)   stride ),
00697            "r" ((x86_reg)(3*stride)) : "memory"
00698     );
00699     }
00700 }
00701 
00702 /* Draw the edges of width 'w' of an image of size width, height.
00703    This MMX version can only handle w==8 || w==16; every w other than 8
         takes the 16-byte branch below.

         buf    - address of the first image byte
         wrap   - byte distance between the starts of consecutive rows
         width  - number of image bytes per row
         height - number of image rows
         w      - border size, in bytes (left/right) and rows (top/bottom)

         Each asm statement stores through addresses derived from its
         operands without naming that memory as an output, so each one
         declares a "memory" clobber to keep the compiler from caching or
         reordering accesses to the picture around it.
         Every asm loop tests its end condition after the body, so the body
         always runs at least once. */
00704 static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w)
00705 {
00706     uint8_t *ptr, *last_line;
00707     int i;
00708 
00709     last_line = buf + (height - 1) * wrap;
00710     /* left and right: one row per iteration, from buf up to buf + wrap*height */
00711     ptr = buf;
00712     if(w==8)
00713     {
00714         asm volatile(
00715                 "1:                             \n\t"
00716                 "movd (%0), %%mm0               \n\t" /* first 4 bytes of the row */
00717                 "punpcklbw %%mm0, %%mm0         \n\t" /* the three low unpacks ... */
00718                 "punpcklwd %%mm0, %%mm0         \n\t"
00719                 "punpckldq %%mm0, %%mm0         \n\t" /* ... copy byte 0 into all 8 bytes */
00720                 "movq %%mm0, -8(%0)             \n\t" /* 8 bytes left of the row */
00721                 "movq -8(%0, %2), %%mm1         \n\t" /* last 8 bytes of the row */
00722                 "punpckhbw %%mm1, %%mm1         \n\t" /* the three high unpacks ... */
00723                 "punpckhwd %%mm1, %%mm1         \n\t"
00724                 "punpckhdq %%mm1, %%mm1         \n\t" /* ... copy the last byte into all 8 bytes */
00725                 "movq %%mm1, (%0, %2)           \n\t" /* 8 bytes right of the row */
00726                 "add %1, %0                     \n\t" /* next row */
00727                 "cmp %3, %0                     \n\t"
00728                 " jb 1b                         \n\t" /* while ptr < buf + wrap*height */
00729                 : "+r" (ptr)
00730                 : "r" ((x86_reg)wrap), "r" ((x86_reg)width), "r" (ptr + wrap*height)
                      : "memory"
00731         );
00732     }
00733     else
00734     {
              /* same as above, but each replicated byte is stored twice (16 bytes per side) */
00735         asm volatile(
00736                 "1:                             \n\t"
00737                 "movd (%0), %%mm0               \n\t"
00738                 "punpcklbw %%mm0, %%mm0         \n\t"
00739                 "punpcklwd %%mm0, %%mm0         \n\t"
00740                 "punpckldq %%mm0, %%mm0         \n\t"
00741                 "movq %%mm0, -8(%0)             \n\t"
00742                 "movq %%mm0, -16(%0)            \n\t"
00743                 "movq -8(%0, %2), %%mm1         \n\t"
00744                 "punpckhbw %%mm1, %%mm1         \n\t"
00745                 "punpckhwd %%mm1, %%mm1         \n\t"
00746                 "punpckhdq %%mm1, %%mm1         \n\t"
00747                 "movq %%mm1, (%0, %2)           \n\t"
00748                 "movq %%mm1, 8(%0, %2)          \n\t"
00749                 "add %1, %0                     \n\t"
00750                 "cmp %3, %0                     \n\t"
00751                 " jb 1b                         \n\t"
00752                 : "+r" (ptr)
00753                 : "r" ((x86_reg)wrap), "r" ((x86_reg)width), "r" (ptr + wrap*height)
                      : "memory"
00754         );
00755     }
00756 
00757     for(i=0;i<w;i+=4) {
00758         /* top and bottom (and hopefully also the corners):
                 each pass of this loop fills 4 rows above and 4 rows below the
                 image, copying width + 2*w bytes per row in 8-byte steps. The
                 copy starts w bytes left of the image, so it also picks up the
                 left/right borders written above. */
00759         ptr= buf - (i + 1) * wrap - w; /* row i+1 above the image */
00760         asm volatile(
00761                 "1:                             \n\t"
00762                 "movq (%1, %0), %%mm0           \n\t" /* %1 is (i+1)*wrap: read from the first image row */
00763                 "movq %%mm0, (%0)               \n\t" /* row i+1 above */
00764                 "movq %%mm0, (%0, %2)           \n\t" /* row i+2 above (%2 is -wrap) */
00765                 "movq %%mm0, (%0, %2, 2)        \n\t" /* row i+3 above */
00766                 "movq %%mm0, (%0, %3)           \n\t" /* row i+4 above (%3 is -3*wrap) */
00767                 "add $8, %0                     \n\t"
00768                 "cmp %4, %0                     \n\t"
00769                 " jb 1b                         \n\t"
00770                 : "+r" (ptr)
00771                 : "r" ((x86_reg)buf - (x86_reg)ptr - w), "r" ((x86_reg)-wrap), "r" ((x86_reg)-wrap*3), "r" (ptr+width+2*w)
                      : "memory"
00772         );
00773         ptr= last_line + (i + 1) * wrap - w; /* row i+1 below the image */
00774         asm volatile(
00775                 "1:                             \n\t"
00776                 "movq (%1, %0), %%mm0           \n\t" /* %1 is -(i+1)*wrap: read from the last image row */
00777                 "movq %%mm0, (%0)               \n\t" /* row i+1 below */
00778                 "movq %%mm0, (%0, %2)           \n\t" /* row i+2 below (%2 is wrap) */
00779                 "movq %%mm0, (%0, %2, 2)        \n\t" /* row i+3 below */
00780                 "movq %%mm0, (%0, %3)           \n\t" /* row i+4 below (%3 is 3*wrap) */
00781                 "add $8, %0                     \n\t"
00782                 "cmp %4, %0                     \n\t"
00783                 " jb 1b                         \n\t"
00784                 : "+r" (ptr)
00785                 : "r" ((x86_reg)last_line - (x86_reg)ptr - w), "r" ((x86_reg)wrap), "r" ((x86_reg)wrap*3), "r" (ptr+width+2*w)
                      : "memory"
00786         );
00787     }
00788 }
00789 
/* PAETH(cpu, abs3): defines add_png_paeth_prediction_<cpu>().
 *
 * abs3 is an asm fragment that must replace mm3, mm4 and mm5 (four signed
 * 16-bit words each) by their absolute values and leave mm7 zero.
 *
 * The generated function walks i from 0 in steps of bpp while i <= w-3.
 * Each step works on four bytes widened to words:
 *     a = dst[i-bpp]   (the previous result, kept in mm0)
 *     b = top[i]
 *     c = top[i-bpp]
 * It picks a when |c-b| <= min(|c-a|, |2c-a-b|), otherwise b when
 * |c-a| <= |2c-a-b|, otherwise c (the PNG Paeth choice), adds src[i],
 * keeps the low 8 bits and stores four bytes at dst+i.
 *
 * The first loads read 4 bytes at dst-bpp and top-bpp.
 * NOTE(review): every step loads and stores 4 bytes even though it advances
 * by bpp, so with i == w-3 the store reaches dst[w]; callers presumably
 * size w/buffers for that - confirm.
 */
00790 #define PAETH(cpu, abs3)\
00791 void add_png_paeth_prediction_##cpu(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp)\
00792 {\
00793     x86_reg i = -bpp; /* start bpp bytes before the buffers to fetch a and c */\
00794     x86_reg end = w-3; /* last start offset for a 4-byte step */\
00795     asm volatile(\
00796         "pxor      %%mm7, %%mm7 \n" /* mm7 = 0, used to widen bytes to words */\
00797         "movd    (%1,%0), %%mm0 \n" /* mm0 = dst[-bpp ...] */\
00798         "movd    (%2,%0), %%mm1 \n" /* mm1 = top[-bpp ...] */\
00799         "punpcklbw %%mm7, %%mm0 \n"\
00800         "punpcklbw %%mm7, %%mm1 \n"\
00801         "add       %4, %0 \n" /* i = 0 */\
00802         "1: \n"\
00803         "movq      %%mm1, %%mm2 \n" /* mm2 = c (top value from the previous step) */\
00804         "movd    (%2,%0), %%mm1 \n" /* mm1 = b = top[i] */\
00805         "movq      %%mm2, %%mm3 \n"\
00806         "punpcklbw %%mm7, %%mm1 \n"\
00807         "movq      %%mm2, %%mm4 \n"\
00808         "psubw     %%mm1, %%mm3 \n" /* mm3 = c - b */\
00809         "psubw     %%mm0, %%mm4 \n" /* mm4 = c - a */\
00810         "movq      %%mm3, %%mm5 \n"\
00811         "paddw     %%mm4, %%mm5 \n" /* mm5 = 2c - a - b */\
00812         abs3 /* mm3, mm4, mm5 = absolute values */\
00813         "movq      %%mm4, %%mm6 \n"\
00814         "pminsw    %%mm5, %%mm6 \n" /* mm6 = min(mm4, mm5) */\
00815         "pcmpgtw   %%mm6, %%mm3 \n" /* mm3 = mask: a is NOT chosen */\
00816         "pcmpgtw   %%mm5, %%mm4 \n" /* mm4 = mask: mm4 > mm5 */\
00817         "movq      %%mm4, %%mm6 \n"\
00818         "pand      %%mm3, %%mm4 \n" /* mm4 = mask: choose c */\
00819         "pandn     %%mm3, %%mm6 \n" /* mm6 = mask: choose b */\
00820         "pandn     %%mm0, %%mm3 \n" /* mm3 = a where a is chosen, else 0 */\
00821         "movd    (%3,%0), %%mm0 \n" /* mm0 = src[i] */\
00822         "pand      %%mm1, %%mm6 \n" /* mm6 = b where b is chosen, else 0 */\
00823         "pand      %%mm4, %%mm2 \n" /* mm2 = c where c is chosen, else 0 */\
00824         "punpcklbw %%mm7, %%mm0 \n"\
00825         "movq      %6,    %%mm5 \n" /* mm5 = ff_pw_255 */\
00826         "paddw     %%mm6, %%mm0 \n"\
00827         "paddw     %%mm2, %%mm3 \n"\
00828         "paddw     %%mm3, %%mm0 \n" /* mm0 = src[i] + chosen predictor */\
00829         "pand      %%mm5, %%mm0 \n" /* mask each word with ff_pw_255; mm0 is the next step's a */\
00830         "movq      %%mm0, %%mm3 \n"\
00831         "packuswb  %%mm3, %%mm3 \n"\
00832         "movd      %%mm3, (%1,%0) \n" /* store 4 bytes at dst+i */\
00833         "add       %4, %0 \n" /* i += bpp */\
00834         "cmp       %5, %0 \n"\
00835         "jle 1b \n" /* signed compare: loop while i <= end */\
00836         :"+r"(i)\
00837         :"r"(dst), "r"(top), "r"(src), "r"((x86_reg)bpp), "g"(end),\
00838          "m"(ff_pw_255)\
00839         :"memory"\
00840     );\
00841 }
00842 
/* abs3 fragment for PAETH built from psubw/pmaxsw: replaces mm3, mm4 and mm5
 * by max(x, 0 - x) per signed word. Expects mm7 == 0 on entry, uses mm6 and
 * mm7 as scratch and leaves mm7 == 0 on exit. */
00843 #define ABS3_MMX2\
00844         "psubw     %%mm5, %%mm7 \n" /* mm7 = 0 - mm5 */\
00845         "pmaxsw    %%mm7, %%mm5 \n" /* mm5 = max(mm5, -mm5) */\
00846         "pxor      %%mm6, %%mm6 \n" /* mm6 = 0 */\
00847         "pxor      %%mm7, %%mm7 \n" /* mm7 = 0 */\
00848         "psubw     %%mm3, %%mm6 \n" /* mm6 = 0 - mm3 */\
00849         "psubw     %%mm4, %%mm7 \n" /* mm7 = 0 - mm4 */\
00850         "pmaxsw    %%mm6, %%mm3 \n" /* mm3 = max(mm3, -mm3) */\
00851         "pmaxsw    %%mm7, %%mm4 \n" /* mm4 = max(mm4, -mm4) */\
00852         "pxor      %%mm7, %%mm7 \n" /* restore mm7 = 0 for the caller */
00853 
/* abs3 fragment for PAETH using pabsw: replaces mm3, mm4 and mm5 by their
 * per-word absolute values in place; no other register is touched. */
00854 #define ABS3_SSSE3\
00855         "pabsw     %%mm3, %%mm3 \n"\
00856         "pabsw     %%mm4, %%mm4 \n"\
00857         "pabsw     %%mm5, %%mm5 \n"
00858 
/* Instantiate add_png_paeth_prediction_mmx2() and, only when HAVE_SSSE3 is
 * defined, add_png_paeth_prediction_ssse3(). */
00859 PAETH(mmx2, ABS3_MMX2)
00860 #ifdef HAVE_SSSE3
00861 PAETH(ssse3, ABS3_SSSE3)
00862 #endif
00863 
/* QPEL_V_LOW: asm fragment producing one group of filtered output words.
 *
 * With x1 = m3 + m4, x2 = in2 + m5, x3 = in1 + m6 and x4 = in0 + in7 it
 * computes (20*x1 - 6*x2 + 3*x3 - x4 + rnd) >> 5 (arithmetic shift) in mm5,
 * packs it to unsigned bytes with saturation and hands it to
 * OP(%%mm5, out, %%mm7, d).
 *
 * m3..m6         - register operands; m3 is overwritten (first with x1,
 *                  then with in7)
 * pw_20, pw_3    - not referenced by the body; the constants are taken from
 *                  MANGLE(ff_pw_20) and MANGLE(ff_pw_3) directly
 * rnd            - operand added as the rounding term
 * in0,in1,in2,in7 - source operands
 * out, OP        - destination operand and the store macro supplied by the user
 *
 * mm4, mm5 and mm6 are overwritten as scratch.
 * NOTE(review): m3, m5 and m6 are read after mm4/mm5/mm6 have been reloaded,
 * so they presumably must not name those registers - confirm against the
 * call sites.
 */
00864 #define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\
00865         "paddw " #m4 ", " #m3 "           \n\t" /* x1 */\
00866         "movq "MANGLE(ff_pw_20)", %%mm4   \n\t" /* 20 */\
00867         "pmullw " #m3 ", %%mm4            \n\t" /* 20x1 */\
00868         "movq "#in7", " #m3 "             \n\t" /* d */\
00869         "movq "#in0", %%mm5               \n\t" /* D */\
00870         "paddw " #m3 ", %%mm5             \n\t" /* x4 */\
00871         "psubw %%mm5, %%mm4               \n\t" /* 20x1 - x4 */\
00872         "movq "#in1", %%mm5               \n\t" /* C */\
00873         "movq "#in2", %%mm6               \n\t" /* B */\
00874         "paddw " #m6 ", %%mm5             \n\t" /* x3 */\
00875         "paddw " #m5 ", %%mm6             \n\t" /* x2 */\
00876         "paddw %%mm6, %%mm6               \n\t" /* 2x2 */\
00877         "psubw %%mm6, %%mm5               \n\t" /* -2x2 + x3 */\
00878         "pmullw "MANGLE(ff_pw_3)", %%mm5  \n\t" /* -6x2 + 3x3 */\
00879         "paddw " #rnd ", %%mm4            \n\t" /* 20x1 - x4 + rnd */\
00880         "paddw %%mm4, %%mm5               \n\t" /* 20x1 - 6x2 + 3x3 - x4 (+ rnd) */\
00881         "psraw $5, %%mm5                  \n\t" /* arithmetic shift right by 5 */\
00882         "packuswb %%mm5, %%mm5            \n\t" /* saturate words to unsigned bytes */\
00883         OP(%%mm5, out, %%mm7, d)
00884 
00885 #define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\
00886 static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
00887     uint64_t temp;\
00888 \
00889     asm volatile(\
00890         "pxor %%mm7, %%mm7                \n\t"\
00891         "1:                               \n\t"\
00892         "movq  (%0), %%mm0                \n\t" /* ABCDEFGH */\
00893         "movq %%mm0, %%mm1                \n\t" /* ABCDEFGH */\
00894         "movq %%mm0, %%mm2                \n\t" /* ABCDEFGH */\
00895         "punpcklbw %%mm7, %%mm0           \n\t" /* 0A0B0C0D */\
00896         "punpckhbw %%mm7, %%mm1           \n\t" /* 0E0F0G0H */\
00897         "pshufw $0x90, %%mm0, %%mm5       \n\t" /* 0A0A0B0C */\
00898         "pshufw $0x41, %%mm0, %%mm6       \n\t" /* 0B0A0A0B */\
00899         "movq %%mm2, %%mm3                \n\t" /* ABCDEFGH */\
00900         "movq %%mm2, %%mm4                \n\t" /* ABCDEFGH */\
00901         "psllq $8, %%mm2                  \n\t" /* 0ABCDEFG */\
00902         "psllq $16, %%mm3                 \n\t" /* 00ABCDEF */\
00903         "psllq $24, %%mm4                 \n\t" /* 000ABCDE */\
00904         "punpckhbw %%mm7, %%mm2           \n\t" /* 0D0E0F0G */\
00905         "punpckhbw %%mm7, %%mm3           \n\t" /* 0C0D0E0F */\
00906         "punpckhbw %%mm7, %%mm4           \n\t" /* 0B0C0D0E */\
00907         "paddw %%mm3, %%mm5               \n\t" /* b */\
00908         "paddw %%mm2, %%mm6               \n\t" /* c */\
00909         "paddw %%mm5, %%mm5               \n\t" /* 2b */\
00910         "psubw %%mm5, %%mm6               \n\t" /* c - 2b */\
00911         "pshufw $0x06, %%mm0, %%mm5       \n\t" /* 0C0B0A0A */\
00912         "pmullw "MANGLE(ff_pw_3)", %%mm6  \n\t" /* 3c - 6b */\
00913         "paddw %%mm4, %%mm0               \n\t" /* a */\
00914         "paddw %%mm1, %%mm5               \n\t" /* d */\
00915         "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
00916         "psubw %%mm5, %%mm0               \n\t" /* 20a - d */\
00917         "paddw %6, %%mm6                  \n\t"\
00918         "paddw %%mm6, %%mm0               \n\t" /* 20a - 6b + 3c - d */\
00919         "psraw $5, %%mm0                  \n\t"\
00920         "movq %%mm0, %5                   \n\t"\
00921         /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
00922         \
00923         "movq 5(%0), %%mm0                \n\t" /* FGHIJKLM */\
00924         "movq %%mm0, %%mm5                \n\t" /* FGHIJKLM */\
00925         "movq %%mm0, %%mm6                \n\t" /* FGHIJKLM */\
00926         "psrlq $8, %%mm0                  \n\t" /* GHIJKLM0 */\
00927         "psrlq $16, %%mm5                 \n\t" /* HIJKLM00 */\
00928         "punpcklbw %%mm7, %%mm0           \n\t" /* 0G0H0I0J */\
00929         "punpcklbw %%mm7, %%mm5           \n\t" /* 0H0I0J0K */\
00930         "paddw %%mm0, %%mm2               \n\t" /* b */\
00931         "paddw %%mm5, %%mm3               \n\t" /* c */\
00932         "paddw %%mm2, %%mm2               \n\t" /* 2b */\
00933         "psubw %%mm2, %%mm3               \n\t" /* c - 2b */\
00934         "movq %%mm6, %%mm2                \n\t" /* FGHIJKLM */\
00935         "psrlq $24, %%mm6                 \n\t" /* IJKLM000 */\
00936         "punpcklbw %%mm7, %%mm2           \n\t" /* 0F0G0H0I */\
00937         "punpcklbw %%mm7, %%mm6           \n\t" /* 0I0J0K0L */\
00938         "pmullw "MANGLE(ff_pw_3)", %%mm3  \n\t" /* 3c - 6b */\
00939         "paddw %%mm2, %%mm1               \n\t" /* a */\
00940         "paddw %%mm6, %%mm4               \n\t" /* d */\
00941         "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
00942         "psubw %%mm4, %%mm3               \n\t" /* - 6b +3c - d */\
00943         "paddw %6, %%mm1                  \n\t"\
00944         "paddw %%mm1, %%mm3               \n\t" /* 20a - 6b +3c - d */\
00945         "psraw $5, %%mm3                  \n\t"\
00946         "movq %5, %%mm1                   \n\t"\
00947         "packuswb %%mm3, %%mm1            \n\t"\
00948         OP_MMX2(%%mm1, (%1),%%mm4, q)\
00949         /* mm0= GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\
00950         \
00951         "movq 9(%0), %%mm1                \n\t" /* JKLMNOPQ */\
00952         "movq %%mm1, %%mm4                \n\t" /* JKLMNOPQ */\
00953         "movq %%mm1, %%mm3                \n\t" /* JKLMNOPQ */\
00954         "psrlq $8, %%mm1                  \n\t" /* KLMNOPQ0 */\
00955         "psrlq $16, %%mm4                 \n\t" /* LMNOPQ00 */\
00956         "punpcklbw %%mm7, %%mm1           \n\t" /* 0K0L0M0N */\
00957         "punpcklbw %%mm7, %%mm4           \n\t" /* 0L0M0N0O */\
00958         "paddw %%mm1, %%mm5               \n\t" /* b */\
00959         "paddw %%mm4, %%mm0               \n\t" /* c */\
00960         "paddw %%mm5, %%mm5               \n\t" /* 2b */\
00961         "psubw %%mm5, %%mm0               \n\t" /* c - 2b */\
00962         "movq %%mm3, %%mm5                \n\t" /* JKLMNOPQ */\
00963         "psrlq $24, %%mm3                 \n\t" /* MNOPQ000 */\
00964         "pmullw "MANGLE(ff_pw_3)", %%mm0  \n\t" /* 3c - 6b */\
00965         "punpcklbw %%mm7, %%mm3           \n\t" /* 0M0N0O0P */\
00966         "paddw %%mm3, %%mm2               \n\t" /* d */\
00967         "psubw %%mm2, %%mm0               \n\t" /* -6b + 3c - d */\
00968         "movq %%mm5, %%mm2                \n\t" /* JKLMNOPQ */\
00969         "punpcklbw %%mm7, %%mm2           \n\t" /* 0J0K0L0M */\
00970         "punpckhbw %%mm7, %%mm5           \n\t" /* 0N0O0P0Q */\
00971         "paddw %%mm2, %%mm6               \n\t" /* a */\
00972         "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */\
00973         "paddw %6, %%mm0                  \n\t"\
00974         "paddw %%mm6, %%mm0               \n\t" /* 20a - 6b + 3c - d */\
00975         "psraw $5, %%mm0                  \n\t"\
00976         /* mm1=KLMN, mm2=JKLM, mm3=MNOP, mm4=LMNO, mm5=NOPQ mm7=0 */\
00977         \
00978         "paddw %%mm5, %%mm3               \n\t" /* a */\
00979         "pshufw $0xF9, %%mm5, %%mm6       \n\t" /* 0O0P0Q0Q */\
00980         "paddw %%mm4, %%mm6               \n\t" /* b */\
00981         "pshufw $0xBE, %%mm5, %%mm4       \n\t" /* 0P0Q0Q0P */\
00982         "pshufw $0x6F, %%mm5, %%mm5       \n\t" /* 0Q0Q0P0O */\
00983         "paddw %%mm1, %%mm4               \n\t" /* c */\
00984         "paddw %%mm2, %%mm5               \n\t" /* d */\
00985         "paddw %%mm6, %%mm6               \n\t" /* 2b */\
00986         "psubw %%mm6, %%mm4               \n\t" /* c - 2b */\
00987         "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */\
00988         "pmullw "MANGLE(ff_pw_3)", %%mm4  \n\t" /* 3c - 6b */\
00989         "psubw %%mm5, %%mm3               \n\t" /* -6b + 3c - d */\
00990         "paddw %6, %%mm4                  \n\t"\
00991         "paddw %%mm3, %%mm4               \n\t" /* 20a - 6b + 3c - d */\
00992         "psraw $5, %%mm4                  \n\t"\
00993         "packuswb %%mm4, %%mm0            \n\t"\
00994         OP_MMX2(%%mm0, 8(%1), %%mm4, q)\
00995         \
00996         "add %3, %0                       \n\t"\
00997         "add %4, %1                       \n\t"\
00998         "decl %2                          \n\t"\
00999         " jnz 1b                          \n\t"\
01000         : "+a"(src), "+c"(dst), "+D"(h)\
01001         : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
01002         : "memory"\
01003     );\
01004 }\
01005 \
01006 static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
01007     int i;\
01008     int16_t temp[16];\
01009     /* quick HACK, XXX FIXME MUST be optimized */\
01010     for(i=0; i<h; i++)\
01011     {\
01012         temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
01013         temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
01014         temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
01015         temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
01016         temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
01017         temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]);\
01018         temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]);\
01019         temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]);\
01020         temp[ 8]= (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]);\
01021         temp[ 9]= (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]);\
01022         temp[10]= (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]);\
01023         temp[11]= (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]);\
01024         temp[12]= (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]);\
01025         temp[13]= (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]);\
01026         temp[14]= (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]);\
01027         temp[15]= (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]);\
01028         asm volatile(\
01029             "movq (%0), %%mm0               \n\t"\
01030             "movq 8(%0), %%mm1              \n\t"\
01031             "paddw %2, %%mm0                \n\t"\
01032             "paddw %2, %%mm1                \n\t"\
01033             "psraw $5, %%mm0                \n\t"\
01034             "psraw $5, %%mm1                \n\t"\
01035             "packuswb %%mm1, %%mm0          \n\t"\
01036             OP_3DNOW(%%mm0, (%1), %%mm1, q)\
01037             "movq 16(%0), %%mm0             \n\t"\
01038             "movq 24(%0), %%mm1             \n\t"\
01039             "paddw %2, %%mm0                \n\t"\
01040             "paddw %2, %%mm1                \n\t"\
01041             "psraw $5, %%mm0                \n\t"\
01042             "psraw $5, %%mm1                \n\t"\
01043             "packuswb %%mm1, %%mm0          \n\t"\
01044             OP_3DNOW(%%mm0, 8(%1), %%mm1, q)\
01045             :: "r"(temp), "r"(dst), "m"(ROUNDER)\
01046             : "memory"\
01047         );\
01048         dst+=dstStride;\
01049         src+=srcStride;\
01050     }\
01051 }\
01052 \
01053 static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
01054     asm volatile(\
01055         "pxor %%mm7, %%mm7                \n\t"\
01056         "1:                               \n\t"\
01057         "movq  (%0), %%mm0                \n\t" /* ABCDEFGH */\
01058         "movq %%mm0, %%mm1                \n\t" /* ABCDEFGH */\
01059         "movq %%mm0, %%mm2                \n\t" /* ABCDEFGH */\
01060         "punpcklbw %%mm7, %%mm0           \n\t" /* 0A0B0C0D */\
01061         "punpckhbw %%mm7, %%mm1           \n\t" /* 0E0F0G0H */\
01062         "pshufw $0x90, %%mm0, %%mm5       \n\t" /* 0A0A0B0C */\
01063         "pshufw $0x41, %%mm0, %%mm6       \n\t" /* 0B0A0A0B */\
01064         "movq %%mm2, %%mm3                \n\t" /* ABCDEFGH */\
01065         "movq %%mm2, %%mm4                \n\t" /* ABCDEFGH */\
01066         "psllq $8, %%mm2                  \n\t" /* 0ABCDEFG */\
01067         "psllq $16, %%mm3                 \n\t" /* 00ABCDEF */\
01068         "psllq $24, %%mm4                 \n\t" /* 000ABCDE */\
01069         "punpckhbw %%mm7, %%mm2           \n\t" /* 0D0E0F0G */\
01070         "punpckhbw %%mm7, %%mm3           \n\t" /* 0C0D0E0F */\
01071         "punpckhbw %%mm7, %%mm4           \n\t" /* 0B0C0D0E */\
01072         "paddw %%mm3, %%mm5               \n\t" /* b */\
01073         "paddw %%mm2, %%mm6               \n\t" /* c */\
01074         "paddw %%mm5, %%mm5               \n\t" /* 2b */\
01075         "psubw %%mm5, %%mm6               \n\t" /* c - 2b */\
01076         "pshufw $0x06, %%mm0, %%mm5       \n\t" /* 0C0B0A0A */\
01077         "pmullw "MANGLE(ff_pw_3)", %%mm6  \n\t" /* 3c - 6b */\
01078         "paddw %%mm4, %%mm0               \n\t" /* a */\
01079         "paddw %%mm1, %%mm5               \n\t" /* d */\
01080         "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
01081         "psubw %%mm5, %%mm0               \n\t" /* 20a - d */\
01082         "paddw %5, %%mm6                  \n\t"\
01083         "paddw %%mm6, %%mm0               \n\t" /* 20a - 6b + 3c - d */\
01084         "psraw $5, %%mm0                  \n\t"\
01085         /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
01086         \
01087         "movd 5(%0), %%mm5                \n\t" /* FGHI */\
01088         "punpcklbw %%mm7, %%mm5           \n\t" /* 0F0G0H0I */\
01089         "pshufw $0xF9, %%mm5, %%mm6       \n\t" /* 0G0H0I0I */\
01090         "paddw %%mm5, %%mm1               \n\t" /* a */\
01091         "paddw %%mm6, %%mm2               \n\t" /* b */\
01092         "pshufw $0xBE, %%mm5, %%mm6       \n\t" /* 0H0I0I0H */\
01093         "pshufw $0x6F, %%mm5, %%mm5       \n\t" /* 0I0I0H0G */\
01094         "paddw %%mm6, %%mm3               \n\t" /* c */\
01095         "paddw %%mm5, %%mm4               \n\t" /* d */\
01096         "paddw %%mm2, %%mm2               \n\t" /* 2b */\
01097         "psubw %%mm2, %%mm3               \n\t" /* c - 2b */\
01098         "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
01099         "pmullw "MANGLE(ff_pw_3)", %%mm3  \n\t" /* 3c - 6b */\
01100         "psubw %%mm4, %%mm3               \n\t" /* -6b + 3c - d */\
01101         "paddw %5, %%mm1                  \n\t"\
01102         "paddw %%mm1, %%mm3               \n\t" /* 20a - 6b + 3c - d */\
01103         "psraw $5, %%mm3                  \n\t"\
01104         "packuswb %%mm3, %%mm0            \n\t"\
01105         OP_MMX2(%%mm0, (%1), %%mm4, q)\
01106         \
01107         "add %3, %0                       \n\t"\
01108         "add %4, %1                       \n\t"\
01109         "decl %2                          \n\t"\
01110         " jnz 1b                          \n\t"\
01111         : "+a"(src), "+c"(dst), "+d"(h)\
01112         : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER)\
01113         : "memory"\
01114     );\
01115 }\
01116 \
01117 static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
01118     int i;\
01119     int16_t temp[8];\
01120     /* quick HACK, XXX FIXME MUST be optimized */\
01121     for(i=0; i<h; i++)\
01122     {\
01123         temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
01124         temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
01125         temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
01126         temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
01127         temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
01128         temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]);\
01129         temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\
01130         temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\
01131         asm volatile(\
01132             "movq (%0), %%mm0           \n\t"\
01133             "movq 8(%0), %%mm1          \n\t"\
01134             "paddw %2, %%mm0            \n\t"\
01135             "paddw %2, %%mm1            \n\t"\
01136             "psraw $5, %%mm0