1 files changed, 297 insertions, 0 deletions
diff --git a/vp8/encoder/x86/preproc_mmx.c b/vp8/encoder/x86/preproc_mmx.c
new file mode 100644
index 000000000..69617ca47
--- /dev/null
+++ b/vp8/encoder/x86/preproc_mmx.c
@@ -0,0 +1,297 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#include "memory.h"
+#include "preproc.h"
+#include "pragmas.h"
+
+/****************************************************************************
+*  Macros
+****************************************************************************/
+#define FRAMECOUNT 7    /* number of history slots kept per pixel group in ppi->frame_buffer */
+#define ROUNDUP32(X) ( ( ( (unsigned long) (X) ) + 31 )&( ~31UL ) )    /* round X up to a multiple of 32; X is parenthesised so the whole argument is cast, and the mask matches the width of unsigned long */
+
+/****************************************************************************
+*  Imports
+****************************************************************************/
+extern void vpx_get_processor_flags(int *mmx_enabled, int *xmm_enabled, int *wmt_enabled);   // declared here but not called anywhere in this file
+
+/****************************************************************************
+*  Exported Global Variables
+****************************************************************************/
+void (*temp_filter)(pre_proc_instance *ppi, unsigned char *s, unsigned char *d, int bytes, int strength);   // filter entry point; temp_filter_wmt and temp_filter_mmx below match this signature, but nothing in this file assigns the pointer
+
+/****************************************************************************
+ *
+ *  ROUTINE       : temp_filter_wmt
+ *
+ *  INPUTS        : pre_proc_instance *ppi : Pointer to pre-processor instance.
+ *                  unsigned char *s     : Pointer to source frame.
+ *                  unsigned char *d     : Pointer to destination frame.
+ *                  int bytes            : Number of bytes to filter.
+ *                  int strength         : Strength of filter to apply.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Performs a closeness-adjusted temporal blur
+ *
+ *  SPECIAL NOTES : Destination frame can be same as source frame.
+ *
+ ****************************************************************************/
+void temp_filter_wmt
+(
+    pre_proc_instance *ppi,
+    unsigned char *s,
+    unsigned char *d,
+    int bytes,
+    int strength
+)
+{
+    int byte = 0;                                   // pixels handled so far; advances by 8 per pass
+    unsigned char *frameptr = ppi->frame_buffer;    // history cursor: each group of 8 pixels owns FRAMECOUNT consecutive 8-byte slots
+
+    __declspec(align(16)) unsigned short threes[]  = { 3, 3, 3, 3, 3, 3, 3, 3};         // memory operand for pmullw below
+    __declspec(align(16)) unsigned short sixteens[] = {16, 16, 16, 16, 16, 16, 16, 16}; // maximum per-frame weight
+
+    if (ppi->frame == 0)                            // first call: seed every history slot with the source and copy source to destination unfiltered
+    {
+        do                                          // do/while: runs once even when bytes <= 0, and always handles a whole group of 8
+        {
+            int i;
+            int frame = 0;
+
+            do                                      // replicate the same 8 source pixels into all FRAMECOUNT slots
+            {
+                for (i = 0; i < 8; i++)
+                {
+                    *frameptr = s[byte+i];
+                    ++frameptr;
+                }
+
+                ++frame;
+            }
+            while (frame < FRAMECOUNT);
+
+            for (i = 0; i < 8; i++)
+                d[byte+i] = s[byte+i];
+
+            byte += 8;
+
+        }
+        while (byte < bytes);
+    }
+    else
+    {
+        int i;
+        int offset2 = (ppi->frame % FRAMECOUNT);    // history slot that this call overwrites with the current source pixels
+
+        do                                          // one pass per group of 8 pixels; runs at least once
+        {
+            __declspec(align(16)) unsigned short counts[8];   // per-pixel sum of weights (stored before rounding is applied)
+            __declspec(align(16)) unsigned short sums[8];     // per-pixel sum of weight * history pixel, plus counts/2 for rounding
+            __asm
+            {
+                mov         eax, offset2
+                mov         edi, s                  // source pixels
+                pxor        xmm1, xmm1              // accumulator
+
+                pxor        xmm7, xmm7              // stays zero: high-byte filler when widening bytes to words
+
+                mov         esi, frameptr           // history slots for this pixel group
+                pxor        xmm2, xmm2              // count
+
+                movq        xmm3, QWORD PTR [edi]   // load 8 source pixels
+
+                movq        QWORD PTR [esi+8*eax], xmm3   // record them in slot offset2 before the slots are read back
+
+                punpcklbw   xmm3, xmm2              // xmm3 source pixels (xmm2 is still zero here)
+                mov         ecx,  FRAMECOUNT        // loop counter: history slots left to visit
+
+                next_frame:
+                movq        xmm4, QWORD PTR [esi]   // get frame buffer values
+                punpcklbw   xmm4, xmm7              // xmm4 frame buffer pixels
+                movdqa      xmm6, xmm4              // save the pixel values
+                psubsw      xmm4, xmm3              // history pixel - source pixel
+                pmullw      xmm4, xmm4              // squared difference (at most 255*255, fits in 16 bits)
+                movd        xmm5, strength          // shift count; loop-invariant but reloaded because xmm5 is reused below
+                psrlw       xmm4, xmm5              // squared difference >> strength
+                pmullw      xmm4, threes            // 3 * modifier, low 16 bits only. NOTE(review): can wrap when strength is 0 or 1 and the difference is large -- confirm the caller's strength range
+                movdqa      xmm5, sixteens          // 16s
+                psubusw     xmm5, xmm4              // weight = 16 - modifier, saturating at 0
+                movdqa      xmm4, xmm5              // save the modifiers
+                pmullw      xmm4, xmm6              // weight * history pixel
+                paddusw     xmm1, xmm4              // accumulator
+                paddusw     xmm2, xmm5              // count
+                add         esi, 8                  // next frame
+                dec         ecx                     // one fewer history slot to visit
+                jnz         next_frame
+
+                movdqa      counts, xmm2
+                psrlw       xmm2, 1                 // divide count by 2 for rounding
+                paddusw     xmm1, xmm2              // rounding added in
+
+                mov         frameptr, esi           // esi has advanced past FRAMECOUNT slots, i.e. to the next pixel group
+
+                movdqa      sums, xmm1
+            }
+
+            for (i = 0; i < 8; i++)                 // normalise in C. NOTE(review): fixed_divide is presumably a 16.16 reciprocal table indexed by total weight -- confirm
+            {
+                int blurvalue = sums[i] * ppi->fixed_divide[counts[i]];
+                blurvalue >>= 16;
+                d[i] = blurvalue;
+            }
+
+            s += 8;
+            d += 8;
+            byte += 8;
+        }
+        while (byte < bytes);
+    }
+
+    ++ppi->frame;                                   // advances the counter from which the next call derives offset2
+    __asm emms                                      // NOTE(review): no mm register is used in this function, so this looks redundant -- confirm before removing
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : temp_filter_mmx
+ *
+ *  INPUTS        : pre_proc_instance *ppi : Pointer to pre-processor instance.
+ *                  unsigned char *s     : Pointer to source frame.
+ *                  unsigned char *d     : Pointer to destination frame.
+ *                  int bytes            : Number of bytes to filter.
+ *                  int strength         : Strength of filter to apply.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Performs a closeness-adjusted temporal blur
+ *
+ *  SPECIAL NOTES : Destination frame can be same as source frame.
+ *
+ ****************************************************************************/
+void temp_filter_mmx
+(
+    pre_proc_instance *ppi,
+    unsigned char *s,
+    unsigned char *d,
+    int bytes,
+    int strength
+)
+{
+    int byte = 0;                                   // pixels handled so far; advances by 4 per pass
+    unsigned char *frameptr = ppi->frame_buffer;    // history cursor: each group of 4 pixels owns FRAMECOUNT consecutive 4-byte slots
+
+    __declspec(align(16)) unsigned short threes[]  = { 3, 3, 3, 3};         // memory operand for pmullw below
+    __declspec(align(16)) unsigned short sixteens[] = {16, 16, 16, 16};     // maximum per-frame weight
+
+    if (ppi->frame == 0)                            // first call: seed every history slot with the source and copy source to destination unfiltered
+    {
+        do                                          // do/while: runs once even when bytes <= 0, and always handles a whole group of 4
+        {
+            int i;
+            int frame = 0;
+
+            do                                      // replicate the same 4 source pixels into all FRAMECOUNT slots
+            {
+                for (i = 0; i < 4; i++)
+                {
+                    *frameptr = s[byte+i];
+                    ++frameptr;
+                }
+
+                ++frame;
+            }
+            while (frame < FRAMECOUNT);
+
+            for (i = 0; i < 4; i++)
+                d[byte+i] = s[byte+i];
+
+            byte += 4;
+
+        }
+        while (byte < bytes);
+    }
+    else
+    {
+        int i;
+        int offset2 = (ppi->frame % FRAMECOUNT);    // history slot that this call overwrites with the current source pixels
+
+        do                                          // one pass per group of 4 pixels; runs at least once
+        {
+            __declspec(align(16)) unsigned short counts[8];   // per-pixel sum of weights; only entries 0..3 are written (movq) and read
+            __declspec(align(16)) unsigned short sums[8];     // per-pixel sum of weight * history pixel plus rounding; only entries 0..3 are used
+            __asm
+            {
+
+                mov         eax, offset2
+                mov         edi, s                  // source pixels
+                pxor        mm1, mm1                // accumulator
+                pxor        mm7, mm7                // stays zero: high-byte filler when widening bytes to words
+
+                mov         esi, frameptr           // history slots for this pixel group
+                pxor        mm2, mm2                // count
+
+                movd        mm3, DWORD PTR [edi]    // load 4 source pixels
+                movd        DWORD PTR [esi+4*eax], mm3    // record them in slot offset2 before the slots are read back
+
+                punpcklbw   mm3, mm2                // mm3 source pixels
+                mov         ecx,  FRAMECOUNT        // loop counter: history slots left to visit
+
+                next_frame:
+                movd        mm4, DWORD PTR [esi]    // get frame buffer values
+                punpcklbw   mm4, mm7                // mm4 frame buffer pixels
+                movq        mm6, mm4                // save the pixel values
+                psubsw      mm4, mm3                // history pixel - source pixel
+                pmullw      mm4, mm4                // squared difference (at most 255*255, fits in 16 bits)
+                movd        mm5, strength           // shift count; loop-invariant but reloaded because mm5 is reused below
+                psrlw       mm4, mm5                // squared difference >> strength
+                pmullw      mm4, threes             // 3 * modifier, low 16 bits only. NOTE(review): can wrap when strength is 0 or 1 and the difference is large -- confirm the caller's strength range
+                movq        mm5, sixteens           // 16s
+                psubusw     mm5, mm4                // weight = 16 - modifier, saturating at 0
+                movq        mm4, mm5                // save the modifiers
+                pmullw      mm4, mm6                // weight * history pixel
+                paddusw     mm1, mm4                // accumulator
+                paddusw     mm2, mm5                // count
+                add         esi, 4                  // next frame
+                dec         ecx                     // one fewer history slot to visit
+                jnz         next_frame
+
+                movq        counts, mm2
+                psrlw       mm2, 1                  // divide count by 2 for rounding
+                paddusw     mm1, mm2                // rounding added in
+
+                mov         frameptr, esi           // esi has advanced past FRAMECOUNT slots, i.e. to the next pixel group
+
+                movq        sums, mm1
+
+            }
+
+            for (i = 0; i < 4; i++)                 // normalise in C. NOTE(review): fixed_divide is presumably a 16.16 reciprocal table indexed by total weight -- confirm
+            {
+                int blurvalue = sums[i] * ppi->fixed_divide[counts[i]];
+                blurvalue >>= 16;
+                d[i] = blurvalue;
+            }
+
+            s += 4;
+            d += 4;
+            byte += 4;
+        }
+        while (byte < bytes);
+    }
+
+    ++ppi->frame;                                   // advances the counter from which the next call derives offset2
+    __asm emms                                      // leave MMX state so that later x87 floating-point code is not disturbed
+}