diff options
Diffstat (limited to 'vp8/encoder/x86/preproc_mmx.c')
-rw-r--r-- | vp8/encoder/x86/preproc_mmx.c | 297 |
1 files changed, 297 insertions, 0 deletions
diff --git a/vp8/encoder/x86/preproc_mmx.c b/vp8/encoder/x86/preproc_mmx.c new file mode 100644 index 000000000..69617ca47 --- /dev/null +++ b/vp8/encoder/x86/preproc_mmx.c @@ -0,0 +1,297 @@ +/* + * Copyright (c) 2010 The VP8 project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license and patent + * grant that can be found in the LICENSE file in the root of the source + * tree. All contributing project authors may be found in the AUTHORS + * file in the root of the source tree. + */ + + +#include "memory.h" +#include "preproc.h" +#include "pragmas.h" + +/**************************************************************************** +* Macros +****************************************************************************/ +#define FRAMECOUNT 7 +#define ROUNDUP32(X) ( ( ( (unsigned long) X ) + 31 )&( 0xFFFFFFE0 ) ) + +/**************************************************************************** +* Imports +****************************************************************************/ +extern void vpx_get_processor_flags(int *mmx_enabled, int *xmm_enabled, int *wmt_enabled); + +/**************************************************************************** +* Exported Global Variables +****************************************************************************/ +void (*temp_filter)(pre_proc_instance *ppi, unsigned char *s, unsigned char *d, int bytes, int strength); + +/**************************************************************************** + * + * ROUTINE : temp_filter_wmt + * + * INPUTS : pre_proc_instance *ppi : Pointer to pre-processor instance. + * unsigned char *s : Pointer to source frame. + * unsigned char *d : Pointer to destination frame. + * int bytes : Number of bytes to filter. + * int strength : Strength of filter to apply. + * + * OUTPUTS : None. + * + * RETURNS : void + * + * FUNCTION : Performs a closesness adjusted temporarl blur + * + * SPECIAL NOTES : Destination frame can be same as source frame. + * + ****************************************************************************/ +void temp_filter_wmt +( + pre_proc_instance *ppi, + unsigned char *s, + unsigned char *d, + int bytes, + int strength +) +{ + int byte = 0; + unsigned char *frameptr = ppi->frame_buffer; + + __declspec(align(16)) unsigned short threes[] = { 3, 3, 3, 3, 3, 3, 3, 3}; + __declspec(align(16)) unsigned short sixteens[] = {16, 16, 16, 16, 16, 16, 16, 16}; + + if (ppi->frame == 0) + { + do + { + int i; + int frame = 0; + + do + { + for (i = 0; i < 8; i++) + { + *frameptr = s[byte+i]; + ++frameptr; + } + + ++frame; + } + while (frame < FRAMECOUNT); + + for (i = 0; i < 8; i++) + d[byte+i] = s[byte+i]; + + byte += 8; + + } + while (byte < bytes); + } + else + { + int i; + int offset2 = (ppi->frame % FRAMECOUNT); + + do + { + __declspec(align(16)) unsigned short counts[8]; + __declspec(align(16)) unsigned short sums[8]; + __asm + { + mov eax, offset2 + mov edi, s // source pixels + pxor xmm1, xmm1 // accumulator + + pxor xmm7, xmm7 + + mov esi, frameptr // accumulator + pxor xmm2, xmm2 // count + + movq xmm3, QWORD PTR [edi] + + movq QWORD PTR [esi+8*eax], xmm3 + + punpcklbw xmm3, xmm2 // xmm3 source pixels + mov ecx, FRAMECOUNT + + next_frame: + movq xmm4, QWORD PTR [esi] // get frame buffer values + punpcklbw xmm4, xmm7 // xmm4 frame buffer pixels + movdqa xmm6, xmm4 // save the pixel values + psubsw xmm4, xmm3 // subtracted pixel values + pmullw xmm4, xmm4 // square xmm4 + movd xmm5, strength + psrlw xmm4, xmm5 // should be strength + pmullw xmm4, threes // 3 * modifier + movdqa xmm5, sixteens // 16s + psubusw xmm5, xmm4 // 16 - modifiers + movdqa xmm4, xmm5 // save the modifiers + pmullw xmm4, xmm6 // multiplier values + paddusw xmm1, xmm4 // accumulator + paddusw xmm2, xmm5 // count + add esi, 8 // next frame + dec ecx // next set of eight pixels + jnz next_frame + + movdqa counts, xmm2 + psrlw xmm2, 1 // divide count by 2 for rounding + paddusw xmm1, xmm2 // rounding added in + + mov frameptr, esi + + movdqa sums, xmm1 + } + + for (i = 0; i < 8; i++) + { + int blurvalue = sums[i] * ppi->fixed_divide[counts[i]]; + blurvalue >>= 16; + d[i] = blurvalue; + } + + s += 8; + d += 8; + byte += 8; + } + while (byte < bytes); + } + + ++ppi->frame; + __asm emms +} + +/**************************************************************************** + * + * ROUTINE : temp_filter_mmx + * + * INPUTS : pre_proc_instance *ppi : Pointer to pre-processor instance. + * unsigned char *s : Pointer to source frame. + * unsigned char *d : Pointer to destination frame. + * int bytes : Number of bytes to filter. + * int strength : Strength of filter to apply. + * + * OUTPUTS : None. + * + * RETURNS : void + * + * FUNCTION : Performs a closesness adjusted temporarl blur + * + * SPECIAL NOTES : Destination frame can be same as source frame. + * + ****************************************************************************/ +void temp_filter_mmx +( + pre_proc_instance *ppi, + unsigned char *s, + unsigned char *d, + int bytes, + int strength +) +{ + int byte = 0; + unsigned char *frameptr = ppi->frame_buffer; + + __declspec(align(16)) unsigned short threes[] = { 3, 3, 3, 3}; + __declspec(align(16)) unsigned short sixteens[] = {16, 16, 16, 16}; + + if (ppi->frame == 0) + { + do + { + int i; + int frame = 0; + + do + { + for (i = 0; i < 4; i++) + { + *frameptr = s[byte+i]; + ++frameptr; + } + + ++frame; + } + while (frame < FRAMECOUNT); + + for (i = 0; i < 4; i++) + d[byte+i] = s[byte+i]; + + byte += 4; + + } + while (byte < bytes); + } + else + { + int i; + int offset2 = (ppi->frame % FRAMECOUNT); + + do + { + __declspec(align(16)) unsigned short counts[8]; + __declspec(align(16)) unsigned short sums[8]; + __asm + { + + mov eax, offset2 + mov edi, s // source pixels + pxor mm1, mm1 // accumulator + pxor mm7, mm7 + + mov esi, frameptr // accumulator + pxor mm2, mm2 // count + + movd mm3, DWORD PTR [edi] + movd DWORD PTR [esi+4*eax], mm3 + + punpcklbw mm3, mm2 // mm3 source pixels + mov ecx, FRAMECOUNT + + next_frame: + movd mm4, DWORD PTR [esi] // get frame buffer values + punpcklbw mm4, mm7 // mm4 frame buffer pixels + movq mm6, mm4 // save the pixel values + psubsw mm4, mm3 // subtracted pixel values + pmullw mm4, mm4 // square mm4 + movd mm5, strength + psrlw mm4, mm5 // should be strength + pmullw mm4, threes // 3 * modifier + movq mm5, sixteens // 16s + psubusw mm5, mm4 // 16 - modifiers + movq mm4, mm5 // save the modifiers + pmullw mm4, mm6 // multiplier values + paddusw mm1, mm4 // accumulator + paddusw mm2, mm5 // count + add esi, 4 // next frame + dec ecx // next set of eight pixels + jnz next_frame + + movq counts, mm2 + psrlw mm2, 1 // divide count by 2 for rounding + paddusw mm1, mm2 // rounding added in + + mov frameptr, esi + + movq sums, mm1 + + } + + for (i = 0; i < 4; i++) + { + int blurvalue = sums[i] * ppi->fixed_divide[counts[i]]; + blurvalue >>= 16; + d[i] = blurvalue; + } + + s += 4; + d += 4; + byte += 4; + } + while (byte < bytes); + } + + ++ppi->frame; + __asm emms +} |