summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorYaowu Xu <yaowu@google.com>2010-06-16 12:52:18 -0700
committerYaowu Xu <yaowu@google.com>2010-06-24 13:17:58 -0700
commitd0dd01b8ce8bc5f477d70f1c127d795418c5efb5 (patch)
tree335898f9122085c141c49d44182b52f580100bc9
parenta5906668a32c46a0f033b3a503ac5a159b77fce3 (diff)
downloadlibvpx-d0dd01b8ce8bc5f477d70f1c127d795418c5efb5.tar.gz
Redo the forward 4x4 dct
The new fdct lowers the round trip sum squared error for a 4x4 block ~0.12. or ~0.008/pixel. For reference, the old matrix multiply version has average round trip error 1.46 for a 4x4 block. Thanks to "derf" for his suggestions and references. Change-Id: I5559d1e81d333b319404ab16b336b739f87afc79
-rw-r--r--vp8/encoder/block.h5
-rw-r--r--vp8/encoder/dct.c141
-rw-r--r--vp8/encoder/dct.h10
-rw-r--r--vp8/encoder/encodeintra.c2
-rw-r--r--vp8/encoder/encodemb.c39
-rw-r--r--vp8/encoder/ethreading.c3
-rw-r--r--vp8/encoder/generic/csystemdependent.c4
-rw-r--r--vp8/encoder/onyx_if.c6
-rw-r--r--vp8/encoder/rdopt.c4
-rw-r--r--vp8/encoder/x86/csystemdependent.c26
-rw-r--r--vp8/encoder/x86/dct_mmx.asm392
-rw-r--r--vp8/encoder/x86/dct_x86.h13
-rw-r--r--vp8/encoder/x86/x86_csystemdependent.c34
-rw-r--r--vp8/vp8cx.mk1
14 files changed, 118 insertions, 562 deletions
diff --git a/vp8/encoder/block.h b/vp8/encoder/block.h
index c1fcfe29a..b55bc51cb 100644
--- a/vp8/encoder/block.h
+++ b/vp8/encoder/block.h
@@ -100,14 +100,9 @@ typedef struct
void (*vp8_short_fdct4x4)(short *input, short *output, int pitch);
void (*vp8_short_fdct8x4)(short *input, short *output, int pitch);
- void (*short_fdct4x4rd)(short *input, short *output, int pitch);
- void (*short_fdct8x4rd)(short *input, short *output, int pitch);
void (*short_walsh4x4)(short *input, short *output, int pitch);
-
void (*quantize_b)(BLOCK *b, BLOCKD *d);
-
-
} MACROBLOCK;
diff --git a/vp8/encoder/dct.c b/vp8/encoder/dct.c
index 3075e5853..58e36109c 100644
--- a/vp8/encoder/dct.c
+++ b/vp8/encoder/dct.c
@@ -11,163 +11,54 @@
#include <math.h>
-
-static const short dct_matrix2[4][4] =
-{
- { 23170, 30274, 23170, 12540 },
- { 23170, 12540, -23170, -30274 },
- { 23170, -12540, -23170, 30274 },
- { 23170, -30274, 23170, -12540 }
-};
-
-static const short dct_matrix1[4][4] =
-{
- { 23170, 23170, 23170, 23170 },
- { 30274, 12540, -12540, -30274 },
- { 23170, -23170, -23170, 23170 },
- { 12540, -30274, 30274, -12540 }
-};
-
-
-#define _1STSTAGESHIFT 14
-#define _1STSTAGEROUNDING (1<<( _1STSTAGESHIFT-1))
-#define _2NDSTAGESHIFT 16
-#define _2NDSTAGEROUNDING (1<<( _2NDSTAGESHIFT-1))
-
-// using matrix multiply
void vp8_short_fdct4x4_c(short *input, short *output, int pitch)
{
- int i, j, k;
- short temp[4][4];
- int sumtemp;
- pitch >>= 1;
-
- for (i = 0; i < 4; i++)
- {
- for (j = 0; j < 4; j++)
- {
- sumtemp = 0;
-
- for (k = 0; k < 4; k++)
- {
- sumtemp += input[i*pitch+k] * dct_matrix2[k][j];
-
- }
-
- temp[i][j] = (short)((sumtemp + _1STSTAGEROUNDING) >> _1STSTAGESHIFT);
- }
- }
-
-
- for (i = 0; i < 4; i++)
- {
- for (j = 0; j < 4; j++)
- {
- sumtemp = 0;
-
- for (k = 0; k < 4; k++)
- {
- sumtemp += dct_matrix1[i][ k] * temp[k][ j];
- }
-
- output[i*4+j] = (short)((sumtemp + _2NDSTAGEROUNDING) >> _2NDSTAGESHIFT);
- }
- }
-
-}
-
-
-void vp8_short_fdct8x4_c(short *input, short *output, int pitch)
-{
- vp8_short_fdct4x4_c(input, output, pitch);
- vp8_short_fdct4x4_c(input + 4, output + 16, pitch);
-}
-
-
-static const signed short x_c1 = 60547;
-static const signed short x_c2 = 46341;
-static const signed short x_c3 = 25080;
-
-void vp8_fast_fdct4x4_c(short *input, short *output, int pitch)
-{
int i;
int a1, b1, c1, d1;
- int a2, b2, c2, d2;
short *ip = input;
-
short *op = output;
- int temp1, temp2;
for (i = 0; i < 4; i++)
{
- a1 = (ip[0] + ip[3]) * 2;
- b1 = (ip[1] + ip[2]) * 2;
- c1 = (ip[1] - ip[2]) * 2;
- d1 = (ip[0] - ip[3]) * 2;
-
- temp1 = a1 + b1;
- temp2 = a1 - b1;
-
- op[0] = ((temp1 * x_c2) >> 16) + temp1;
- op[2] = ((temp2 * x_c2) >> 16) + temp2;
-
- temp1 = (c1 * x_c3) >> 16;
- temp2 = ((d1 * x_c1) >> 16) + d1;
+ a1 = ((ip[0] + ip[3])<<3);
+ b1 = ((ip[1] + ip[2])<<3);
+ c1 = ((ip[1] - ip[2])<<3);
+ d1 = ((ip[0] - ip[3])<<3);
- op[1] = temp1 + temp2;
-
- temp1 = (d1 * x_c3) >> 16;
- temp2 = ((c1 * x_c1) >> 16) + c1;
+ op[0] = a1 + b1;
+ op[2] = a1 - b1;
- op[3] = temp1 - temp2;
+ op[1] = (c1 * 2217 + d1 * 5352 + 14500)>>12;
+ op[3] = (d1 * 2217 - c1 * 5352 + 7500)>>12;
ip += pitch / 2;
op += 4;
- }
+ }
ip = output;
op = output;
-
for (i = 0; i < 4; i++)
{
-
a1 = ip[0] + ip[12];
b1 = ip[4] + ip[8];
c1 = ip[4] - ip[8];
d1 = ip[0] - ip[12];
+ op[0] = ( a1 + b1 + 7)>>4;
+ op[8] = ( a1 - b1 + 7)>>4;
- temp1 = a1 + b1;
- temp2 = a1 - b1;
-
- a2 = ((temp1 * x_c2) >> 16) + temp1;
- c2 = ((temp2 * x_c2) >> 16) + temp2;
-
- temp1 = (c1 * x_c3) >> 16;
- temp2 = ((d1 * x_c1) >> 16) + d1;
-
- b2 = temp1 + temp2;
-
- temp1 = (d1 * x_c3) >> 16;
- temp2 = ((c1 * x_c1) >> 16) + c1;
-
- d2 = temp1 - temp2;
-
-
- op[0] = (a2 + 1) >> 1;
- op[4] = (b2 + 1) >> 1;
- op[8] = (c2 + 1) >> 1;
- op[12] = (d2 + 1) >> 1;
+ op[4] =((c1 * 2217 + d1 * 5352 + 12000)>>16) + (d1!=0);
+ op[12] = (d1 * 2217 - c1 * 5352 + 51000)>>16;
ip++;
op++;
}
}
-void vp8_fast_fdct8x4_c(short *input, short *output, int pitch)
+void vp8_short_fdct8x4_c(short *input, short *output, int pitch)
{
- vp8_fast_fdct4x4_c(input, output, pitch);
- vp8_fast_fdct4x4_c(input + 4, output + 16, pitch);
+ vp8_short_fdct4x4_c(input, output, pitch);
+ vp8_short_fdct4x4_c(input + 4, output + 16, pitch);
}
void vp8_short_walsh4x4_c(short *input, short *output, int pitch)
diff --git a/vp8/encoder/dct.h b/vp8/encoder/dct.h
index f79dba4f2..0ab40b310 100644
--- a/vp8/encoder/dct.h
+++ b/vp8/encoder/dct.h
@@ -32,16 +32,6 @@ extern prototype_fdct(vp8_fdct_short4x4);
#endif
extern prototype_fdct(vp8_fdct_short8x4);
-#ifndef vp8_fdct_fast4x4
-#define vp8_fdct_fast4x4 vp8_fast_fdct4x4_c
-#endif
-extern prototype_fdct(vp8_fdct_fast4x4);
-
-#ifndef vp8_fdct_fast8x4
-#define vp8_fdct_fast8x4 vp8_fast_fdct8x4_c
-#endif
-extern prototype_fdct(vp8_fdct_fast8x4);
-
#ifndef vp8_fdct_walsh_short4x4
#define vp8_fdct_walsh_short4x4 vp8_short_walsh4x4_c
#endif
diff --git a/vp8/encoder/encodeintra.c b/vp8/encoder/encodeintra.c
index 0e160930d..870cb5815 100644
--- a/vp8/encoder/encodeintra.c
+++ b/vp8/encoder/encodeintra.c
@@ -66,7 +66,7 @@ void vp8_encode_intra4x4block_rd(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x, BL
ENCODEMB_INVOKE(&rtcd->encodemb, subb)(be, b, 16);
- x->short_fdct4x4rd(be->src_diff, be->coeff, 32);
+ x->vp8_short_fdct4x4(be->src_diff, be->coeff, 32);
x->quantize_b(be, b);
diff --git a/vp8/encoder/encodemb.c b/vp8/encoder/encodemb.c
index 824850c41..8bc01df5b 100644
--- a/vp8/encoder/encodemb.c
+++ b/vp8/encoder/encodemb.c
@@ -130,7 +130,8 @@ void vp8_transform_mbuvrd(MACROBLOCK *x)
for (i = 16; i < 24; i += 2)
{
- x->short_fdct8x4rd(&x->block[i].src_diff[0], &x->block[i].coeff[0], 16);
+ x->vp8_short_fdct8x4(&x->block[i].src_diff[0],
+ &x->block[i].coeff[0], 16);
}
}
@@ -140,14 +141,16 @@ void vp8_transform_intra_mby(MACROBLOCK *x)
for (i = 0; i < 16; i += 2)
{
- x->vp8_short_fdct8x4(&x->block[i].src_diff[0], &x->block[i].coeff[0], 32);
+ x->vp8_short_fdct8x4(&x->block[i].src_diff[0],
+ &x->block[i].coeff[0], 32);
}
// build dc block from 16 y dc values
vp8_build_dcblock(x);
// do 2nd order transform on the dc block
- x->short_walsh4x4(&x->block[24].src_diff[0], &x->block[24].coeff[0], 8);
+ x->short_walsh4x4(&x->block[24].src_diff[0],
+ &x->block[24].coeff[0], 8);
}
@@ -157,14 +160,16 @@ void vp8_transform_intra_mbyrd(MACROBLOCK *x)
for (i = 0; i < 16; i += 2)
{
- x->short_fdct8x4rd(&x->block[i].src_diff[0], &x->block[i].coeff[0], 32);
+ x->vp8_short_fdct8x4(&x->block[i].src_diff[0],
+ &x->block[i].coeff[0], 32);
}
// build dc block from 16 y dc values
vp8_build_dcblock(x);
// do 2nd order transform on the dc block
- x->short_walsh4x4(&x->block[24].src_diff[0], &x->block[24].coeff[0], 8);
+ x->short_walsh4x4(&x->block[24].src_diff[0],
+ &x->block[24].coeff[0], 8);
}
void vp8_transform_mb(MACROBLOCK *x)
@@ -173,7 +178,8 @@ void vp8_transform_mb(MACROBLOCK *x)
for (i = 0; i < 16; i += 2)
{
- x->vp8_short_fdct8x4(&x->block[i].src_diff[0], &x->block[i].coeff[0], 32);
+ x->vp8_short_fdct8x4(&x->block[i].src_diff[0],
+ &x->block[i].coeff[0], 32);
}
// build dc block from 16 y dc values
@@ -182,12 +188,14 @@ void vp8_transform_mb(MACROBLOCK *x)
for (i = 16; i < 24; i += 2)
{
- x->vp8_short_fdct8x4(&x->block[i].src_diff[0], &x->block[i].coeff[0], 16);
+ x->vp8_short_fdct8x4(&x->block[i].src_diff[0],
+ &x->block[i].coeff[0], 16);
}
// do 2nd order transform on the dc block
if (x->e_mbd.mbmi.mode != SPLITMV)
- x->short_walsh4x4(&x->block[24].src_diff[0], &x->block[24].coeff[0], 8);
+ x->short_walsh4x4(&x->block[24].src_diff[0],
+ &x->block[24].coeff[0], 8);
}
@@ -197,14 +205,16 @@ void vp8_transform_mby(MACROBLOCK *x)
for (i = 0; i < 16; i += 2)
{
- x->vp8_short_fdct8x4(&x->block[i].src_diff[0], &x->block[i].coeff[0], 32);
+ x->vp8_short_fdct8x4(&x->block[i].src_diff[0],
+ &x->block[i].coeff[0], 32);
}
// build dc block from 16 y dc values
if (x->e_mbd.mbmi.mode != SPLITMV)
{
vp8_build_dcblock(x);
- x->short_walsh4x4(&x->block[24].src_diff[0], &x->block[24].coeff[0], 8);
+ x->short_walsh4x4(&x->block[24].src_diff[0],
+ &x->block[24].coeff[0], 8);
}
}
@@ -214,7 +224,8 @@ void vp8_transform_mbrd(MACROBLOCK *x)
for (i = 0; i < 16; i += 2)
{
- x->short_fdct8x4rd(&x->block[i].src_diff[0], &x->block[i].coeff[0], 32);
+ x->vp8_short_fdct8x4(&x->block[i].src_diff[0],
+ &x->block[i].coeff[0], 32);
}
// build dc block from 16 y dc values
@@ -223,12 +234,14 @@ void vp8_transform_mbrd(MACROBLOCK *x)
for (i = 16; i < 24; i += 2)
{
- x->short_fdct8x4rd(&x->block[i].src_diff[0], &x->block[i].coeff[0], 16);
+ x->vp8_short_fdct8x4(&x->block[i].src_diff[0],
+ &x->block[i].coeff[0], 16);
}
// do 2nd order transform on the dc block
if (x->e_mbd.mbmi.mode != SPLITMV)
- x->short_walsh4x4(&x->block[24].src_diff[0], &x->block[24].coeff[0], 8);
+ x->short_walsh4x4(&x->block[24].src_diff[0],
+ &x->block[24].coeff[0], 8);
}
void vp8_stuff_inter16x16(MACROBLOCK *x)
diff --git a/vp8/encoder/ethreading.c b/vp8/encoder/ethreading.c
index a205667dc..dd98a09d1 100644
--- a/vp8/encoder/ethreading.c
+++ b/vp8/encoder/ethreading.c
@@ -257,9 +257,6 @@ static void setup_mbby_copy(MACROBLOCK *mbdst, MACROBLOCK *mbsrc)
z->vp8_short_fdct4x4 = x->vp8_short_fdct4x4;
z->vp8_short_fdct8x4 = x->vp8_short_fdct8x4;
- z->short_fdct4x4rd = x->short_fdct4x4rd;
- z->short_fdct8x4rd = x->short_fdct8x4rd;
- z->short_fdct8x4rd = x->short_fdct8x4rd;
z->short_walsh4x4 = x->short_walsh4x4;
z->quantize_b = x->quantize_b;
diff --git a/vp8/encoder/generic/csystemdependent.c b/vp8/encoder/generic/csystemdependent.c
index e68d65025..dd89f1a82 100644
--- a/vp8/encoder/generic/csystemdependent.c
+++ b/vp8/encoder/generic/csystemdependent.c
@@ -68,8 +68,8 @@ void vp8_cmachine_specific_config(VP8_COMP *cpi)
cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_c;
cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_c;
- cpi->rtcd.fdct.fast4x4 = vp8_fast_fdct4x4_c;
- cpi->rtcd.fdct.fast8x4 = vp8_fast_fdct8x4_c;
+ cpi->rtcd.fdct.fast4x4 = vp8_short_fdct4x4_c;
+ cpi->rtcd.fdct.fast8x4 = vp8_short_fdct8x4_c;
cpi->rtcd.fdct.walsh_short4x4 = vp8_short_walsh4x4_c;
cpi->rtcd.encodemb.berr = vp8_block_error_c;
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index f3456a733..60d807c03 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -137,8 +137,6 @@ extern unsigned int inter_b_modes[15];
extern void (*vp8_short_fdct4x4)(short *input, short *output, int pitch);
extern void (*vp8_short_fdct8x4)(short *input, short *output, int pitch);
-extern void (*vp8_fast_fdct4x4)(short *input, short *output, int pitch);
-extern void (*vp8_fast_fdct8x4)(short *input, short *output, int pitch);
extern const int vp8_bits_per_mb[2][QINDEX_RANGE];
@@ -1136,15 +1134,11 @@ void vp8_set_speed_features(VP8_COMP *cpi)
{
cpi->mb.vp8_short_fdct8x4 = FDCT_INVOKE(&cpi->rtcd.fdct, short8x4);
cpi->mb.vp8_short_fdct4x4 = FDCT_INVOKE(&cpi->rtcd.fdct, short4x4);
- cpi->mb.short_fdct8x4rd = FDCT_INVOKE(&cpi->rtcd.fdct, short8x4);
- cpi->mb.short_fdct4x4rd = FDCT_INVOKE(&cpi->rtcd.fdct, short4x4);
}
else
{
cpi->mb.vp8_short_fdct8x4 = FDCT_INVOKE(&cpi->rtcd.fdct, fast8x4);
cpi->mb.vp8_short_fdct4x4 = FDCT_INVOKE(&cpi->rtcd.fdct, fast4x4);
- cpi->mb.short_fdct8x4rd = FDCT_INVOKE(&cpi->rtcd.fdct, fast8x4);
- cpi->mb.short_fdct4x4rd = FDCT_INVOKE(&cpi->rtcd.fdct, fast4x4);
}
cpi->mb.short_walsh4x4 = FDCT_INVOKE(&cpi->rtcd.fdct, walsh_short4x4);
diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c
index 2d6dee139..70cf122fa 100644
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@@ -1028,7 +1028,7 @@ static unsigned int vp8_encode_inter_mb_segment(MACROBLOCK *x, int const *labels
vp8_build_inter_predictors_b(bd, 16, x->e_mbd.subpixel_predict);
ENCODEMB_INVOKE(rtcd, subb)(be, bd, 16);
- x->short_fdct4x4rd(be->src_diff, be->coeff, 32);
+ x->vp8_short_fdct4x4(be->src_diff, be->coeff, 32);
// set to 0 no way to account for 2nd order DC so discount
//be->coeff[0] = 0;
@@ -1056,7 +1056,7 @@ static void macro_block_yrd(MACROBLOCK *mb, int *Rate, int *Distortion, const vp
// Fdct and building the 2nd order block
for (beptr = mb->block; beptr < mb->block + 16; beptr += 2)
{
- mb->short_fdct8x4rd(beptr->src_diff, beptr->coeff, 32);
+ mb->vp8_short_fdct8x4(beptr->src_diff, beptr->coeff, 32);
*Y2DCPtr++ = beptr->coeff[0];
*Y2DCPtr++ = beptr->coeff[16];
}
diff --git a/vp8/encoder/x86/csystemdependent.c b/vp8/encoder/x86/csystemdependent.c
index 6aeac508f..bf12fee54 100644
--- a/vp8/encoder/x86/csystemdependent.c
+++ b/vp8/encoder/x86/csystemdependent.c
@@ -181,10 +181,17 @@ void vp8_cmachine_specific_config(void)
// Willamette instruction set available:
vp8_mbuverror = vp8_mbuverror_xmm;
vp8_fast_quantize_b = vp8_fast_quantize_b_sse;
+#if 0 //new fdct
vp8_short_fdct4x4 = vp8_short_fdct4x4_mmx;
vp8_short_fdct8x4 = vp8_short_fdct8x4_mmx;
- vp8_fast_fdct4x4 = vp8_fast_fdct4x4_mmx;
- vp8_fast_fdct8x4 = vp8_fast_fdct8x4_wmt;
+ vp8_fast_fdct4x4 = vp8_short_fdct4x4_mmx;
+ vp8_fast_fdct8x4 = vp8_short_fdct8x4_wmt;
+#else
+ vp8_short_fdct4x4 = vp8_short_fdct4x4_c;
+ vp8_short_fdct8x4 = vp8_short_fdct8x4_c;
+ vp8_fast_fdct4x4 = vp8_short_fdct4x4_c;
+ vp8_fast_fdct8x4 = vp8_fast_fdct8x4_c;
+#endif
vp8_subtract_b = vp8_subtract_b_mmx;
vp8_subtract_mbuv = vp8_subtract_mbuv_mmx;
vp8_variance4x4 = vp8_variance4x4_mmx;
@@ -218,10 +225,17 @@ void vp8_cmachine_specific_config(void)
// MMX instruction set available:
vp8_mbuverror = vp8_mbuverror_mmx;
vp8_fast_quantize_b = vp8_fast_quantize_b_mmx;
+#if 0 // new fdct
vp8_short_fdct4x4 = vp8_short_fdct4x4_mmx;
vp8_short_fdct8x4 = vp8_short_fdct8x4_mmx;
- vp8_fast_fdct4x4 = vp8_fast_fdct4x4_mmx;
- vp8_fast_fdct8x4 = vp8_fast_fdct8x4_mmx;
+ vp8_fast_fdct4x4 = vp8_short_fdct4x4_mmx;
+ vp8_fast_fdct8x4 = vp8_short_fdct8x4_mmx;
+#else
+ vp8_short_fdct4x4 = vp8_short_fdct4x4_c;
+ vp8_short_fdct8x4 = vp8_short_fdct8x4_c;
+ vp8_fast_fdct4x4 = vp8_short_fdct4x4_c;
+ vp8_fast_fdct8x4 = vp8_fast_fdct8x4_c;
+#endif
vp8_subtract_b = vp8_subtract_b_mmx;
vp8_subtract_mbuv = vp8_subtract_mbuv_mmx;
vp8_variance4x4 = vp8_variance4x4_mmx;
@@ -254,10 +268,10 @@ void vp8_cmachine_specific_config(void)
{
// Pure C:
vp8_mbuverror = vp8_mbuverror_c;
- vp8_fast_quantize_b = vp8_fast_quantize_b_c;
+ vp8_fast_quantize_b = vp8_fast_quantize_b_c;
vp8_short_fdct4x4 = vp8_short_fdct4x4_c;
vp8_short_fdct8x4 = vp8_short_fdct8x4_c;
- vp8_fast_fdct4x4 = vp8_fast_fdct4x4_c;
+ vp8_fast_fdct4x4 = vp8_short_fdct4x4_c;
vp8_fast_fdct8x4 = vp8_fast_fdct8x4_c;
vp8_subtract_b = vp8_subtract_b_c;
vp8_subtract_mbuv = vp8_subtract_mbuv_c;
diff --git a/vp8/encoder/x86/dct_mmx.asm b/vp8/encoder/x86/dct_mmx.asm
index 32d6610aa..ff96c49f3 100644
--- a/vp8/encoder/x86/dct_mmx.asm
+++ b/vp8/encoder/x86/dct_mmx.asm
@@ -13,8 +13,7 @@
section .text
global sym(vp8_short_fdct4x4_mmx)
- global sym(vp8_fast_fdct4x4_mmx)
- global sym(vp8_fast_fdct8x4_wmt)
+ global sym(vp8_short_fdct8x4_wmt)
%define DCTCONSTANTSBITS (16)
@@ -24,10 +23,6 @@ section .text
%define x_c3 (25080) ; cos(pi*3/8) * (1<<15)
-%define _1STSTAGESHIFT 14
-%define _2NDSTAGESHIFT 16
-
-; using matrix multiply with source and destbuffer has a pitch
;void vp8_short_fdct4x4_mmx(short *input, short *output, int pitch)
sym(vp8_short_fdct4x4_mmx):
push rbp
@@ -37,333 +32,6 @@ sym(vp8_short_fdct4x4_mmx):
push rsi
push rdi
; end prolog
-
- mov rsi, arg(0) ;input
- mov rdi, arg(1) ;output
-
- movsxd rax, dword ptr arg(2) ;pitch
- lea rdx, [dct_matrix GLOBAL]
-
- movq mm0, [rsi ]
- movq mm1, [rsi + rax]
-
- movq mm2, [rsi + rax*2]
- lea rsi, [rsi + rax*2]
-
- movq mm3, [rsi + rax]
-
- ; first column
- movq mm4, mm0
- movq mm7, [rdx]
-
- pmaddwd mm4, mm7
- movq mm5, mm1
-
- pmaddwd mm5, mm7
- movq mm6, mm4
-
- punpckldq mm4, mm5
- punpckhdq mm6, mm5
-
- paddd mm4, mm6
- movq mm5, mm2
-
-
- pmaddwd mm5, mm7
- movq mm6, mm3
-
- pmaddwd mm6, mm7
- movq mm7, mm5
-
- punpckldq mm5, mm6
- punpckhdq mm7, mm6
-
- paddd mm5, mm7
- movq mm6, [dct1st_stage_rounding_mmx GLOBAL]
-
- paddd mm4, mm6
- paddd mm5, mm6
-
- psrad mm4, _1STSTAGESHIFT
- psrad mm5, _1STSTAGESHIFT
-
- packssdw mm4, mm5
- movq [rdi], mm4
-
- ;second column
- movq mm4, mm0
-
- pmaddwd mm4, [rdx+8]
- movq mm5, mm1
-
- pmaddwd mm5, [rdx+8]
- movq mm6, mm4
-
- punpckldq mm4, mm5
- punpckhdq mm6, mm5
-
- paddd mm4, mm6
- movq mm5, mm2
-
- pmaddwd mm5, [rdx+8]
- movq mm6, mm3
-
- pmaddwd mm6, [rdx+8]
- movq mm7, mm5
-
- punpckldq mm5, mm6
- punpckhdq mm7, mm6
-
- paddd mm5, mm7
- movq mm6, [dct1st_stage_rounding_mmx GLOBAL]
-
- paddd mm4, mm6
- paddd mm5, mm6
-
- psrad mm4, _1STSTAGESHIFT
- psrad mm5, _1STSTAGESHIFT
-
- packssdw mm4, mm5
- movq [rdi+8], mm4
-
-
- ;third column
- movq mm4, mm0
-
- pmaddwd mm4, [rdx+16]
- movq mm5, mm1
-
- pmaddwd mm5, [rdx+16]
- movq mm6, mm4
-
- punpckldq mm4, mm5
- punpckhdq mm6, mm5
-
- paddd mm4, mm6
- movq mm5, mm2
-
- pmaddwd mm5, [rdx+16]
- movq mm6, mm3
-
- pmaddwd mm6, [rdx+16]
- movq mm7, mm5
-
- punpckldq mm5, mm6
- punpckhdq mm7, mm6
-
- paddd mm5, mm7
- movq mm6, [dct1st_stage_rounding_mmx GLOBAL]
-
- paddd mm4, mm6
- paddd mm5, mm6
-
- psrad mm4, _1STSTAGESHIFT
- psrad mm5, _1STSTAGESHIFT
-
- packssdw mm4, mm5
- movq [rdi+16], mm4
-
- ;fourth column (this is the last column, so we do not have save the source any more)
-
- pmaddwd mm0, [rdx+24]
-
- pmaddwd mm1, [rdx+24]
- movq mm6, mm0
-
- punpckldq mm0, mm1
- punpckhdq mm6, mm1
-
- paddd mm0, mm6
-
- pmaddwd mm2, [rdx+24]
-
- pmaddwd mm3, [rdx+24]
- movq mm7, mm2
-
- punpckldq mm2, mm3
- punpckhdq mm7, mm3
-
- paddd mm2, mm7
- movq mm6, [dct1st_stage_rounding_mmx GLOBAL]
-
- paddd mm0, mm6
- paddd mm2, mm6
-
- psrad mm0, _1STSTAGESHIFT
- psrad mm2, _1STSTAGESHIFT
-
- packssdw mm0, mm2
-
- movq mm3, mm0
-
- ; done with one pass
- ; now start second pass
- movq mm0, [rdi ]
- movq mm1, [rdi+ 8]
- movq mm2, [rdi+ 16]
-
- movq mm4, mm0
-
- pmaddwd mm4, [rdx]
- movq mm5, mm1
-
- pmaddwd mm5, [rdx]
- movq mm6, mm4
-
- punpckldq mm4, mm5
- punpckhdq mm6, mm5
-
- paddd mm4, mm6
- movq mm5, mm2
-
- pmaddwd mm5, [rdx]
- movq mm6, mm3
-
- pmaddwd mm6, [rdx]
- movq mm7, mm5
-
- punpckldq mm5, mm6
- punpckhdq mm7, mm6
-
- paddd mm5, mm7
- movq mm6, [dct2nd_stage_rounding_mmx GLOBAL]
-
- paddd mm4, mm6
- paddd mm5, mm6
-
- psrad mm4, _2NDSTAGESHIFT
- psrad mm5, _2NDSTAGESHIFT
-
- packssdw mm4, mm5
- movq [rdi], mm4
-
- ;second column
- movq mm4, mm0
-
- pmaddwd mm4, [rdx+8]
- movq mm5, mm1
-
- pmaddwd mm5, [rdx+8]
- movq mm6, mm4
-
- punpckldq mm4, mm5
- punpckhdq mm6, mm5
-
- paddd mm4, mm6
- movq mm5, mm2
-
- pmaddwd mm5, [rdx+8]
- movq mm6, mm3
-
- pmaddwd mm6, [rdx+8]
- movq mm7, mm5
-
- punpckldq mm5, mm6
- punpckhdq mm7, mm6
-
- paddd mm5, mm7
- movq mm6, [dct2nd_stage_rounding_mmx GLOBAL]
-
- paddd mm4, mm6
- paddd mm5, mm6
-
- psrad mm4, _2NDSTAGESHIFT
- psrad mm5, _2NDSTAGESHIFT
-
- packssdw mm4, mm5
- movq [rdi+8], mm4
-
-
- ;third column
- movq mm4, mm0
-
- pmaddwd mm4, [rdx+16]
- movq mm5, mm1
-
- pmaddwd mm5, [rdx+16]
- movq mm6, mm4
-
- punpckldq mm4, mm5
- punpckhdq mm6, mm5
-
- paddd mm4, mm6
- movq mm5, mm2
-
- pmaddwd mm5, [rdx+16]
- movq mm6, mm3
-
- pmaddwd mm6, [rdx+16]
- movq mm7, mm5
-
- punpckldq mm5, mm6
- punpckhdq mm7, mm6
-
- paddd mm5, mm7
- movq mm6, [dct2nd_stage_rounding_mmx GLOBAL]
-
- paddd mm4, mm6
- paddd mm5, mm6
-
- psrad mm4, _2NDSTAGESHIFT
- psrad mm5, _2NDSTAGESHIFT
-
- packssdw mm4, mm5
- movq [rdi+16], mm4
-
- ;fourth column
- movq mm4, mm0
-
- pmaddwd mm4, [rdx+24]
- movq mm5, mm1
-
- pmaddwd mm5, [rdx+24]
- movq mm6, mm4
-
- punpckldq mm4, mm5
- punpckhdq mm6, mm5
-
- paddd mm4, mm6
- movq mm5, mm2
-
- pmaddwd mm5, [rdx+24]
- movq mm6, mm3
-
- pmaddwd mm6, [rdx+24]
- movq mm7, mm5
-
- punpckldq mm5, mm6
- punpckhdq mm7, mm6
-
- paddd mm5, mm7
- movq mm6, [dct2nd_stage_rounding_mmx GLOBAL]
-
- paddd mm4, mm6
- paddd mm5, mm6
-
- psrad mm4, _2NDSTAGESHIFT
- psrad mm5, _2NDSTAGESHIFT
-
- packssdw mm4, mm5
- movq [rdi+24], mm4
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp8_fast_fdct4x4_mmx(short *input, short *output, int pitch)
-sym(vp8_fast_fdct4x4_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 3
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
mov rsi, arg(0) ;input
mov rdi, arg(1) ;output
@@ -379,11 +47,11 @@ sym(vp8_fast_fdct4x4_mmx):
movq mm3, [rcx + rax]
; get the constants
;shift to left by 1 for prescision
- paddw mm0, mm0
- paddw mm1, mm1
+ psllw mm0, 3
+ psllw mm1, 3
- psllw mm2, 1
- psllw mm3, 1
+ psllw mm2, 3
+ psllw mm3, 3
; transpose for the second stage
movq mm4, mm0 ; 00 01 02 03
@@ -531,20 +199,23 @@ sym(vp8_fast_fdct4x4_mmx):
movq mm3, mm5
; done with vertical
- pcmpeqw mm4, mm4
- pcmpeqw mm5, mm5
- psrlw mm4, 15
- psrlw mm5, 15
+ pcmpeqw mm4, mm4
+ pcmpeqw mm5, mm5
+ psrlw mm4, 15
+ psrlw mm5, 15
+
+ psllw mm4, 2
+ psllw mm5, 2
paddw mm0, mm4
paddw mm1, mm5
paddw mm2, mm4
paddw mm3, mm5
- psraw mm0, 1
- psraw mm1, 1
- psraw mm2, 1
- psraw mm3, 1
+ psraw mm0, 3
+ psraw mm1, 3
+ psraw mm2, 3
+ psraw mm3, 3
movq [rdi ], mm0
movq [rdi+ 8], mm1
@@ -560,8 +231,8 @@ sym(vp8_fast_fdct4x4_mmx):
ret
-;void vp8_fast_fdct8x4_wmt(short *input, short *output, int pitch)
-sym(vp8_fast_fdct8x4_wmt):
+;void vp8_short_fdct8x4_wmt(short *input, short *output, int pitch)
+sym(vp8_short_fdct8x4_wmt):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 3
@@ -584,11 +255,11 @@ sym(vp8_fast_fdct8x4_wmt):
movdqa xmm3, [rcx + rax]
; get the constants
;shift to left by 1 for prescision
- psllw xmm0, 1
- psllw xmm2, 1
+ psllw xmm0, 3
+ psllw xmm2, 3
- psllw xmm4, 1
- psllw xmm3, 1
+ psllw xmm4, 3
+ psllw xmm3, 3
; transpose for the second stage
movdqa xmm1, xmm0 ; 00 01 02 03 04 05 06 07
@@ -758,20 +429,23 @@ sym(vp8_fast_fdct8x4_wmt):
; done with vertical
- pcmpeqw xmm4, xmm4
- pcmpeqw xmm5, xmm5;
- psrlw xmm4, 15
- psrlw xmm5, 15
+ pcmpeqw xmm4, xmm4
+ pcmpeqw xmm5, xmm5;
+ psrlw xmm4, 15
+ psrlw xmm5, 15
+
+ psllw xmm4, 2
+ psllw xmm5, 2
paddw xmm0, xmm4
paddw xmm1, xmm5
paddw xmm2, xmm4
paddw xmm3, xmm5
- psraw xmm0, 1
- psraw xmm1, 1
- psraw xmm2, 1
- psraw xmm3, 1
+ psraw xmm0, 3
+ psraw xmm1, 3
+ psraw xmm2, 3
+ psraw xmm3, 3
movq QWORD PTR[rdi ], xmm0
movq QWORD PTR[rdi+ 8], xmm1
diff --git a/vp8/encoder/x86/dct_x86.h b/vp8/encoder/x86/dct_x86.h
index 05d018043..ada16d34f 100644
--- a/vp8/encoder/x86/dct_x86.h
+++ b/vp8/encoder/x86/dct_x86.h
@@ -22,31 +22,22 @@
#if HAVE_MMX
extern prototype_fdct(vp8_short_fdct4x4_mmx);
extern prototype_fdct(vp8_short_fdct8x4_mmx);
-extern prototype_fdct(vp8_fast_fdct4x4_mmx);
-extern prototype_fdct(vp8_fast_fdct8x4_mmx);
#if !CONFIG_RUNTIME_CPU_DETECT
+#if 0 new c version,
#undef vp8_fdct_short4x4
#define vp8_fdct_short4x4 vp8_short_fdct4x4_mmx
#undef vp8_fdct_short8x4
#define vp8_fdct_short8x4 vp8_short_fdct8x4_mmx
-
-#undef vp8_fdct_fast4x4
-#define vp8_fdct_fast4x4 vp8_fast_fdct4x4_mmx
-
-#undef vp8_fdct_fast8x4
-#define vp8_fdct_fast8x4 vp8_fast_fdct8x4_mmx
+#endif
#endif
#endif
#if HAVE_SSE2
-extern prototype_fdct(vp8_short_fdct4x4_wmt);
extern prototype_fdct(vp8_short_fdct8x4_wmt);
-extern prototype_fdct(vp8_fast_fdct8x4_wmt);
-
extern prototype_fdct(vp8_short_walsh4x4_sse2);
#if !CONFIG_RUNTIME_CPU_DETECT
diff --git a/vp8/encoder/x86/x86_csystemdependent.c b/vp8/encoder/x86/x86_csystemdependent.c
index f3750455b..0fb82e60e 100644
--- a/vp8/encoder/x86/x86_csystemdependent.c
+++ b/vp8/encoder/x86/x86_csystemdependent.c
@@ -18,15 +18,10 @@
#if HAVE_MMX
void vp8_short_fdct8x4_mmx(short *input, short *output, int pitch)
{
- vp8_short_fdct4x4_mmx(input, output, pitch);
- vp8_short_fdct4x4_mmx(input + 4, output + 16, pitch);
+ vp8_short_fdct4x4_c(input, output, pitch);
+ vp8_short_fdct4x4_c(input + 4, output + 16, pitch);
}
-void vp8_fast_fdct8x4_mmx(short *input, short *output, int pitch)
-{
- vp8_fast_fdct4x4_mmx(input, output , pitch);
- vp8_fast_fdct4x4_mmx(input + 4, output + 16, pitch);
-}
int vp8_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr,
short *qcoeff_ptr, short *dequant_ptr,
@@ -87,11 +82,6 @@ void vp8_subtract_b_mmx(BLOCK *be, BLOCKD *bd, int pitch)
#endif
#if HAVE_SSE2
-void vp8_short_fdct8x4_wmt(short *input, short *output, int pitch)
-{
- vp8_short_fdct4x4_wmt(input, output, pitch);
- vp8_short_fdct4x4_wmt(input + 4, output + 16, pitch);
-}
int vp8_fast_quantize_b_impl_sse(short *coeff_ptr, short *zbin_ptr,
short *qcoeff_ptr, short *dequant_ptr,
@@ -221,11 +211,19 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
cpi->rtcd.variance.get8x8var = vp8_get8x8var_mmx;
cpi->rtcd.variance.get16x16var = vp8_get16x16var_mmx;
cpi->rtcd.variance.get4x4sse_cs = vp8_get4x4sse_cs_mmx;
-
+#if 0 // new fdct
cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_mmx;
cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_mmx;
- cpi->rtcd.fdct.fast4x4 = vp8_fast_fdct4x4_mmx;
- cpi->rtcd.fdct.fast8x4 = vp8_fast_fdct8x4_mmx;
+ cpi->rtcd.fdct.fast4x4 = vp8_short_fdct4x4_mmx;
+ cpi->rtcd.fdct.fast8x4 = vp8_short_fdct8x4_mmx;
+#else
+ cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_c;
+ cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_c;
+ cpi->rtcd.fdct.fast4x4 = vp8_short_fdct4x4_c;
+ cpi->rtcd.fdct.fast8x4 = vp8_short_fdct8x4_c;
+
+#endif
+
cpi->rtcd.fdct.walsh_short4x4 = vp8_short_walsh4x4_c;
cpi->rtcd.encodemb.berr = vp8_block_error_mmx;
@@ -270,13 +268,13 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
cpi->rtcd.variance.get16x16var = vp8_get16x16var_sse2;
/* cpi->rtcd.variance.get4x4sse_cs not implemented for wmt */;
-#if 0
+#if 0 //new fdct
/* short SSE2 DCT currently disabled, does not match the MMX version */
cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_wmt;
cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_wmt;
-#endif
/* cpi->rtcd.fdct.fast4x4 not implemented for wmt */;
- cpi->rtcd.fdct.fast8x4 = vp8_fast_fdct8x4_wmt;
+ cpi->rtcd.fdct.fast8x4 = vp8_short_fdct8x4_wmt;
+#endif
cpi->rtcd.fdct.walsh_short4x4 = vp8_short_walsh4x4_sse2;
cpi->rtcd.encodemb.berr = vp8_block_error_xmm;
diff --git a/vp8/vp8cx.mk b/vp8/vp8cx.mk
index f09f25852..f86a0b2aa 100644
--- a/vp8/vp8cx.mk
+++ b/vp8/vp8cx.mk
@@ -96,7 +96,6 @@ VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/subtract_mmx.asm
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/variance_sse2.c
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/variance_impl_sse2.asm
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/sad_sse2.asm
-VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.asm
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/fwalsh_sse2.asm
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2.asm
VP8_CX_SRCS-$(HAVE_SSE3) += encoder/x86/sad_sse3.asm