summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNemanja Lukic <nemanja.lukic@rt-rk.com>2012-10-14 11:58:51 +0200
committerSøren Sandmann Pedersen <ssp@redhat.com>2012-10-25 10:04:30 -0400
commitca83717c63813b6f53f89dd94b5771bd32382a18 (patch)
treec020bebfb946112086076f7b7146fc8ec4e90eeb
parent52d20e692ebc605077448ab6f52fd257f83481b2 (diff)
downloadpixman-ca83717c63813b6f53f89dd94b5771bd32382a18.tar.gz
MIPS: DSPr2: Added more fast-paths for ADD operation: - add_0565_8_0565 - add_8888_8_8888 - add_8888_n_8888
Performance numbers before/after on MIPS-74kc @ 1GHz: lowlevel-blt-bench results Referent (before): add_0565_8_0565 = L1: 8.89 L2: 8.37 M: 7.35 ( 29.22%) HT: 5.90 VT: 5.85 R: 5.67 RT: 3.31 ( 26Kops/s) add_8888_8_8888 = L1: 17.22 L2: 14.17 M: 9.89 ( 65.56%) HT: 7.57 VT: 7.50 R: 7.36 RT: 4.10 ( 30Kops/s) add_8888_n_8888 = L1: 17.79 L2: 14.87 M: 10.35 ( 54.89%) HT: 5.19 VT: 4.93 R: 4.92 RT: 1.90 ( 19Kops/s) Optimized: add_0565_8_0565 = L1: 21.72 L2: 20.01 M: 14.96 ( 59.54%) HT: 12.03 VT: 11.81 R: 11.26 RT: 6.33 ( 37Kops/s) add_8888_8_8888 = L1: 47.42 L2: 38.64 M: 15.90 (105.48%) HT: 13.34 VT: 13.03 R: 11.84 RT: 6.63 ( 38Kops/s) add_8888_n_8888 = L1: 54.83 L2: 42.66 M: 17.36 ( 92.11%) HT: 15.20 VT: 14.82 R: 13.66 RT: 7.83 ( 41Kops/s)
-rw-r--r--pixman/pixman-mips-dspr2-asm.S170
-rw-r--r--pixman/pixman-mips-dspr2.c12
2 files changed, 182 insertions, 0 deletions
diff --git a/pixman/pixman-mips-dspr2-asm.S b/pixman/pixman-mips-dspr2-asm.S
index 614c628..7c8ca30 100644
--- a/pixman/pixman-mips-dspr2-asm.S
+++ b/pixman/pixman-mips-dspr2-asm.S
@@ -1461,6 +1461,176 @@ LEAF_MIPS_DSPR2(pixman_composite_add_n_8_8888_asm_mips)
END(pixman_composite_add_n_8_8888_asm_mips)
+LEAF_MIPS_DSPR2(pixman_composite_add_0565_8_0565_asm_mips)
+/*
+ * a0 - dst (r5g6b5)
+ * a1 - src (r5g6b5)
+ * a2 - mask (a8)
+ * a3 - w
+ */
+
+ SAVE_REGS_ON_STACK 20, s0, s1, s2, s3, s4, s5, s6, s7
+ li t4, 0xf800f800
+ li t5, 0x07e007e0
+ li t6, 0x001F001F
+ li t7, 0x00ff00ff
+ beqz a3, 3f
+ nop
+ addiu t1, a3, -1
+ beqz t1, 2f
+ nop
+1:
+ lhu t0, 0(a1) /* t0 = source (r5g6b5) */
+ lhu t1, 2(a1) /* t1 = source (r5g6b5) */
+ lbu t2, 0(a2) /* t2 = mask (a8) */
+ lbu t3, 1(a2) /* t3 = mask (a8) */
+ lhu t8, 0(a0) /* t8 = destination (r5g6b5) */
+ lhu t9, 2(a0) /* t9 = destination (r5g6b5) */
+ addiu a1, a1, 4
+ addiu a2, a2, 2
+
+ CONVERT_2x0565_TO_2x8888 t0, t1, s0, s1, t5, t6, s2, s3, s4, s5
+ CONVERT_2x0565_TO_2x8888 t8, t9, s2, s3, t5, t6, s4, s5, s6, s7
+ MIPS_2xUN8x4_MUL_2xUN8_ADD_2xUN8x4 s0, s1, \
+ t2, t3, \
+ s2, s3, \
+ t0, t1, \
+ t7, s4, s5, s6, s7, t8, t9
+ CONVERT_2x8888_TO_2x0565 t0, t1, s0, s1, t4, t5, t6, s2, s3
+
+ sh s0, 0(a0)
+ sh s1, 2(a0)
+ addiu a3, a3, -2
+ addiu t1, a3, -1
+ bgtz t1, 1b
+ addiu a0, a0, 4
+2:
+ beqz a3, 3f
+ nop
+ lhu t0, 0(a1) /* t0 = source (r5g6b5) */
+ lbu t1, 0(a2) /* t1 = mask (a8) */
+ lhu t2, 0(a0) /* t2 = destination (r5g6b5) */
+
+ CONVERT_1x0565_TO_1x8888 t0, t3, t4, t5
+ CONVERT_1x0565_TO_1x8888 t2, t4, t5, t6
+ MIPS_UN8x4_MUL_UN8_ADD_UN8x4 t3, t1, t4, t0, t7, t2, t5, t6
+ CONVERT_1x8888_TO_1x0565 t0, t3, t4, t5
+
+ sh t3, 0(a0)
+3:
+ RESTORE_REGS_FROM_STACK 20, s0, s1, s2, s3, s4, s5, s6, s7
+ j ra
+ nop
+
+END(pixman_composite_add_0565_8_0565_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_composite_add_8888_8_8888_asm_mips)
+/*
+ * a0 - dst (a8r8g8b8)
+ * a1 - src (a8r8g8b8)
+ * a2 - mask (a8)
+ * a3 - w
+ */
+
+ SAVE_REGS_ON_STACK 0, s0, s1, s2
+ li t4, 0x00ff00ff
+ beqz a3, 3f
+ nop
+ addiu t1, a3, -1
+ beqz t1, 2f
+ nop
+1:
+ lw t0, 0(a1) /* t0 = source (a8r8g8b8) */
+ lw t1, 4(a1) /* t1 = source (a8r8g8b8) */
+ lbu t2, 0(a2) /* t2 = mask (a8) */
+ lbu t3, 1(a2) /* t3 = mask (a8) */
+ lw t5, 0(a0) /* t5 = destination (a8r8g8b8) */
+ lw t6, 4(a0) /* t6 = destination (a8r8g8b8) */
+ addiu a1, a1, 8
+ addiu a2, a2, 2
+
+ MIPS_2xUN8x4_MUL_2xUN8_ADD_2xUN8x4 t0, t1, \
+ t2, t3, \
+ t5, t6, \
+ t7, t8, \
+ t4, t9, s0, s1, s2, t0, t1
+
+ sw t7, 0(a0)
+ sw t8, 4(a0)
+ addiu a3, a3, -2
+ addiu t1, a3, -1
+ bgtz t1, 1b
+ addiu a0, a0, 8
+2:
+ beqz a3, 3f
+ nop
+ lw t0, 0(a1) /* t0 = source (a8r8g8b8) */
+ lbu t1, 0(a2) /* t1 = mask (a8) */
+ lw t2, 0(a0) /* t2 = destination (a8r8g8b8) */
+
+ MIPS_UN8x4_MUL_UN8_ADD_UN8x4 t0, t1, t2, t3, t4, t5, t6, t7
+
+ sw t3, 0(a0)
+3:
+ RESTORE_REGS_FROM_STACK 0, s0, s1, s2
+ j ra
+ nop
+
+END(pixman_composite_add_8888_8_8888_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_composite_add_8888_n_8888_asm_mips)
+/*
+ * a0 - dst (a8r8g8b8)
+ * a1 - src (a8r8g8b8)
+ * a2 - mask (32bit constant)
+ * a3 - w
+ */
+
+ SAVE_REGS_ON_STACK 0, s0, s1, s2
+ li t4, 0x00ff00ff
+ beqz a3, 3f
+ nop
+ srl a2, a2, 24
+ addiu t1, a3, -1
+ beqz t1, 2f
+ nop
+1:
+ lw t0, 0(a1) /* t0 = source (a8r8g8b8) */
+ lw t1, 4(a1) /* t1 = source (a8r8g8b8) */
+ /* a2 = mask (32bit constant) */
+ lw t2, 0(a0) /* t2 = destination (a8r8g8b8) */
+ lw t3, 4(a0) /* t3 = destination (a8r8g8b8) */
+ addiu a1, a1, 8
+
+ MIPS_2xUN8x4_MUL_2xUN8_ADD_2xUN8x4 t0, t1, \
+ a2, a2, \
+ t2, t3, \
+ t5, t6, \
+ t4, t7, t8, t9, s0, s1, s2
+
+ sw t5, 0(a0)
+ sw t6, 4(a0)
+ addiu a3, a3, -2
+ addiu t1, a3, -1
+ bgtz t1, 1b
+ addiu a0, a0, 8
+2:
+ beqz a3, 3f
+ nop
+ lw t0, 0(a1) /* t0 = source (a8r8g8b8) */
+ /* a2 = mask (32bit constant) */
+ lw t1, 0(a0) /* t1 = destination (a8r8g8b8) */
+
+ MIPS_UN8x4_MUL_UN8_ADD_UN8x4 t0, a2, t1, t3, t4, t5, t6, t7
+
+ sw t3, 0(a0)
+3:
+ RESTORE_REGS_FROM_STACK 0, s0, s1, s2
+ j ra
+ nop
+
+END(pixman_composite_add_8888_n_8888_asm_mips)
+
LEAF_MIPS_DSPR2(pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_mips)
/*
* a0 - *dst
diff --git a/pixman/pixman-mips-dspr2.c b/pixman/pixman-mips-dspr2.c
index 30d2a85..1471750 100644
--- a/pixman/pixman-mips-dspr2.c
+++ b/pixman/pixman-mips-dspr2.c
@@ -70,9 +70,15 @@ PIXMAN_MIPS_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, over_8888_n_0565,
uint32_t, 1, uint16_t, 1)
PIXMAN_MIPS_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, over_0565_n_0565,
uint16_t, 1, uint16_t, 1)
+PIXMAN_MIPS_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, add_8888_n_8888,
+ uint32_t, 1, uint32_t, 1)
PIXMAN_MIPS_BIND_FAST_PATH_SRC_MASK_DST (add_8_8_8, uint8_t, 1,
uint8_t, 1, uint8_t, 1)
+PIXMAN_MIPS_BIND_FAST_PATH_SRC_MASK_DST (add_8888_8_8888, uint32_t, 1,
+ uint8_t, 1, uint32_t, 1)
+PIXMAN_MIPS_BIND_FAST_PATH_SRC_MASK_DST (add_0565_8_0565, uint16_t, 1,
+ uint8_t, 1, uint16_t, 1)
PIXMAN_MIPS_BIND_FAST_PATH_SRC_MASK_DST (over_8888_8_8888, uint32_t, 1,
uint8_t, 1, uint32_t, 1)
PIXMAN_MIPS_BIND_FAST_PATH_SRC_MASK_DST (over_8888_8_0565, uint32_t, 1,
@@ -281,6 +287,12 @@ static const pixman_fast_path_t mips_dspr2_fast_paths[] =
PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8r8g8b8, mips_composite_add_n_8_8888),
PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8b8g8r8, mips_composite_add_n_8_8888),
PIXMAN_STD_FAST_PATH (ADD, a8, a8, a8, mips_composite_add_8_8_8),
+ PIXMAN_STD_FAST_PATH (ADD, r5g6b5, a8, r5g6b5, mips_composite_add_0565_8_0565),
+ PIXMAN_STD_FAST_PATH (ADD, b5g6r5, a8, b5g6r5, mips_composite_add_0565_8_0565),
+ PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, a8, a8r8g8b8, mips_composite_add_8888_8_8888),
+ PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, a8, a8b8g8r8, mips_composite_add_8888_8_8888),
+ PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, solid, a8r8g8b8, mips_composite_add_8888_n_8888),
+ PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, solid, a8b8g8r8, mips_composite_add_8888_n_8888),
SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, mips_8888_8888),
SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, mips_8888_8888),