diff options
author | bernds <bernds@138bc75d-0d04-0410-961f-82ee72b054a4> | 2002-05-04 17:06:56 +0000 |
---|---|---|
committer | bernds <bernds@138bc75d-0d04-0410-961f-82ee72b054a4> | 2002-05-04 17:06:56 +0000 |
commit | d3ceaee1b851570b269f8533b5690726512c1dfc (patch) | |
tree | 6b703160d494e8e275d928ede8cc050c11f41a6f /gcc/config/i386 | |
parent | bc70bd5ef2821bfa2f4bf0507adc7349acf52bd4 (diff) | |
download | gcc-d3ceaee1b851570b269f8533b5690726512c1dfc.tar.gz |
Fix bugs in SSE2 support and add SSE2 functions to xmmintrin.h
git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@53161 138bc75d-0d04-0410-961f-82ee72b054a4
Diffstat (limited to 'gcc/config/i386')
-rw-r--r-- | gcc/config/i386/i386.c | 117 | ||||
-rw-r--r-- | gcc/config/i386/i386.md | 166 | ||||
-rw-r--r-- | gcc/config/i386/xmmintrin.h | 923 |
3 files changed, 1160 insertions, 46 deletions
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index b9ca483e9c9..bc4cf7b589b 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -11179,10 +11179,10 @@ static const struct builtin_description bdesc_2arg[] = { MASK_SSE2, CODE_FOR_sse2_umulsidi3, "__builtin_ia32_pmuludq", IX86_BUILTIN_PMULUDQ, 0, 0 }, { MASK_SSE2, CODE_FOR_sse2_umulv2siv2di3, "__builtin_ia32_pmuludq128", IX86_BUILTIN_PMULUDQ128, 0, 0 }, - { MASK_SSE2, CODE_FOR_sse2_andti3, "__builtin_ia32_pand128", IX86_BUILTIN_PAND128, 0, 0 }, - { MASK_SSE2, CODE_FOR_sse2_nandti3, "__builtin_ia32_pandn128", IX86_BUILTIN_PANDN128, 0, 0 }, - { MASK_SSE2, CODE_FOR_sse2_iorti3, "__builtin_ia32_por128", IX86_BUILTIN_POR128, 0, 0 }, - { MASK_SSE2, CODE_FOR_sse2_xorti3, "__builtin_ia32_pxor128", IX86_BUILTIN_PXOR128, 0, 0 }, + { MASK_SSE2, CODE_FOR_sse2_andv2di3, "__builtin_ia32_pand128", IX86_BUILTIN_PAND128, 0, 0 }, + { MASK_SSE2, CODE_FOR_sse2_nandv2di3, "__builtin_ia32_pandn128", IX86_BUILTIN_PANDN128, 0, 0 }, + { MASK_SSE2, CODE_FOR_sse2_iorv2di3, "__builtin_ia32_por128", IX86_BUILTIN_POR128, 0, 0 }, + { MASK_SSE2, CODE_FOR_sse2_xorv2di3, "__builtin_ia32_pxor128", IX86_BUILTIN_PXOR128, 0, 0 }, { MASK_SSE2, CODE_FOR_sse2_uavgv16qi3, "__builtin_ia32_pavgb128", IX86_BUILTIN_PAVGB128, 0, 0 }, { MASK_SSE2, CODE_FOR_sse2_uavgv8hi3, "__builtin_ia32_pavgw128", IX86_BUILTIN_PAVGW128, 0, 0 }, @@ -11206,6 +11206,34 @@ static const struct builtin_description bdesc_2arg[] = { MASK_SSE2, CODE_FOR_sse2_punpcklwd, "__builtin_ia32_punpcklwd128", IX86_BUILTIN_PUNPCKLWD128, 0, 0 }, { MASK_SSE2, CODE_FOR_sse2_punpckldq, "__builtin_ia32_punpckldq128", IX86_BUILTIN_PUNPCKLDQ128, 0, 0 }, + { MASK_SSE2, CODE_FOR_sse2_packsswb, "__builtin_ia32_packsswb128", IX86_BUILTIN_PACKSSWB128, 0, 0 }, + { MASK_SSE2, CODE_FOR_sse2_packssdw, "__builtin_ia32_packssdw128", IX86_BUILTIN_PACKSSDW128, 0, 0 }, + { MASK_SSE2, CODE_FOR_sse2_packuswb, "__builtin_ia32_packuswb128", IX86_BUILTIN_PACKUSWB128, 0, 0 }, + + { MASK_SSE2, 
CODE_FOR_umulv8hi3_highpart, "__builtin_ia32_pmulhuw128", IX86_BUILTIN_PMULHUW128, 0, 0 }, + { MASK_SSE2, CODE_FOR_sse2_psadbw, 0, IX86_BUILTIN_PSADBW128, 0, 0 }, + + { MASK_SSE2, CODE_FOR_ashlv8hi3_ti, 0, IX86_BUILTIN_PSLLW128, 0, 0 }, + { MASK_SSE2, CODE_FOR_ashlv8hi3, 0, IX86_BUILTIN_PSLLWI128, 0, 0 }, + { MASK_SSE2, CODE_FOR_ashlv4si3_ti, 0, IX86_BUILTIN_PSLLD128, 0, 0 }, + { MASK_SSE2, CODE_FOR_ashlv4si3, 0, IX86_BUILTIN_PSLLDI128, 0, 0 }, + { MASK_SSE2, CODE_FOR_ashlv2di3_ti, 0, IX86_BUILTIN_PSLLQ128, 0, 0 }, + { MASK_SSE2, CODE_FOR_ashlv2di3, 0, IX86_BUILTIN_PSLLQI128, 0, 0 }, + + { MASK_SSE2, CODE_FOR_lshrv8hi3_ti, 0, IX86_BUILTIN_PSRLW128, 0, 0 }, + { MASK_SSE2, CODE_FOR_lshrv8hi3, 0, IX86_BUILTIN_PSRLWI128, 0, 0 }, + { MASK_SSE2, CODE_FOR_lshrv4si3_ti, 0, IX86_BUILTIN_PSRLD128, 0, 0 }, + { MASK_SSE2, CODE_FOR_lshrv4si3, 0, IX86_BUILTIN_PSRLDI128, 0, 0 }, + { MASK_SSE2, CODE_FOR_lshrv2di3_ti, 0, IX86_BUILTIN_PSRLQ128, 0, 0 }, + { MASK_SSE2, CODE_FOR_lshrv2di3, 0, IX86_BUILTIN_PSRLQI128, 0, 0 }, + + { MASK_SSE2, CODE_FOR_ashrv8hi3_ti, 0, IX86_BUILTIN_PSRAW128, 0, 0 }, + { MASK_SSE2, CODE_FOR_ashrv8hi3, 0, IX86_BUILTIN_PSRAWI128, 0, 0 }, + { MASK_SSE2, CODE_FOR_ashrv4si3_ti, 0, IX86_BUILTIN_PSRAD128, 0, 0 }, + { MASK_SSE2, CODE_FOR_ashrv4si3, 0, IX86_BUILTIN_PSRADI128, 0, 0 }, + + { MASK_SSE2, CODE_FOR_sse2_pmaddwd, 0, IX86_BUILTIN_PMADDWD128, 0, 0 }, + { MASK_SSE2, CODE_FOR_cvtsi2sd, 0, IX86_BUILTIN_CVTSI2SD, 0, 0 }, { MASK_SSE2, CODE_FOR_cvtsd2ss, 0, IX86_BUILTIN_CVTSD2SS, 0, 0 }, { MASK_SSE2, CODE_FOR_cvtss2sd, 0, IX86_BUILTIN_CVTSS2SD, 0, 0 } @@ -11270,6 +11298,7 @@ ix86_init_mmx_sse_builtins () tree pchar_type_node = build_pointer_type (char_type_node); tree pfloat_type_node = build_pointer_type (float_type_node); tree pv2si_type_node = build_pointer_type (V2SI_type_node); + tree pv2di_type_node = build_pointer_type (V2DI_type_node); tree pdi_type_node = build_pointer_type (long_long_unsigned_type_node); /* Comparisons. 
*/ @@ -11334,11 +11363,6 @@ ix86_init_mmx_sse_builtins () tree_cons (NULL_TREE, integer_type_node, endlink)))); - tree v4hi_ftype_v8qi_v8qi - = build_function_type (V4HI_type_node, - tree_cons (NULL_TREE, V8QI_type_node, - tree_cons (NULL_TREE, V8QI_type_node, - endlink))); tree v2si_ftype_v4hi_v4hi = build_function_type (V2SI_type_node, tree_cons (NULL_TREE, V4HI_type_node, @@ -11411,6 +11435,12 @@ ix86_init_mmx_sse_builtins () tree_cons (NULL_TREE, long_long_unsigned_type_node, endlink))); + tree void_ftype_pv2di_v2di + = build_function_type (void_type_node, + tree_cons (NULL_TREE, pv2di_type_node, + tree_cons (NULL_TREE, + V2DI_type_node, + endlink))); /* Normal vector unops. */ tree v4sf_ftype_v4sf = build_function_type (V4SF_type_node, @@ -11629,6 +11659,11 @@ ix86_init_mmx_sse_builtins () tree_cons (NULL_TREE, integer_type_node, endlink)))); + tree v2di_ftype_v2di_int + = build_function_type (V2DI_type_node, + tree_cons (NULL_TREE, V2DI_type_node, + tree_cons (NULL_TREE, integer_type_node, + endlink))); tree v4si_ftype_v4si_int = build_function_type (V4SI_type_node, tree_cons (NULL_TREE, V4SI_type_node, @@ -11639,6 +11674,34 @@ ix86_init_mmx_sse_builtins () tree_cons (NULL_TREE, V8HI_type_node, tree_cons (NULL_TREE, integer_type_node, endlink))); + tree v8hi_ftype_v8hi_v2di + = build_function_type (V8HI_type_node, + tree_cons (NULL_TREE, V8HI_type_node, + tree_cons (NULL_TREE, V2DI_type_node, + endlink))); + tree v4si_ftype_v4si_v2di + = build_function_type (V4SI_type_node, + tree_cons (NULL_TREE, V4SI_type_node, + tree_cons (NULL_TREE, V2DI_type_node, + endlink))); + tree v4si_ftype_v8hi_v8hi + = build_function_type (V4SI_type_node, + tree_cons (NULL_TREE, V8HI_type_node, + tree_cons (NULL_TREE, V8HI_type_node, + endlink))); + tree di_ftype_v8qi_v8qi + = build_function_type (long_long_unsigned_type_node, + tree_cons (NULL_TREE, V8QI_type_node, + tree_cons (NULL_TREE, V8QI_type_node, + endlink))); + tree v2di_ftype_v16qi_v16qi + = build_function_type 
(V2DI_type_node, + tree_cons (NULL_TREE, V16QI_type_node, + tree_cons (NULL_TREE, V16QI_type_node, + endlink))); + tree int_ftype_v16qi + = build_function_type (integer_type_node, + tree_cons (NULL_TREE, V16QI_type_node, endlink)); /* Add all builtins that are more or less simple operations on two operands. */ @@ -11775,7 +11838,7 @@ ix86_init_mmx_sse_builtins () def_builtin (MASK_SSE1 | MASK_3DNOW_A, "__builtin_ia32_sfence", void_ftype_void, IX86_BUILTIN_SFENCE); - def_builtin (MASK_SSE1 | MASK_3DNOW_A, "__builtin_ia32_psadbw", v4hi_ftype_v8qi_v8qi, IX86_BUILTIN_PSADBW); + def_builtin (MASK_SSE1 | MASK_3DNOW_A, "__builtin_ia32_psadbw", di_ftype_v8qi_v8qi, IX86_BUILTIN_PSADBW); def_builtin (MASK_SSE1, "__builtin_ia32_rcpps", v4sf_ftype_v4sf, IX86_BUILTIN_RCPPS); def_builtin (MASK_SSE1, "__builtin_ia32_rcpss", v4sf_ftype_v4sf, IX86_BUILTIN_RCPSS); @@ -11838,15 +11901,15 @@ ix86_init_mmx_sse_builtins () def_builtin (MASK_SSE2, "__builtin_ia32_storelpd", void_ftype_pv2si_v2df, IX86_BUILTIN_STORELPD); def_builtin (MASK_SSE2, "__builtin_ia32_movmskpd", int_ftype_v2df, IX86_BUILTIN_MOVMSKPD); - def_builtin (MASK_SSE2, "__builtin_ia32_pmovmskb128", int_ftype_v8qi, IX86_BUILTIN_PMOVMSKB128); + def_builtin (MASK_SSE2, "__builtin_ia32_pmovmskb128", int_ftype_v16qi, IX86_BUILTIN_PMOVMSKB128); def_builtin (MASK_SSE2, "__builtin_ia32_movnti", void_ftype_pint_int, IX86_BUILTIN_MOVNTI); def_builtin (MASK_SSE2, "__builtin_ia32_movntpd", void_ftype_pdouble_v2df, IX86_BUILTIN_MOVNTPD); - def_builtin (MASK_SSE2, "__builtin_ia32_movntdq", void_ftype_pdi_di, IX86_BUILTIN_MOVNTDQ); + def_builtin (MASK_SSE2, "__builtin_ia32_movntdq", void_ftype_pv2di_v2di, IX86_BUILTIN_MOVNTDQ); def_builtin (MASK_SSE2, "__builtin_ia32_pshufd", v4si_ftype_v4si_int, IX86_BUILTIN_PSHUFD); def_builtin (MASK_SSE2, "__builtin_ia32_pshuflw", v8hi_ftype_v8hi_int, IX86_BUILTIN_PSHUFLW); def_builtin (MASK_SSE2, "__builtin_ia32_pshufhw", v8hi_ftype_v8hi_int, IX86_BUILTIN_PSHUFHW); - def_builtin (MASK_SSE2, 
"__builtin_ia32_psadbw128", v4hi_ftype_v8qi_v8qi, IX86_BUILTIN_PSADBW128); + def_builtin (MASK_SSE2, "__builtin_ia32_psadbw128", v2di_ftype_v16qi_v16qi, IX86_BUILTIN_PSADBW128); def_builtin (MASK_SSE2, "__builtin_ia32_sqrtpd", v2df_ftype_v2df, IX86_BUILTIN_SQRTPD); def_builtin (MASK_SSE2, "__builtin_ia32_sqrtsd", v2df_ftype_v2df, IX86_BUILTIN_SQRTSD); @@ -11854,7 +11917,7 @@ ix86_init_mmx_sse_builtins () def_builtin (MASK_SSE2, "__builtin_ia32_shufpd", v2df_ftype_v2df_v2df_int, IX86_BUILTIN_SHUFPD); def_builtin (MASK_SSE2, "__builtin_ia32_cvtdq2pd", v2df_ftype_v4si, IX86_BUILTIN_CVTDQ2PD); - def_builtin (MASK_SSE2, "__builtin_ia32_cvtdq2ps", v4sf_ftype_v4si, IX86_BUILTIN_CVTDQ2PD); + def_builtin (MASK_SSE2, "__builtin_ia32_cvtdq2ps", v4sf_ftype_v4si, IX86_BUILTIN_CVTDQ2PS); def_builtin (MASK_SSE2, "__builtin_ia32_cvtpd2dq", v4si_ftype_v2df, IX86_BUILTIN_CVTPD2DQ); def_builtin (MASK_SSE2, "__builtin_ia32_cvtpd2pi", v2si_ftype_v2df, IX86_BUILTIN_CVTPD2PI); @@ -11886,6 +11949,30 @@ ix86_init_mmx_sse_builtins () def_builtin (MASK_SSE2, "__builtin_ia32_clflush", void_ftype_pvoid, IX86_BUILTIN_CLFLUSH); def_builtin (MASK_SSE2, "__builtin_ia32_lfence", void_ftype_void, IX86_BUILTIN_LFENCE); def_builtin (MASK_SSE2, "__builtin_ia32_mfence", void_ftype_void, IX86_BUILTIN_MFENCE); + + def_builtin (MASK_SSE2, "__builtin_ia32_psllw128", v8hi_ftype_v8hi_v2di, IX86_BUILTIN_PSLLW128); + def_builtin (MASK_SSE2, "__builtin_ia32_pslld128", v4si_ftype_v4si_v2di, IX86_BUILTIN_PSLLD128); + def_builtin (MASK_SSE2, "__builtin_ia32_psllq128", v2di_ftype_v2di_v2di, IX86_BUILTIN_PSLLQ128); + + def_builtin (MASK_SSE2, "__builtin_ia32_psrlw128", v8hi_ftype_v8hi_v2di, IX86_BUILTIN_PSRLW128); + def_builtin (MASK_SSE2, "__builtin_ia32_psrld128", v4si_ftype_v4si_v2di, IX86_BUILTIN_PSRLD128); + def_builtin (MASK_SSE2, "__builtin_ia32_psrlq128", v2di_ftype_v2di_v2di, IX86_BUILTIN_PSRLQ128); + + def_builtin (MASK_SSE2, "__builtin_ia32_psraw128", v8hi_ftype_v8hi_v2di, IX86_BUILTIN_PSRAW128); + 
def_builtin (MASK_SSE2, "__builtin_ia32_psrad128", v4si_ftype_v4si_v2di, IX86_BUILTIN_PSRAD128); + + def_builtin (MASK_SSE2, "__builtin_ia32_psllwi128", v8hi_ftype_v8hi_int, IX86_BUILTIN_PSLLWI128); + def_builtin (MASK_SSE2, "__builtin_ia32_pslldi128", v4si_ftype_v4si_int, IX86_BUILTIN_PSLLDI128); + def_builtin (MASK_SSE2, "__builtin_ia32_psllqi128", v2di_ftype_v2di_int, IX86_BUILTIN_PSLLQI128); + + def_builtin (MASK_SSE2, "__builtin_ia32_psrlwi128", v8hi_ftype_v8hi_int, IX86_BUILTIN_PSRLWI128); + def_builtin (MASK_SSE2, "__builtin_ia32_psrldi128", v4si_ftype_v4si_int, IX86_BUILTIN_PSRLDI128); + def_builtin (MASK_SSE2, "__builtin_ia32_psrlqi128", v2di_ftype_v2di_int, IX86_BUILTIN_PSRLQI128); + + def_builtin (MASK_SSE2, "__builtin_ia32_psrawi128", v8hi_ftype_v8hi_int, IX86_BUILTIN_PSRAWI128); + def_builtin (MASK_SSE2, "__builtin_ia32_psradi128", v4si_ftype_v4si_int, IX86_BUILTIN_PSRADI128); + + def_builtin (MASK_SSE2, "__builtin_ia32_pmaddwd128", v4si_ftype_v8hi_v8hi, IX86_BUILTIN_PMADDWD128); } /* Errors in the source file can cause expand_expr to return const0_rtx @@ -12681,7 +12768,7 @@ ix86_expand_builtin (exp, target, subtarget, mode, ignore) case IX86_BUILTIN_MOVNTPD: return ix86_expand_store_builtin (CODE_FOR_sse2_movntv2df, arglist); case IX86_BUILTIN_MOVNTDQ: - return ix86_expand_store_builtin (CODE_FOR_sse2_movntti, arglist); + return ix86_expand_store_builtin (CODE_FOR_sse2_movntv2di, arglist); case IX86_BUILTIN_MOVNTI: return ix86_expand_store_builtin (CODE_FOR_sse2_movntsi, arglist); diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 912efffff6b..5fff4b62443 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -104,6 +104,7 @@ ;; 58 This is a `sfence' operation. ;; 59 This is a `mfence' operation. ;; 60 This is a `lfence' operation. +;; 61 This is a `psadbw' operation. ;; Insns whose names begin with "x86_" are emitted by gen_FOO calls ;; from i386.c. 
@@ -18593,6 +18594,15 @@ [(set_attr "type" "sselog") (set_attr "mode" "TI")]) +(define_insn "sse2_andv2di3" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (and:V2DI (match_operand:V2DI 1 "nonimmediate_operand" "%0") + (match_operand:V2DI 2 "nonimmediate_operand" "xm")))] + "TARGET_SSE2 + && (GET_CODE (operands[1]) != MEM || GET_CODE (operands[2]) != MEM)" + "pand\t{%2, %0|%0, %2}" + [(set_attr "type" "sse")]) + (define_insn "*sse_nandti3_df" [(set (subreg:TI (match_operand:DF 0 "register_operand" "=Y") 0) (and:TI (not:TI (subreg:TI (match_operand:DF 1 "register_operand" "0") 0)) @@ -18628,6 +18638,15 @@ "pandn\t{%2, %0|%0, %2}" [(set_attr "type" "sselog")]) +(define_insn "sse2_nandv2di3" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (and:V2DI (not:V2DI (match_operand:V2DI 1 "nonimmediate_operand" "%0")) + (match_operand:V2DI 2 "nonimmediate_operand" "xm")))] + "TARGET_SSE2 + && (GET_CODE (operands[1]) != MEM || GET_CODE (operands[2]) != MEM)" + "pandn\t{%2, %0|%0, %2}" + [(set_attr "type" "sse")]) + (define_insn "*sse_iorti3_df_1" [(set (subreg:TI (match_operand:DF 0 "register_operand" "=Y") 0) (ior:TI (subreg:TI (match_operand:DF 1 "register_operand" "%0") 0) @@ -18684,6 +18703,15 @@ [(set_attr "type" "sselog") (set_attr "mode" "TI")]) +(define_insn "sse2_iorv2di3" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (ior:V2DI (match_operand:V2DI 1 "nonimmediate_operand" "%0") + (match_operand:V2DI 2 "nonimmediate_operand" "xm")))] + "TARGET_SSE2 + && (GET_CODE (operands[1]) != MEM || GET_CODE (operands[2]) != MEM)" + "por\t{%2, %0|%0, %2}" + [(set_attr "type" "sse")]) + (define_insn "*sse_xorti3_df_1" [(set (subreg:TI (match_operand:DF 0 "register_operand" "=Y") 0) (xor:TI (subreg:TI (match_operand:DF 1 "register_operand" "%0") 0) @@ -18740,6 +18768,15 @@ [(set_attr "type" "sselog") (set_attr "mode" "TI")]) +(define_insn "sse2_xorv2di3" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (xor:V2DI (match_operand:V2DI 1 
"nonimmediate_operand" "%0") + (match_operand:V2DI 2 "nonimmediate_operand" "xm")))] + "TARGET_SSE2 + && (GET_CODE (operands[1]) != MEM || GET_CODE (operands[2]) != MEM)" + "pxor\t{%2, %0|%0, %2}" + [(set_attr "type" "sse")]) + ;; Use xor, but don't show input operands so they aren't live before ;; this insn. (define_insn "sse_clrv4sf" @@ -19279,9 +19316,9 @@ (set_attr "mode" "DI")]) (define_insn "mmx_psadbw" - [(set (match_operand:V8QI 0 "register_operand" "=y") - (abs:V8QI (minus:V8QI (match_operand:V8QI 1 "register_operand" "0") - (match_operand:V8QI 2 "nonimmediate_operand" "ym"))))] + [(set (match_operand:DI 0 "register_operand" "=y") + (unspec:DI [(match_operand:V8QI 1 "register_operand" "0") + (match_operand:V8QI 2 "nonimmediate_operand" "ym")] 61))] "TARGET_SSE || TARGET_3DNOW_A" "psadbw\t{%2, %0|%0, %2}" [(set_attr "type" "mmxshft") @@ -20250,8 +20287,8 @@ (define_insn "sse2_anddf3" [(set (match_operand:V2DF 0 "register_operand" "=x") - (subreg:V2DF (and:TI (subreg:TI (match_operand:TI 1 "register_operand" "%0") 0) - (subreg:TI (match_operand:TI 2 "nonimmediate_operand" "xm") 0)) 0))] + (subreg:V2DF (and:TI (subreg:TI (match_operand:V2DF 1 "register_operand" "%0") 0) + (subreg:TI (match_operand:V2DF 2 "nonimmediate_operand" "xm") 0)) 0))] "TARGET_SSE2" "andpd\t{%2, %0|%0, %2}" [(set_attr "type" "sselog") @@ -20259,8 +20296,8 @@ (define_insn "sse2_nanddf3" [(set (match_operand:V2DF 0 "register_operand" "=x") - (subreg:V2DF (and:TI (not:TI (subreg:TI (match_operand:TI 1 "register_operand" "0") 0)) - (subreg:TI (match_operand:TI 2 "nonimmediate_operand" "xm") 0)) 0))] + (subreg:V2DF (and:TI (not:TI (subreg:TI (match_operand:V2DF 1 "register_operand" "0") 0)) + (subreg:TI (match_operand:V2DF 2 "nonimmediate_operand" "xm") 0)) 0))] "TARGET_SSE2" "andnpd\t{%2, %0|%0, %2}" [(set_attr "type" "sselog") @@ -20268,8 +20305,8 @@ (define_insn "sse2_iordf3" [(set (match_operand:V2DF 0 "register_operand" "=x") - (subreg:V2DF (ior:TI (subreg:TI (match_operand:TI 1 
"register_operand" "%0") 0) - (subreg:TI (match_operand:TI 2 "nonimmediate_operand" "xm") 0)) 0))] + (subreg:V2DF (ior:TI (subreg:TI (match_operand:V2DF 1 "register_operand" "%0") 0) + (subreg:TI (match_operand:V2DF 2 "nonimmediate_operand" "xm") 0)) 0))] "TARGET_SSE2" "orpd\t{%2, %0|%0, %2}" [(set_attr "type" "sselog") @@ -20277,8 +20314,8 @@ (define_insn "sse2_xordf3" [(set (match_operand:V2DF 0 "register_operand" "=x") - (subreg:V2DF (xor:TI (subreg:TI (match_operand:TI 1 "register_operand" "%0") 0) - (subreg:TI (match_operand:TI 2 "nonimmediate_operand" "xm") 0)) 0))] + (subreg:V2DF (xor:TI (subreg:TI (match_operand:V2DF 1 "register_operand" "%0") 0) + (subreg:TI (match_operand:V2DF 2 "nonimmediate_operand" "xm") 0)) 0))] "TARGET_SSE2" "xorpd\t{%2, %0|%0, %2}" [(set_attr "type" "sselog") @@ -20418,9 +20455,9 @@ [(set_attr "type" "ssecvt") (set_attr "mode" "V2DF")]) -(define_insn "sse2_movntti" - [(set (match_operand:TI 0 "memory_operand" "=m") - (unspec:TI [(match_operand:TI 1 "register_operand" "x")] 34))] +(define_insn "sse2_movntv2di" + [(set (match_operand:V2DI 0 "memory_operand" "=m") + (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "x")] 34))] "TARGET_SSE2" "movntdq\t{%1, %0|%0, %1}" [(set_attr "type" "ssecvt") @@ -20467,7 +20504,7 @@ (define_insn "cvtdq2pd" [(set (match_operand:V2DF 0 "register_operand" "=x") (float:V2DF (vec_select:V2SI - (match_operand:V2SI 1 "nonimmediate_operand" "xm") + (match_operand:V4SI 1 "nonimmediate_operand" "xm") (parallel [(const_int 0) (const_int 1)]))))] @@ -20784,11 +20821,14 @@ [(set_attr "type" "sseimul") (set_attr "mode" "TI")]) -;; See the MMX logical operations for the reason for the unspec (define_insn "sse2_umulsidi3" [(set (match_operand:DI 0 "register_operand" "=y") - (unspec:DI [(mult:DI (zero_extend:DI (match_operand:DI 1 "register_operand" "0")) - (zero_extend:DI (match_operand:DI 2 "nonimmediate_operand" "ym")))] 45))] + (mult:DI (zero_extend:DI (vec_select:SI + (match_operand:V2SI 1 
"register_operand" "0") + (parallel [(const_int 0)]))) + (zero_extend:DI (vec_select:SI + (match_operand:V2SI 2 "nonimmediate_operand" "ym") + (parallel [(const_int 0)])))))] "TARGET_SSE2" "pmuludq\t{%2, %0|%0, %2}" [(set_attr "type" "sseimul") @@ -20889,9 +20929,9 @@ ;; @@@ this isn't the right representation. (define_insn "sse2_psadbw" - [(set (match_operand:V16QI 0 "register_operand" "=x") - (abs:V16QI (minus:V16QI (match_operand:V16QI 1 "register_operand" "0") - (match_operand:V16QI 2 "nonimmediate_operand" "ym"))))] + [(set (match_operand:V2DI 0 "register_operand" "=x") + (unspec:V2DI [(match_operand:V16QI 1 "register_operand" "0") + (match_operand:V16QI 2 "nonimmediate_operand" "ym")] 61))] "TARGET_SSE2" "psadbw\t{%2, %0|%0, %2}" [(set_attr "type" "sseiadd") @@ -21050,7 +21090,7 @@ (define_insn "ashrv8hi3" [(set (match_operand:V8HI 0 "register_operand" "=x") (ashiftrt:V8HI (match_operand:V8HI 1 "register_operand" "0") - (match_operand:TI 2 "nonmemory_operand" "xi")))] + (match_operand:SI 2 "nonmemory_operand" "ri")))] "TARGET_SSE2" "psraw\t{%2, %0|%0, %2}" [(set_attr "type" "sseishft") @@ -21059,7 +21099,7 @@ (define_insn "ashrv4si3" [(set (match_operand:V4SI 0 "register_operand" "=x") (ashiftrt:V4SI (match_operand:V4SI 1 "register_operand" "0") - (match_operand:TI 2 "nonmemory_operand" "xi")))] + (match_operand:SI 2 "nonmemory_operand" "ri")))] "TARGET_SSE2" "psrad\t{%2, %0|%0, %2}" [(set_attr "type" "sseishft") @@ -21068,7 +21108,7 @@ (define_insn "lshrv8hi3" [(set (match_operand:V8HI 0 "register_operand" "=x") (lshiftrt:V8HI (match_operand:V8HI 1 "register_operand" "0") - (match_operand:TI 2 "nonmemory_operand" "xi")))] + (match_operand:SI 2 "nonmemory_operand" "ri")))] "TARGET_SSE2" "psrlw\t{%2, %0|%0, %2}" [(set_attr "type" "sseishft") @@ -21077,16 +21117,16 @@ (define_insn "lshrv4si3" [(set (match_operand:V4SI 0 "register_operand" "=x") (lshiftrt:V4SI (match_operand:V4SI 1 "register_operand" "0") - (match_operand:TI 2 "nonmemory_operand" "xi")))] + 
(match_operand:SI 2 "nonmemory_operand" "ri")))] "TARGET_SSE2" "psrld\t{%2, %0|%0, %2}" [(set_attr "type" "sseishft") (set_attr "mode" "TI")]) -(define_insn "sse2_lshrv2di3" +(define_insn "lshrv2di3" [(set (match_operand:V2DI 0 "register_operand" "=x") (lshiftrt:V2DI (match_operand:V2DI 1 "register_operand" "0") - (match_operand:TI 2 "nonmemory_operand" "xi")))] + (match_operand:SI 2 "nonmemory_operand" "ri")))] "TARGET_SSE2" "psrlq\t{%2, %0|%0, %2}" [(set_attr "type" "sseishft") @@ -21095,7 +21135,7 @@ (define_insn "ashlv8hi3" [(set (match_operand:V8HI 0 "register_operand" "=x") (ashift:V8HI (match_operand:V8HI 1 "register_operand" "0") - (match_operand:TI 2 "nonmemory_operand" "xi")))] + (match_operand:SI 2 "nonmemory_operand" "ri")))] "TARGET_SSE2" "psllw\t{%2, %0|%0, %2}" [(set_attr "type" "sseishft") @@ -21104,16 +21144,80 @@ (define_insn "ashlv4si3" [(set (match_operand:V4SI 0 "register_operand" "=x") (ashift:V4SI (match_operand:V4SI 1 "register_operand" "0") - (match_operand:TI 2 "nonmemory_operand" "xi")))] + (match_operand:SI 2 "nonmemory_operand" "ri")))] + "TARGET_SSE2" + "pslld\t{%2, %0|%0, %2}" + [(set_attr "type" "sse")]) + +(define_insn "ashlv2di3" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (ashift:V2DI (match_operand:V2DI 1 "register_operand" "0") + (match_operand:SI 2 "nonmemory_operand" "ri")))] + "TARGET_SSE2" + "psllq\t{%2, %0|%0, %2}" + [(set_attr "type" "sse")]) + +(define_insn "ashrv8hi3_ti" + [(set (match_operand:V8HI 0 "register_operand" "=x") + (ashiftrt:V8HI (match_operand:V8HI 1 "register_operand" "0") + (subreg:TI (match_operand:V2DI 2 "nonmemory_operand" "xi") 0)))] + "TARGET_SSE2" + "psraw\t{%2, %0|%0, %2}" + [(set_attr "type" "sse")]) + +(define_insn "ashrv4si3_ti" + [(set (match_operand:V4SI 0 "register_operand" "=x") + (ashiftrt:V4SI (match_operand:V4SI 1 "register_operand" "0") + (subreg:TI (match_operand:V2DI 2 "nonmemory_operand" "xi") 0)))] + "TARGET_SSE2" + "psrad\t{%2, %0|%0, %2}" + [(set_attr "type" "sse")]) + 
+(define_insn "lshrv8hi3_ti" + [(set (match_operand:V8HI 0 "register_operand" "=x") + (lshiftrt:V8HI (match_operand:V8HI 1 "register_operand" "0") + (subreg:TI (match_operand:V2DI 2 "nonmemory_operand" "xi") 0)))] + "TARGET_SSE2" + "psrlw\t{%2, %0|%0, %2}" + [(set_attr "type" "sse")]) + +(define_insn "lshrv4si3_ti" + [(set (match_operand:V4SI 0 "register_operand" "=x") + (lshiftrt:V4SI (match_operand:V4SI 1 "register_operand" "0") + (subreg:TI (match_operand:V2DI 2 "nonmemory_operand" "xi") 0)))] + "TARGET_SSE2" + "psrld\t{%2, %0|%0, %2}" + [(set_attr "type" "sse")]) + +(define_insn "lshrv2di3_ti" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (lshiftrt:V2DI (match_operand:V2DI 1 "register_operand" "0") + (subreg:TI (match_operand:V2DI 2 "nonmemory_operand" "xi") 0)))] + "TARGET_SSE2" + "psrlq\t{%2, %0|%0, %2}" + [(set_attr "type" "sse")]) + +(define_insn "ashlv8hi3_ti" + [(set (match_operand:V8HI 0 "register_operand" "=x") + (ashift:V8HI (match_operand:V8HI 1 "register_operand" "0") + (subreg:TI (match_operand:V2DI 2 "nonmemory_operand" "xi") 0)))] + "TARGET_SSE2" + "psllw\t{%2, %0|%0, %2}" + [(set_attr "type" "sse")]) + +(define_insn "ashlv4si3_ti" + [(set (match_operand:V4SI 0 "register_operand" "=x") + (ashift:V4SI (match_operand:V4SI 1 "register_operand" "0") + (subreg:TI (match_operand:V2DI 2 "nonmemory_operand" "xi") 0)))] "TARGET_SSE2" "pslld\t{%2, %0|%0, %2}" [(set_attr "type" "sseishft") (set_attr "mode" "TI")]) -(define_insn "sse2_ashlv2di3" +(define_insn "ashlv2di3_ti" [(set (match_operand:V2DI 0 "register_operand" "=x") (ashift:V2DI (match_operand:V2DI 1 "register_operand" "0") - (match_operand:TI 2 "nonmemory_operand" "xi")))] + (subreg:TI (match_operand:V2DI 2 "nonmemory_operand" "xi") 0)))] "TARGET_SSE2" "psllq\t{%2, %0|%0, %2}" [(set_attr "type" "sseishft") diff --git a/gcc/config/i386/xmmintrin.h b/gcc/config/i386/xmmintrin.h index 9f9f2f99393..294df600cb9 100644 --- a/gcc/config/i386/xmmintrin.h +++ b/gcc/config/i386/xmmintrin.h @@ 
-1058,4 +1058,927 @@ do { \ (row3) = __builtin_ia32_shufps (__t2, __t3, 0xDD); \ } while (0) +/* SSE2 */ +typedef int __v2df __attribute__ ((mode (V2DF))); +typedef int __v2di __attribute__ ((mode (V2DI))); +typedef int __v4si __attribute__ ((mode (V4SI))); +typedef int __v8hi __attribute__ ((mode (V8HI))); +typedef int __v16qi __attribute__ ((mode (V16QI))); + +#define __m128i __m128 +#define __m128d __v2df + +static __inline __m128d +_mm_add_pd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_addpd ((__v2df)__A, (__v2df)__B); +} + +static __inline __m128d +_mm_add_sd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_addsd ((__v2df)__A, (__v2df)__B); +} + +static __inline __m128d +_mm_sub_pd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_subpd ((__v2df)__A, (__v2df)__B); +} + +static __inline __m128d +_mm_sub_sd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_subsd ((__v2df)__A, (__v2df)__B); +} + +static __inline __m128d +_mm_mul_pd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_mulpd ((__v2df)__A, (__v2df)__B); +} + +static __inline __m128d +_mm_mul_sd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_mulsd ((__v2df)__A, (__v2df)__B); +} + +static __inline __m128d +_mm_div_pd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_divpd ((__v2df)__A, (__v2df)__B); +} + +static __inline __m128d +_mm_div_sd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_divsd ((__v2df)__A, (__v2df)__B); +} + +static __inline __m128d +_mm_sqrt_pd (__m128d __A) +{ + return (__m128d)__builtin_ia32_sqrtpd ((__v2df)__A); +} + +static __inline __m128d +_mm_sqrt_sd (__m128d __A) +{ + return (__m128d)__builtin_ia32_sqrtsd ((__v2df)__A); +} + +static __inline __m128d +_mm_min_pd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_minpd ((__v2df)__A, (__v2df)__B); +} + +static __inline __m128d +_mm_min_sd (__m128d __A, __m128d __B) +{ + return 
(__m128d)__builtin_ia32_minsd ((__v2df)__A, (__v2df)__B); +} + +static __inline __m128d +_mm_max_pd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_maxpd ((__v2df)__A, (__v2df)__B); +} + +static __inline __m128d +_mm_max_sd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_minsd ((__v2df)__A, (__v2df)__B); +} + +static __inline __m128d +_mm_and_pd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_andpd ((__v2df)__A, (__v2df)__B); +} + +static __inline __m128d +_mm_andnot_pd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_andnpd ((__v2df)__A, (__v2df)__B); +} + +static __inline __m128d +_mm_or_pd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_orpd ((__v2df)__A, (__v2df)__B); +} + +static __inline __m128d +_mm_xor_pd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_xorpd ((__v2df)__A, (__v2df)__B); +} + +static __inline __m128d +_mm_cmpeq_pd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_cmpeqpd ((__v2df)__A, (__v2df)__B); +} + +static __inline __m128d +_mm_cmplt_pd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_cmpltpd ((__v2df)__A, (__v2df)__B); +} + +static __inline __m128d +_mm_cmple_pd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_cmplepd ((__v2df)__A, (__v2df)__B); +} + +static __inline __m128d +_mm_cmpgt_pd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_cmpgtpd ((__v2df)__A, (__v2df)__B); +} + +static __inline __m128d +_mm_cmpge_pd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_cmpgepd ((__v2df)__A, (__v2df)__B); +} + +static __inline __m128d +_mm_cmpneq_pd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_cmpneqpd ((__v2df)__A, (__v2df)__B); +} + +static __inline __m128d +_mm_cmpnlt_pd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_cmpnltpd ((__v2df)__A, (__v2df)__B); +} + +static __inline __m128d +_mm_cmpnle_pd (__m128d __A, __m128d __B) +{ + return 
(__m128d)__builtin_ia32_cmpnlepd ((__v2df)__A, (__v2df)__B); +} + +static __inline __m128d +_mm_cmpngt_pd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_cmpngtpd ((__v2df)__A, (__v2df)__B); +} + +static __inline __m128d +_mm_cmpnge_pd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_cmpngepd ((__v2df)__A, (__v2df)__B); +} + +static __inline __m128d +_mm_cmpord_pd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_cmpordpd ((__v2df)__A, (__v2df)__B); +} + +static __inline __m128d +_mm_cmpunord_pd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_cmpunordpd ((__v2df)__A, (__v2df)__B); +} + +static __inline __m128d +_mm_cmpeq_sd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_cmpeqsd ((__v2df)__A, (__v2df)__B); +} + +static __inline __m128d +_mm_cmplt_sd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_cmpltsd ((__v2df)__A, (__v2df)__B); +} + +static __inline __m128d +_mm_cmple_sd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_cmplesd ((__v2df)__A, (__v2df)__B); +} + +static __inline __m128d +_mm_cmpgt_sd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_cmpgtsd ((__v2df)__A, (__v2df)__B); +} + +static __inline __m128d +_mm_cmpge_sd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_cmpgesd ((__v2df)__A, (__v2df)__B); +} + +static __inline __m128d +_mm_cmpneq_sd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_cmpneqsd ((__v2df)__A, (__v2df)__B); +} + +static __inline __m128d +_mm_cmpnlt_sd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_cmpnltsd ((__v2df)__A, (__v2df)__B); +} + +static __inline __m128d +_mm_cmpnle_sd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_cmpnlesd ((__v2df)__A, (__v2df)__B); +} + +static __inline __m128d +_mm_cmpngt_sd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_cmpngtsd ((__v2df)__A, (__v2df)__B); +} + +static __inline __m128d +_mm_cmpnge_sd (__m128d __A, 
__m128d __B) +{ + return (__m128d)__builtin_ia32_cmpngesd ((__v2df)__A, (__v2df)__B); +} + +static __inline __m128d +_mm_cmpord_sd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_cmpordsd ((__v2df)__A, (__v2df)__B); +} + +static __inline __m128d +_mm_cmpunord_sd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_cmpunordsd ((__v2df)__A, (__v2df)__B); +} + +static __inline int +_mm_comieq_sd (__m128d __A, __m128d __B) +{ + return __builtin_ia32_comisdeq ((__v2df)__A, (__v2df)__B); +} + +static __inline int +_mm_comilt_sd (__m128d __A, __m128d __B) +{ + return __builtin_ia32_comisdlt ((__v2df)__A, (__v2df)__B); +} + +static __inline int +_mm_comile_sd (__m128d __A, __m128d __B) +{ + return __builtin_ia32_comisdle ((__v2df)__A, (__v2df)__B); +} + +static __inline int +_mm_comigt_sd (__m128d __A, __m128d __B) +{ + return __builtin_ia32_comisdgt ((__v2df)__A, (__v2df)__B); +} + +static __inline int +_mm_comige_sd (__m128d __A, __m128d __B) +{ + return __builtin_ia32_comisdge ((__v2df)__A, (__v2df)__B); +} + +static __inline int +_mm_comineq_sd (__m128d __A, __m128d __B) +{ + return __builtin_ia32_comisdneq ((__v2df)__A, (__v2df)__B); +} + +static __inline int +_mm_ucomieq_sd (__m128d __A, __m128d __B) +{ + return __builtin_ia32_ucomisdeq ((__v2df)__A, (__v2df)__B); +} + +static __inline int +_mm_ucomilt_sd (__m128d __A, __m128d __B) +{ + return __builtin_ia32_ucomisdlt ((__v2df)__A, (__v2df)__B); +} + +static __inline int +_mm_ucomile_sd (__m128d __A, __m128d __B) +{ + return __builtin_ia32_ucomisdle ((__v2df)__A, (__v2df)__B); +} + +static __inline int +_mm_ucomigt_sd (__m128d __A, __m128d __B) +{ + return __builtin_ia32_ucomisdgt ((__v2df)__A, (__v2df)__B); +} + +static __inline int +_mm_ucomige_sd (__m128d __A, __m128d __B) +{ + return __builtin_ia32_ucomisdge ((__v2df)__A, (__v2df)__B); +} + +static __inline int +_mm_ucomineq_sd (__m128d __A, __m128d __B) +{ + return __builtin_ia32_ucomisdneq ((__v2df)__A, (__v2df)__B); +} + +static 
__inline __m128d +_mm_cvtepi32_pd (__m128i __A) +{ + return (__m128d)__builtin_ia32_cvtdq2pd ((__v4si) __A); +} + +static __inline __m128d +_mm_cvtepi32_ps (__m128i __A) +{ + return (__m128d)__builtin_ia32_cvtdq2ps ((__v4si) __A); +} + +static __inline __m128d +_mm_cvtpd_epi32 (__m128d __A) +{ + return (__m128d)__builtin_ia32_cvtpd2dq ((__v2df) __A); +} + +static __inline __m64 +_mm_cvtpd_pi32 (__m128d __A) +{ + return (__m64)__builtin_ia32_cvtpd2pi ((__v2df) __A); +} + +static __inline __m128d +_mm_cvtpd_ps (__m128d __A) +{ + return (__m128d)__builtin_ia32_cvtpd2ps ((__v2df) __A); +} + +static __inline __m128d +_mm_cvttpd_epi32 (__m128d __A) +{ + return (__m128d)__builtin_ia32_cvttpd2dq ((__v2df) __A); +} + +static __inline __m64 +_mm_cvttpd_pi32 (__m128d __A) +{ + return (__m64)__builtin_ia32_cvttpd2pi ((__v2df) __A); +} + +static __inline __m128d +_mm_cvtpi32_pd (__m64 __A) +{ + return (__m128d)__builtin_ia32_cvtpi2pd ((__v2si) __A); +} + +static __inline __m128d +_mm_cvtps_epi32 (__m128d __A) +{ + return (__m128d)__builtin_ia32_cvtps2dq ((__v4sf) __A); +} + +static __inline __m128d +_mm_cvttps_epi32 (__m128d __A) +{ + return (__m128d)__builtin_ia32_cvttps2dq ((__v4sf) __A); +} + +static __inline __m128d +_mm_cvtps_pd (__m128d __A) +{ + return (__m128d)__builtin_ia32_cvtps2pd ((__v4sf) __A); +} + +static __inline int +_mm_cvtsd_si32 (__m128d __A) +{ + return __builtin_ia32_cvtsd2si ((__v2df) __A); +} + +static __inline int +_mm_cvttsd_si32 (__m128d __A) +{ + return __builtin_ia32_cvttsd2si ((__v2df) __A); +} + +static __inline __m128d +_mm_cvtsd_ss (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_cvtsd2ss ((__v4sf) __A, (__v2df) __B); +} + +static __inline __m128d +_mm_cvtsi32_sd (__m128d __A, int __B) +{ + return (__m128d)__builtin_ia32_cvtsi2sd ((__v2df) __A, __B); +} + +static __inline __m128d +_mm_cvtss_sd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_cvtss2sd ((__v2df) __A, (__v4sf)__B); +} + +#define _mm_shuffle_pd(__A, __B, 
__C) ((__m128d)__builtin_ia32_shufpd ((__v2df)__A, (__v2df)__B, (C))) + +static __inline __m128d +_mm_unpackhi_pd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_unpckhpd ((__v2df)__A, (__v2df)__B); +} + +static __inline __m128d +_mm_unpacklo_pd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_unpcklpd ((__v2df)__A, (__v2df)__B); +} + +static __inline __m128d +_mm_loadh_pd (__m128d __A, __m128d *__B) +{ + return (__m128d)__builtin_ia32_loadhpd ((__v2df)__A, (__v2si *)__B); +} + +static __inline void +_mm_storeh_pd (__m128d *__A, __m128d __B) +{ + __builtin_ia32_storehpd ((__v2si *)__A, (__v2df)__B); +} + +static __inline __m128d +_mm_loadl_pd (__m128d __A, __m128d *__B) +{ + return (__m128d)__builtin_ia32_loadlpd ((__v2df)__A, (__v2si *)__B); +} + +static __inline void +_mm_storel_pd (__m128d *__A, __m128d __B) +{ + __builtin_ia32_storelpd ((__v2si *)__A, (__v2df)__B); +} + +static __inline int +_mm_movemask_pd (__m128d __A) +{ + return __builtin_ia32_movmskpd ((__v2df)__A); +} + +static __inline __m128i +_mm_packs_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_packsswb128 ((__v8hi)__A, (__v8hi)__B); +} + +static __inline __m128i +_mm_packs_epi32 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_packssdw128 ((__v4si)__A, (__v4si)__B); +} + +static __inline __m128i +_mm_packus_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_packuswb128 ((__v8hi)__A, (__v8hi)__B); +} + +static __inline __m128i +_mm_unpackhi_epi8 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_punpckhbw128 ((__v16qi)__A, (__v16qi)__B); +} + +static __inline __m128i +_mm_unpackhi_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_punpckhwd128 ((__v8hi)__A, (__v8hi)__B); +} + +static __inline __m128i +_mm_unpackhi_epi32 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_punpckhdq128 ((__v4si)__A, (__v4si)__B); +} + +static __inline __m128i +_mm_unpacklo_epi8 (__m128i __A, 
__m128i __B) +{ + return (__m128i)__builtin_ia32_punpcklbw128 ((__v16qi)__A, (__v16qi)__B); +} + +static __inline __m128i +_mm_unpacklo_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_punpcklwd128 ((__v8hi)__A, (__v8hi)__B); +} + +static __inline __m128i +_mm_unpacklo_epi32 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_punpckldq128 ((__v4si)__A, (__v4si)__B); +} + +static __inline __m128i +_mm_add_epi8 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_paddb128 ((__v16qi)__A, (__v16qi)__B); +} + +static __inline __m128i +_mm_add_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_paddw128 ((__v8hi)__A, (__v8hi)__B); +} + +static __inline __m128i +_mm_add_epi32 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_paddd128 ((__v4si)__A, (__v4si)__B); +} + +static __inline __m128i +_mm_add_epi64 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_paddq128 ((__v4si)__A, (__v4si)__B); +} + +static __inline __m128i +_mm_adds_epi8 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_paddsb128 ((__v16qi)__A, (__v16qi)__B); +} + +static __inline __m128i +_mm_adds_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_paddsw128 ((__v8hi)__A, (__v8hi)__B); +} + +static __inline __m128i +_mm_adds_epu8 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_paddusb128 ((__v16qi)__A, (__v16qi)__B); +} + +static __inline __m128i +_mm_adds_epu16 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_paddusw128 ((__v8hi)__A, (__v8hi)__B); +} + +static __inline __m128i +_mm_sub_epi8 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_psubb128 ((__v16qi)__A, (__v16qi)__B); +} + +static __inline __m128i +_mm_sub_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_psubw128 ((__v8hi)__A, (__v8hi)__B); +} + +static __inline __m128i +_mm_sub_epi32 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_psubd128 ((__v4si)__A, 
(__v4si)__B); +} + +static __inline __m128i +_mm_sub_epi64 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_psubq128 ((__v4si)__A, (__v4si)__B); +} + +static __inline __m128i +_mm_subs_epi8 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_psubsb128 ((__v16qi)__A, (__v16qi)__B); +} + +static __inline __m128i +_mm_subs_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_psubsw128 ((__v8hi)__A, (__v8hi)__B); +} + +static __inline __m128i +_mm_subs_epu8 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_psubusb128 ((__v16qi)__A, (__v16qi)__B); +} + +static __inline __m128i +_mm_subs_epu16 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_psubusw128 ((__v8hi)__A, (__v8hi)__B); +} + +static __inline __m128i +_mm_madd_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_pmaddwd128 ((__v8hi)__A, (__v8hi)__B); +} + +static __inline __m128i +_mm_mulhi_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_pmulhw128 ((__v8hi)__A, (__v8hi)__B); +} + +static __inline __m128i +_mm_mullo_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_pmullw128 ((__v8hi)__A, (__v8hi)__B); +} + +static __inline __m64 +_mm_mul_pu16 (__m64 __A, __m64 __B) +{ + return (__m64)__builtin_ia32_pmuludq ((__v2si)__A, (__v2si)__B); +} + +static __inline __m128i +_mm_mul_epu16 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_pmuludq128 ((__v4si)__A, (__v4si)__B); +} + +static __inline __m128i +_mm_sll_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_psllw128 ((__v8hi)__A, (__v2di)__B); +} + +static __inline __m128i +_mm_sll_epi32 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_pslld128 ((__v4si)__A, (__v2di)__B); +} + +static __inline __m128i +_mm_sll_epi64 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_psllq128 ((__v2di)__A, (__v2di)__B); +} + +static __inline __m128i +_mm_sra_epi16 (__m128i __A, __m128i __B) +{ + return 
(__m128i)__builtin_ia32_psraw128 ((__v8hi)__A, (__v2di)__B); +} + +static __inline __m128i +_mm_sra_epi32 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_psrad128 ((__v4si)__A, (__v2di)__B); +} + +static __inline __m128i +_mm_srl_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_psrlw128 ((__v8hi)__A, (__v2di)__B); +} + +static __inline __m128i +_mm_srl_epi32 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_psrld128 ((__v4si)__A, (__v2di)__B); +} + +static __inline __m128i +_mm_srl_epi64 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_psrlq128 ((__v2di)__A, (__v2di)__B); +} + +static __inline __m128i +_mm_slli_epi16 (__m128i __A, int __B) +{ + return (__m128i)__builtin_ia32_psllwi128 ((__v8hi)__A, __B); +} + +static __inline __m128i +_mm_slli_epi32 (__m128i __A, int __B) +{ + return (__m128i)__builtin_ia32_pslldi128 ((__v4si)__A, __B); +} + +static __inline __m128i +_mm_slli_epi64 (__m128i __A, int __B) +{ + return (__m128i)__builtin_ia32_psllqi128 ((__v2di)__A, __B); +} + +static __inline __m128i +_mm_srai_epi16 (__m128i __A, int __B) +{ + return (__m128i)__builtin_ia32_psrawi128 ((__v8hi)__A, __B); +} + +static __inline __m128i +_mm_srai_epi32 (__m128i __A, int __B) +{ + return (__m128i)__builtin_ia32_psradi128 ((__v4si)__A, __B); +} + +static __inline __m128i +_mm_srli_epi16 (__m128i __A, int __B) +{ + return (__m128i)__builtin_ia32_psrlwi128 ((__v8hi)__A, __B); +} + +static __inline __m128i +_mm_srli_epi32 (__m128i __A, int __B) +{ + return (__m128i)__builtin_ia32_psrldi128 ((__v4si)__A, __B); +} + +static __inline __m128i +_mm_srli_epi64 (__m128i __A, int __B) +{ + return (__m128i)__builtin_ia32_psrlqi128 ((__v2di)__A, __B); +} + +static __inline __m128i +_mm_and_si128 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_pand128 ((__v2di)__A, (__v2di)__B); +} + +static __inline __m128i +_mm_andnot_si128 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_pandn128 ((__v2di)__A, 
(__v2di)__B); +} + +static __inline __m128i +_mm_or_si128 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_por128 ((__v2di)__A, (__v2di)__B); +} + +static __inline __m128i +_mm_xor_si128 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_pxor128 ((__v2di)__A, (__v2di)__B); +} + +static __inline __m128i +_mm_cmpeq_epi8 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_pcmpeqb128 ((__v16qi)__A, (__v16qi)__B); +} + +static __inline __m128i +_mm_cmpeq_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_pcmpeqw128 ((__v8hi)__A, (__v8hi)__B); +} + +static __inline __m128i +_mm_cmpeq_epi32 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_pcmpeqd128 ((__v4si)__A, (__v4si)__B); +} + +static __inline __m128i +_mm_cmpgt_epi8 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_pcmpgtb128 ((__v16qi)__A, (__v16qi)__B); +} + +static __inline __m128i +_mm_cmpgt_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_pcmpgtw128 ((__v8hi)__A, (__v8hi)__B); +} + +static __inline __m128i +_mm_cmpgt_epi32 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_pcmpgtd128 ((__v4si)__A, (__v4si)__B); +} + +#define _mm_extract_epi16(__A, __B) __builtin_ia32_pextrw128 ((__v8hi)__A, __B) + +#define _mm_insert_epi16 (__A, __B, __C) ((__m128i)__builtin_ia32_pinsrw128 ((__v8hi)__A, __B, __C)) + +static __inline __m128i +_mm_max_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_pmaxsw128 ((__v8hi)__A, (__v8hi)__B); +} + +static __inline __m128i +_mm_max_epu8 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_pmaxub128 ((__v16qi)__A, (__v16qi)__B); +} + +static __inline __m128i +_mm_min_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_pminsw128 ((__v8hi)__A, (__v8hi)__B); +} + +static __inline __m128i +_mm_min_epu8 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_pminub128 ((__v16qi)__A, (__v16qi)__B); +} + +static __inline int 
+_mm_movemask_epi8 (__m128i __A) +{ + return __builtin_ia32_pmovmskb128 ((__v16qi)__A); +} + +static __inline __m128i +_mm_mulhi_epu16 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_pmulhuw128 ((__v8hi)__A, (__v8hi)__B); +} + +#define _mm_shufflehi_epi16(__A, __B) ((__m128i)__builtin_ia32_pshufhw128 ((__v8hi)__A, __B)) +#define _mm_shufflelo_epi16(__A, __B) ((__m128i)__builtin_ia32_pshuflw128 ((__v8hi)__A, __B)) +#define _mm_shuffle_epi32(__A, __B) ((__m128i)__builtin_ia32_pshufd ((__v4si)__A, __B)) + +static __inline void +_mm_maskmoveu_si128 (__m128i __A, __m128i __B, char *__C) +{ + __builtin_ia32_maskmovdqu ((__v16qi)__A, (__v16qi)__B, __C); +} + +static __inline __m128i +_mm_avg_epu8 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_pavgb128 ((__v16qi)__A, (__v16qi)__B); +} + +static __inline __m128i +_mm_avg_epu16 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_pavgw128 ((__v8hi)__A, (__v8hi)__B); +} + +static __inline __m128i +_mm_sad_epu8 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_psadbw128 ((__v16qi)__A, (__v16qi)__B); +} + +static __inline void +_mm_stream_si32 (int *__A, int __B) +{ + __builtin_ia32_movnti (__A, __B); +} + +static __inline void +_mm_stream_si128 (__m128i *__A, __m128i __B) +{ + __builtin_ia32_movntdq ((__v2di *)__A, (__v2di)__B); +} + +static __inline void +_mm_stream_pd (__m128d *__A, __m128d __B) +{ + __builtin_ia32_movntpd (__A, (__v2df)__B); +} + +static __inline __m128i +_mm_movpi64_epi64 (__m64 __A) +{ + return (__m128i)__builtin_ia32_movq2dq ((unsigned long long)__A); +} + +static __inline void +_mm_clflush (void *__A) +{ + return __builtin_ia32_clflush (__A); +} + +static __inline void +_mm_lfence (void) +{ + __builtin_ia32_lfence (); +} + +static __inline void +_mm_mfence (void) +{ + __builtin_ia32_mfence (); +} + +/* End of SSE2. */ + + #endif /* _XMMINTRIN_H_INCLUDED */ |