diff options
author | bstarynk <bstarynk@138bc75d-0d04-0410-961f-82ee72b054a4> | 2009-09-30 14:23:29 +0000 |
---|---|---|
committer | bstarynk <bstarynk@138bc75d-0d04-0410-961f-82ee72b054a4> | 2009-09-30 14:23:29 +0000 |
commit | 92b17925e053f1044328100acee8e6ddca553d05 (patch) | |
tree | 56dda32931a24ed4b4924be0c66a5f961af318ec /gcc/config/i386 | |
parent | 833adb8d7a201ed21bff6fa9d2f9ed95c105dbc3 (diff) | |
download | gcc-92b17925e053f1044328100acee8e6ddca553d05.tar.gz |
2009-09-30 Basile Starynkevitch <basile@starynkevitch.net>
[merged with trunk while LTO is merging inside trunk, however the
trunk's gengtype should now work for MELT, unchanged except by
addition of "melt-runtime.h"]
MELT branch merged with trunk rev 152324
--This line, and those below, will be ignored-
-
_M .
M configure
M Makefile.in
M libgomp/configure
M libgomp/ChangeLog
M libgomp/acinclude.m4
M config.guess
M gcc/tree-vrp.c
M gcc/doc/plugins.texi
M gcc/doc/extend.texi
M gcc/doc/tm.texi
M gcc/doc/invoke.texi
M gcc/doc/gty.texi
M gcc/doc/install.texi
M gcc/tree-into-ssa.c
M gcc/targhooks.c
M gcc/tree-complex.c
M gcc/targhooks.h
M gcc/gengtype.c
M gcc/java/builtins.c
M gcc/java/ChangeLog
M gcc/java/lang.c
M gcc/optabs.c
M gcc/optabs.h
M gcc/DATESTAMP
M gcc/value-prof.c
M gcc/tree.c
M gcc/tree.h
M gcc/tree-pass.h
M gcc/target.h
M gcc/configure
M gcc/builtins.c
M gcc/final.c
M gcc/fold-const.c
M gcc/cfgloopanal.c
M gcc/toplev.c
M gcc/ChangeLog
A + gcc/testsuite/gcc.c-torture/execute/pr41463.c
M gcc/testsuite/gcc.c-torture/execute/980526-2.c
A + gcc/testsuite/gcc.c-torture/execute/pr41395-2.c
A + gcc/testsuite/gcc.c-torture/execute/ifcvt-onecmpl-abs-1.c
A + gcc/testsuite/gcc.c-torture/execute/pr41395-1.c
A + gcc/testsuite/gcc.c-torture/compile/pr39779.c
A + gcc/testsuite/gcc.c-torture/compile/pr41469.c
A + gcc/testsuite/gcc.target/alpha/pr22093.c
M gcc/testsuite/gcc.target/i386/i386.exp
M gcc/testsuite/gcc.target/i386/isa-1.c
A + gcc/testsuite/gcc.target/i386/isa-12.c
A + gcc/testsuite/gcc.target/i386/fma4-maccXX.c
A + gcc/testsuite/gcc.target/i386/fma4-256-maccXX.c
A + gcc/testsuite/gcc.target/i386/fma4-check.h
A + gcc/testsuite/gcc.target/i386/fma4-msubXX.c
A + gcc/testsuite/gcc.target/i386/isa-3.c
A + gcc/testsuite/gcc.target/i386/fma4-256-msubXX.c
M gcc/testsuite/gcc.target/i386/isa-14.c
A + gcc/testsuite/gcc.target/i386/fma4-fma.c
M gcc/testsuite/gcc.target/i386/isa-5.c
A + gcc/testsuite/gcc.target/i386/isa-7.c
A + gcc/testsuite/gcc.target/i386/funcspec-2.c
A + gcc/testsuite/gcc.target/i386/isa-9.c
M gcc/testsuite/gcc.target/i386/funcspec-4.c
M gcc/testsuite/gcc.target/i386/pr12329.c
A + gcc/testsuite/gcc.target/i386/ifcvt-onecmpl-abs-1.c
M gcc/testsuite/gcc.target/i386/funcspec-6.c
A + gcc/testsuite/gcc.target/i386/fma4-vector.c
A + gcc/testsuite/gcc.target/i386/fma4-256-vector.c
M gcc/testsuite/gcc.target/i386/funcspec-8.c
A + gcc/testsuite/gcc.target/i386/sse-12.c
M gcc/testsuite/gcc.target/i386/avx-2.c
A + gcc/testsuite/gcc.target/i386/isa-11.c
A + gcc/testsuite/gcc.target/i386/sse-14.c
M gcc/testsuite/gcc.target/i386/sse-23.c
A + gcc/testsuite/gcc.target/i386/isa-2.c
A + gcc/testsuite/gcc.target/i386/isa-13.c
A + gcc/testsuite/gcc.target/i386/isa-4.c
A + gcc/testsuite/gcc.target/i386/fma4-nmaccXX.c
M gcc/testsuite/gcc.target/i386/isa-6.c
A + gcc/testsuite/gcc.target/i386/fma4-nmsubXX.c
A + gcc/testsuite/gcc.target/i386/isa-8.c
A + gcc/testsuite/gcc.target/i386/fma4-256-nmaccXX.c
M gcc/testsuite/gcc.target/i386/funcspec-5.c
A + gcc/testsuite/gcc.target/i386/fma4-256-nmsubXX.c
M gcc/testsuite/gcc.target/i386/avx-1.c
R + gcc/testsuite/gcc.target/i386/funcspec-9.c
A + gcc/testsuite/gcc.target/i386/isa-10.c
A + gcc/testsuite/gcc.target/i386/sse-13.c
A + gcc/testsuite/gcc.target/i386/sse-22.c
_M gcc/testsuite/gcc.target/x86_64/abi/callabi
A + gcc/testsuite/gcc.target/vsx-vectorize-1.c
M gcc/testsuite/gcc.target/mips/code-readable-1.c
M gcc/testsuite/gcc.target/mips/code-readable-2.c
M gcc/testsuite/gcc.target/mips/code-readable-3.c
A + gcc/testsuite/gcc.target/vsx-vectorize-2.c
A + gcc/testsuite/gcc.target/vsx-vectorize-3.c
A + gcc/testsuite/gcc.target/vsx-vectorize-4.c
A + gcc/testsuite/gcc.target/vsx-vectorize-5.c
A + gcc/testsuite/gcc.target/vsx-vectorize-6.c
A + gcc/testsuite/gcc.target/vsx-vectorize-7.c
A + gcc/testsuite/gcc.target/vsx-vectorize-8.c
M gcc/testsuite/gnat.dg/array7.adb
M gcc/testsuite/gnat.dg/array7.ads
A + gcc/testsuite/gnat.dg/nested_proc.adb
A + gcc/testsuite/gnat.dg/array9.adb
A + gcc/testsuite/gnat.dg/sse_nolib.adb
A + gcc/testsuite/gnat.dg/tagged_alloc_free.adb
A + gcc/testsuite/gcc.dg/pr41470.c
A + gcc/testsuite/gcc.dg/guality/pr41353-1.c
M gcc/testsuite/gcc.dg/guality/guality.h
M gcc/testsuite/gcc.dg/guality/guality.exp
A + gcc/testsuite/gcc.dg/debug/dwarf2/global-used-types.c
M gcc/testsuite/gcc.dg/debug/dwarf2/const-1.c
A + gcc/testsuite/gcc.dg/cond-constqual-1.c
A + gcc/testsuite/gcc.dg/20090922-1.c
A + gcc/testsuite/gcc.dg/pr40209.c
M gcc/testsuite/gcc.dg/builtins-44.c
A + gcc/testsuite/gcc.dg/pr41454.c
M gcc/testsuite/gcc.dg/torture/builtin-math-7.c
_M gcc/testsuite/gcc.dg/torture/pr36227.c
A + gcc/testsuite/gcc.dg/tree-ssa/pr41469-1.c
M gcc/testsuite/gcc.dg/tree-ssa/fre-vce-1.c
M gcc/testsuite/gcc.dg/tree-ssa/forwprop-6.c
A + gcc/testsuite/gcc.dg/pr41248.c
A + gcc/testsuite/gcc.dg/pr41295.c
M gcc/testsuite/gcc.dg/vect/vect.exp
M gcc/testsuite/ChangeLog
M gcc/testsuite/gcc.test-framework/dg-bogus-exp-XF.c
M gcc/testsuite/gcc.test-framework/dg-warning-exp-P.c
M gcc/testsuite/g++.dg/other/i386-2.C
M gcc/testsuite/g++.dg/other/i386-6.C
M gcc/testsuite/g++.dg/other/i386-3.C
M gcc/testsuite/g++.dg/other/i386-5.C
M gcc/testsuite/g++.dg/tree-ssa/pr19637.C
A + gcc/testsuite/g++.dg/tree-ssa/pr41428.C
M gcc/testsuite/g++.dg/dg.exp
A + gcc/testsuite/g++.dg/debug/dwarf2/global-used-types-1.C
M gcc/testsuite/g++.dg/debug/dwarf2/explicit-constructor.C
M gcc/testsuite/g++.dg/debug/dwarf2/imported-module-2.C
M gcc/testsuite/g++.dg/debug/dwarf2/imported-module-3.C
M gcc/testsuite/g++.dg/debug/dwarf2/const1.C
M gcc/testsuite/g++.dg/debug/dwarf2/imported-module-4.C
M gcc/testsuite/g++.dg/debug/dwarf2/template-func-params-4.C
M gcc/testsuite/g++.dg/debug/dwarf2/template-func-params-7.C
M gcc/testsuite/g++.dg/debug/dwarf2/namespace-1.C
M gcc/testsuite/g++.dg/debug/dwarf2/template-params-4.C
M gcc/testsuite/g++.dg/vect/vect.exp
A + gcc/testsuite/g++.dg/cpp0x/lambda
A + gcc/testsuite/g++.dg/cpp0x/lambda/lambda-lookup-neg.C
A + gcc/testsuite/g++.dg/cpp0x/lambda/lambda-copy-neg.C
A + gcc/testsuite/g++.dg/cpp0x/lambda/lambda-type.C
A + gcc/testsuite/g++.dg/cpp0x/lambda/lambda-nested.C
A + gcc/testsuite/g++.dg/cpp0x/lambda/lambda-field-names.C
A + gcc/testsuite/g++.dg/cpp0x/lambda/lambda-errloc.C
A + gcc/testsuite/g++.dg/cpp0x/lambda/lambda-deduce.C
A + gcc/testsuite/g++.dg/cpp0x/lambda/lambda-copy-default.C
A + gcc/testsuite/g++.dg/cpp0x/lambda/lambda-const-neg.C
A + gcc/testsuite/g++.dg/cpp0x/lambda/lambda-deduce-ext-neg.C
A + gcc/testsuite/g++.dg/cpp0x/lambda/lambda-ref-default.C
A + gcc/testsuite/g++.dg/cpp0x/lambda/lambda-mangle.C
A + gcc/testsuite/g++.dg/cpp0x/lambda/lambda-pass.C
A + gcc/testsuite/g++.dg/cpp0x/lambda/lambda-mixed.C
A + gcc/testsuite/g++.dg/cpp0x/lambda/lambda-this.C
A + gcc/testsuite/g++.dg/cpp0x/lambda/lambda-capture-const-ref-neg.C
A + gcc/testsuite/g++.dg/cpp0x/lambda/lambda-copy.C
A + gcc/testsuite/g++.dg/cpp0x/lambda/lambda-ref.C
A + gcc/testsuite/g++.dg/cpp0x/lambda/lambda-std-function.C
A + gcc/testsuite/g++.dg/cpp0x/lambda/lambda-ctor-neg.C
A + gcc/testsuite/g++.dg/cpp0x/lambda/lambda-non-const.C
A + gcc/testsuite/g++.dg/cpp0x/lambda/lambda-deduce-ext-neg2.C
A + gcc/testsuite/g++.dg/cpp0x/lambda/lambda-in-class-neg.C
A + gcc/testsuite/g++.dg/cpp0x/lambda/lambda-const.C
A + gcc/testsuite/g++.dg/cpp0x/lambda/lambda-deduce-ext.C
A + gcc/testsuite/g++.dg/cpp0x/lambda/lambda-ns-scope.C
A + gcc/testsuite/g++.dg/cpp0x/lambda/lambda-defarg.C
A + gcc/testsuite/g++.dg/cpp0x/lambda/lambda-mutable.C
A + gcc/testsuite/g++.dg/cpp0x/lambda/lambda-ctors.C
A + gcc/testsuite/g++.dg/cpp0x/lambda/lambda-use.C
A + gcc/testsuite/g++.dg/cpp0x/lambda/lambda-eh.C
A + gcc/testsuite/g++.dg/cpp0x/lambda/lambda-nop.C
A + gcc/testsuite/g++.dg/cpp0x/lambda/lambda-deduce-neg.C
A + gcc/testsuite/g++.dg/cpp0x/lambda/lambda-capture-const-ref.C
A + gcc/testsuite/g++.dg/cpp0x/lambda/lambda-copy-default-neg.C
A + gcc/testsuite/g++.dg/cpp0x/lambda/lambda-recursive.C
A + gcc/testsuite/g++.dg/cpp0x/lambda/lambda-in-class.C
A + gcc/testsuite/g++.dg/cpp0x/lambda/lambda-template.C
A + gcc/testsuite/g++.dg/cpp0x/lambda/lambda-array.C
_M gcc/testsuite/g++.dg/cpp0x/decltype-38655.C
A + gcc/testsuite/g++.dg/gomp/pr41429.C
A + gcc/testsuite/g++.dg/torture/pr38747.C
A + gcc/testsuite/g++.dg/dfp
A + gcc/testsuite/g++.dg/dfp/dfp.exp
A + gcc/testsuite/g++.dg/dfp/mangle-mode.C
M gcc/testsuite/objc.dg/dwarf-2.m
M gcc/testsuite/lib/profopt.exp
M gcc/testsuite/lib/scanasm.exp
M gcc/testsuite/lib/prune.exp
M gcc/testsuite/lib/target-supports.exp
M gcc/testsuite/gfortran.dg/warnings_are_errors_1.f
A + gcc/testsuite/gfortran.dg/block_2.f08
A + gcc/testsuite/gfortran.dg/block_4.f08
A + gcc/testsuite/gfortran.dg/round_1.f03
A + gcc/testsuite/gfortran.dg/block_6.f08
M gcc/testsuite/gfortran.dg/debug/pr37738.f
M gcc/testsuite/gfortran.dg/debug/pr35154-dwarf2.f
A + gcc/testsuite/gfortran.dg/block_8.f08
A + gcc/testsuite/gfortran.dg/empty_label.f
M gcc/testsuite/gfortran.dg/f2003_io_3.f03
A + gcc/testsuite/gfortran.dg/block_1.f08
A + gcc/testsuite/gfortran.dg/block_5.f08
A + gcc/testsuite/gfortran.dg/block_3.f90
A + gcc/testsuite/gfortran.dg/block_7.f08
A + gcc/testsuite/gfortran.dg/empty_label.f90
M gcc/testsuite/gfortran.dg/vect/vect.exp
M gcc/testsuite/c-c++-common/dfp/func-vararg-mixed.c
M gcc/testsuite/c-c++-common/dfp/func-vararg-alternate-d32.c
M gcc/testsuite/c-c++-common/dfp/func-vararg-mixed-2.c
M gcc/testsuite/c-c++-common/dfp/func-vararg-dfp.c
M gcc/unwind-dw2-fde-glibc.c
M gcc/df-scan.c
M gcc/objcp/Make-lang.in
M gcc/objcp/ChangeLog
M gcc/objcp/objcp-lang.c
M gcc/cp/typeck.c
M gcc/cp/class.c
M gcc/cp/decl.c
M gcc/cp/method.c
M gcc/cp/error.c
M gcc/cp/tree.c
M gcc/cp/ChangeLog
M gcc/cp/cp-gimplify.c
M gcc/cp/typeck2.c
M gcc/cp/cp-lang.c
M gcc/cp/pt.c
M gcc/cp/semantics.c
M gcc/cp/parser.c
M gcc/cp/cp-tree.def
M gcc/cp/cp-objcp-common.c
M gcc/cp/cp-objcp-common.h
_M gcc/cp/ChangeLog-2007
_M gcc/cp/ChangeLog-2008
M gcc/cp/mangle.c
M gcc/cp/cp-tree.h
M gcc/cp/search.c
M gcc/cp/name-lookup.c
M gcc/cp/lex.c
M gcc/tree-ssa-ccp.c
M gcc/builtins.def
M gcc/tree-ssa-dom.c
M gcc/tree-ssa-propagate.c
M gcc/tree-ssa-propagate.h
M gcc/crtstuff.c
M gcc/ifcvt.c
M gcc/dwarf2out.c
M gcc/expr.h
M gcc/libgcc2.c
M gcc/ada/ChangeLog
M gcc/ada/gcc-interface/utils.c
M gcc/ada/gcc-interface/Makefile.in
M gcc/ada/gcc-interface/decl.c
M gcc/ada/gcc-interface/targtyps.c
M gcc/ada/gcc-interface/utils2.c
M gcc/ada/gcc-interface/gigi.h
M gcc/ada/gcc-interface/trans.c
M gcc/ada/gcc-interface/ada-tree.h
M gcc/ada/gcc-interface/ada.h
M gcc/ada/gcc-interface/misc.c
M gcc/melt-runtime.c
M gcc/c-decl.c
M gcc/tree-eh.c
M gcc/fortran/decl.c
M gcc/fortran/gfortran.h
M gcc/fortran/error.c
M gcc/fortran/ChangeLog
M gcc/fortran/trans-stmt.c
M gcc/fortran/trans-stmt.h
M gcc/fortran/trans.c
M gcc/fortran/trans.h
M gcc/fortran/io.c
M gcc/fortran/resolve.c
M gcc/fortran/f95-lang.c
M gcc/fortran/st.c
M gcc/fortran/match.c
M gcc/fortran/trans-decl.c
M gcc/fortran/match.h
M gcc/fortran/parse.c
M gcc/fortran/parse.h
M gcc/fortran/simplify.c
M gcc/configure.ac
M gcc/function.c
M gcc/langhooks.h
M gcc/function.h
M gcc/stor-layout.c
M gcc/alias.c
M gcc/ggc.h
M gcc/c-typeck.c
M gcc/gcc-plugin.h
M gcc/calls.c
M gcc/tree-ssa-coalesce.c
M gcc/ggc-common.c
M gcc/tree-dfa.c
M gcc/except.c
M gcc/emit-rtl.c
M gcc/except.h
M gcc/cfgexpand.c
M gcc/c-opts.c
M gcc/loop-iv.c
M gcc/print-tree.c
M gcc/tree-ssa-copy.c
M gcc/tree-ssa-forwprop.c
M gcc/common.opt
M gcc/varasm.c
M gcc/tree-nested.c
M gcc/target-def.h
M gcc/rtl.h
M gcc/tree-vect-stmts.c
M gcc/tree-inline.c
M gcc/var-tracking.c
M gcc/system.h
M gcc/plugin.c
M gcc/c-common.c
M gcc/tree-flow.h
M gcc/c-common.h
M gcc/config.gcc
M gcc/Makefile.in
M gcc/tree-cfg.c
M gcc/passes.c
M gcc/c-parser.c
M gcc/config/alpha/vms.h
M gcc/config/alpha/alpha.c
M gcc/config/alpha/alpha.h
M gcc/config/alpha/openbsd.h
M gcc/config/alpha/alpha.md
M gcc/config/frv/frv.h
M gcc/config/frv/frv-protos.h
M gcc/config/frv/frv.c
M gcc/config/s390/s390.c
M gcc/config/s390/s390.h
M gcc/config/s390/s390-protos.h
M gcc/config/m32c/m32c.c
M gcc/config/m32c/m32c.h
M gcc/config/m32c/m32c-protos.h
M gcc/config/spu/spu-protos.h
M gcc/config/spu/spu.c
M gcc/config/spu/spu.h
M gcc/config/sparc/sparc.md
M gcc/config/sparc/sparc-protos.h
M gcc/config/sparc/sparc.c
M gcc/config/sparc/sparc.h
M gcc/config/mep/mep-protos.h
M gcc/config/mep/mep.c
M gcc/config/mep/mep.h
M gcc/config/m32r/m32r.c
M gcc/config/m32r/m32r.h
A + gcc/config/openbsd-stdint.h
M gcc/config/i386/i386.h
M gcc/config/i386/cygming.h
M gcc/config/i386/cygwin.h
M gcc/config/i386/i386.md
M gcc/config/i386/smmintrin.h
M gcc/config/i386/cpuid.h
M gcc/config/i386/x86intrin.h
M gcc/config/i386/sse.md
M gcc/config/i386/i386-c.c
M gcc/config/i386/i386.opt
A + gcc/config/i386/fma4intrin.h
M gcc/config/i386/openbsdelf.h
D gcc/config/i386/mmintrin-common.h
M gcc/config/i386/mingw32.h
M gcc/config/i386/i386-protos.h
M gcc/config/i386/i386.c
D gcc/config/sh/symbian.c
M gcc/config/sh/sh-protos.h
A + gcc/config/sh/symbian-base.c
A + gcc/config/sh/symbian-cxx.c
M gcc/config/sh/symbian-post.h
M gcc/config/sh/sh.c
A + gcc/config/sh/sh-symbian.h
M gcc/config/sh/sh.h
A + gcc/config/sh/symbian-c.c
M gcc/config/sh/t-symbian
M gcc/config/pdp11/pdp11.c
M gcc/config/pdp11/pdp11.h
M gcc/config/avr/avr.c
M gcc/config/avr/avr.h
M gcc/config/crx/crx.h
M gcc/config/xtensa/xtensa.c
M gcc/config/xtensa/xtensa.h
M gcc/config/xtensa/xtensa-protos.h
M gcc/config/stormy16/stormy16.c
M gcc/config/stormy16/stormy16.h
M gcc/config/fr30/fr30.h
M gcc/config/fr30/fr30.c
M gcc/config/moxie/moxie.c
M gcc/config/moxie/moxie.h
M gcc/config/m68hc11/m68hc11-protos.h
M gcc/config/m68hc11/m68hc11.c
M gcc/config/m68hc11/m68hc11.h
A + gcc/config/openbsd-libpthread.h
M gcc/config/cris/cris.c
M gcc/config/cris/cris.h
M gcc/config/netbsd.h
M gcc/config/iq2000/iq2000.h
M gcc/config/iq2000/iq2000.c
M gcc/config/mn10300/mn10300.c
M gcc/config/mn10300/mn10300.h
M gcc/config/ia64/predicates.md
M gcc/config/ia64/ia64.c
M gcc/config/ia64/ia64.h
M gcc/config/ia64/ia64.md
M gcc/config/ia64/ia64-protos.h
M gcc/config/m68k/openbsd.h
M gcc/config/m68k/m68k.c
M gcc/config/m68k/netbsd-elf.h
M gcc/config/m68k/m68k.h
M gcc/config/rs6000/predicates.md
M gcc/config/rs6000/rs6000-protos.h
M gcc/config/rs6000/rs6000.c
M gcc/config/rs6000/vsx.md
M gcc/config/rs6000/rs6000.h
M gcc/config/rs6000/rs6000.md
M gcc/config/picochip/picochip.c
M gcc/config/picochip/picochip.h
M gcc/config/picochip/picochip-protos.h
M gcc/config/arc/arc.c
M gcc/config/arc/arc.h
M gcc/config/mcore/mcore.c
M gcc/config/mcore/mcore.h
M gcc/config/darwin.c
M gcc/config/darwin.h
M gcc/config/score/score3.c
M gcc/config/score/score7.c
M gcc/config/score/score3.h
M gcc/config/score/score-protos.h
M gcc/config/score/score7.h
M gcc/config/score/score.c
M gcc/config/score/score.h
M gcc/config/arm/arm.c
M gcc/config/arm/arm.h
M gcc/config/pa/pa-protos.h
M gcc/config/pa/pa.c
M gcc/config/pa/pa.h
M gcc/config/mips/openbsd.h
M gcc/config/mips/mips.c
M gcc/config/mips/mips.h
M gcc/config/t-freebsd
M gcc/config/openbsd.h
M gcc/config/freebsd-spec.h
M gcc/config/vax/vax.c
M gcc/config/vax/openbsd.h
M gcc/config/vax/vax.h
M gcc/config/h8300/h8300.c
M gcc/config/h8300/h8300.h
M gcc/config/v850/v850.c
M gcc/config/v850/v850.h
M gcc/config/mmix/mmix.h
M gcc/config/mmix/mmix-protos.h
M gcc/config/mmix/mmix.c
M gcc/config/bfin/bfin-protos.h
M gcc/config/bfin/bfin.c
M gcc/config/bfin/bfin.h
M gcc/stmt.c
M gcc/collect2.c
M gcc/langhooks-def.h
M gcc/reload1.c
M config.sub
M libstdc++-v3/configure
M libstdc++-v3/doc/xml/manual/intro.xml
M libstdc++-v3/doc/html/ext/lwg-closed.html
M libstdc++-v3/doc/html/ext/lwg-active.html
M libstdc++-v3/doc/html/ext/lwg-defects.html
M libstdc++-v3/include/parallel/multiway_merge.h
M libstdc++-v3/include/parallel/find_selectors.h
M libstdc++-v3/include/parallel/losertree.h
M libstdc++-v3/include/parallel/list_partition.h
M libstdc++-v3/include/parallel/for_each.h
M libstdc++-v3/include/parallel/multiseq_selection.h
M libstdc++-v3/include/parallel/workstealing.h
M libstdc++-v3/include/parallel/base.h
M libstdc++-v3/include/parallel/par_loop.h
M libstdc++-v3/include/parallel/numeric
M libstdc++-v3/include/parallel/equally_split.h
M libstdc++-v3/include/parallel/for_each_selectors.h
M libstdc++-v3/include/parallel/omp_loop_static.h
M libstdc++-v3/include/parallel/random_shuffle.h
M libstdc++-v3/include/parallel/merge.h
M libstdc++-v3/include/parallel/multiway_mergesort.h
M libstdc++-v3/include/parallel/numericfwd.h
M libstdc++-v3/include/parallel/search.h
M libstdc++-v3/include/parallel/partition.h
M libstdc++-v3/include/parallel/algobase.h
M libstdc++-v3/include/parallel/find.h
M libstdc++-v3/include/parallel/algo.h
M libstdc++-v3/include/parallel/omp_loop.h
M libstdc++-v3/include/parallel/checkers.h
M libstdc++-v3/include/parallel/sort.h
M libstdc++-v3/include/bits/random.tcc
M libstdc++-v3/include/tr1/random.tcc
M libstdc++-v3/libsupc++/eh_terminate.cc
M libstdc++-v3/libsupc++/vec.cc
M libstdc++-v3/libsupc++/vterminate.cc
M libstdc++-v3/libsupc++/new_opnt.cc
M libstdc++-v3/ChangeLog
_M libstdc++-v3/testsuite/27_io/basic_ofstream/cons/char/1.cc
_M libstdc++-v3/testsuite/27_io/basic_ofstream/pthread2.cc
_M libstdc++-v3/testsuite/27_io/basic_fstream/open/char/1.cc
_M libstdc++-v3/testsuite/27_io/basic_fstream/cons/1.cc
_M libstdc++-v3/testsuite/27_io/basic_ostringstream/pthread3.cc
_M libstdc++-v3/testsuite/18_support/pthread_guard.cc
_M libstdc++-v3/testsuite/21_strings/basic_string/pthread18185.cc
_M libstdc++-v3/testsuite/21_strings/basic_string/pthread4.cc
_M libstdc++-v3/testsuite/30_threads/condition_variable/members
_M libstdc++-v3/testsuite/30_threads/this_thread
_M libstdc++-v3/testsuite/30_threads/thread/members
_M libstdc++-v3/testsuite/30_threads/thread/swap
A + libstdc++-v3/testsuite/26_numerics/random/subtract_with_carry_engine/requirements/constants.cc
A + libstdc++-v3/testsuite/26_numerics/random/mersenne_twister_engine/requirements/constants.cc
A + libstdc++-v3/testsuite/26_numerics/random/linear_congruential_engine/requirements/constants.cc
A + libstdc++-v3/testsuite/26_numerics/random/shuffle_order_engine/requirements/constants.cc
_M libstdc++-v3/testsuite/ext/rope/pthread7-rope.cc
A + libstdc++-v3/testsuite/tr1/5_numerical_facilities/random/subtract_with_carry_01/requirements/constants.cc
A + libstdc++-v3/testsuite/tr1/5_numerical_facilities/random/subtract_with_carry/requirements/constants.cc
A + libstdc++-v3/testsuite/tr1/5_numerical_facilities/random/discard_block/requirements/constants.cc
A + libstdc++-v3/testsuite/tr1/5_numerical_facilities/random/linear_congruential/requirements/constants.cc
A + libstdc++-v3/testsuite/tr1/5_numerical_facilities/random/mersenne_twister/requirements/constants.cc
A + libstdc++-v3/testsuite/tr1/5_numerical_facilities/random/xor_combine/requirements/constants.cc
_M libstdc++-v3/testsuite/23_containers/list/pthread1.cc
_M libstdc++-v3/testsuite/23_containers/list/pthread5.cc
_M libstdc++-v3/testsuite/23_containers/map/pthread6.cc
_M libstdc++-v3/testsuite/20_util/unique_ptr/assign/assign_neg.cc
_M libstdc++-v3/testsuite/20_util/ratio/cons/cons_overflow_neg.cc
_M libstdc++-v3/testsuite/20_util/ratio/operations/ops_overflow_neg.cc
M libstdc++-v3/acinclude.m4
M configure.ac
M libgfortran/configure
M libgfortran/ChangeLog
M libgfortran/config.h.in
M libgfortran/configure.ac
M libgfortran/io/io.h
M libgfortran/io/unit.c
M libgfortran/io/transfer.c
M libgfortran/io/format.c
M libgfortran/io/write_float.def
M Makefile.tpl
M libmudflap/ChangeLog
M libmudflap/mf-runtime.c
M include/ChangeLog
M include/demangle.h
M include/dwarf2.h
M libiberty/ChangeLog
M libiberty/testsuite/demangle-expected
M libiberty/cplus-dem.c
M libiberty/Makefile.in
M libiberty/choose-temp.c
M libiberty/cp-demangle.c
M ChangeLog
M ChangeLog.MELT
M libffi/src/arm/sysv.S
M libffi/src/x86/win64.S
M libffi/ChangeLog
M libjava/Makefile.in
M libjava/libltdl/Makefile.in
M libjava/libltdl/ltdl.h
M libjava/libltdl/ChangeLog
M libjava/libltdl/Makefile.am
M libjava/configure.host
M libjava/configure.ac
_M libjava/classpath
M libjava/ChangeLog
M libjava/sysdep/i386/backtrace.h
M libjava/configure
M libjava/Makefile.am
M libcpp/configure
M libcpp/files.c
M libcpp/ChangeLog
M libcpp/configure.ac
M MAINTAINERS
git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/branches/melt-branch@152327 138bc75d-0d04-0410-961f-82ee72b054a4
Diffstat (limited to 'gcc/config/i386')
-rw-r--r-- | gcc/config/i386/cpuid.h | 2 | ||||
-rw-r--r-- | gcc/config/i386/cygming.h | 2 | ||||
-rw-r--r-- | gcc/config/i386/cygwin.h | 4 | ||||
-rw-r--r-- | gcc/config/i386/fma4intrin.h | 245 | ||||
-rw-r--r-- | gcc/config/i386/i386-c.c | 2 | ||||
-rw-r--r-- | gcc/config/i386/i386-protos.h | 3 | ||||
-rw-r--r-- | gcc/config/i386/i386.c | 760 | ||||
-rw-r--r-- | gcc/config/i386/i386.h | 60 | ||||
-rw-r--r-- | gcc/config/i386/i386.md | 4 | ||||
-rw-r--r-- | gcc/config/i386/i386.opt | 4 | ||||
-rw-r--r-- | gcc/config/i386/mingw32.h | 4 | ||||
-rw-r--r-- | gcc/config/i386/mmintrin-common.h | 154 | ||||
-rw-r--r-- | gcc/config/i386/openbsdelf.h | 7 | ||||
-rw-r--r-- | gcc/config/i386/smmintrin.h | 120 | ||||
-rw-r--r-- | gcc/config/i386/sse.md | 936 | ||||
-rw-r--r-- | gcc/config/i386/x86intrin.h | 6 |
16 files changed, 2038 insertions, 275 deletions
diff --git a/gcc/config/i386/cpuid.h b/gcc/config/i386/cpuid.h index 2d0916fb36a..49acfa780e4 100644 --- a/gcc/config/i386/cpuid.h +++ b/gcc/config/i386/cpuid.h @@ -48,7 +48,7 @@ /* %ecx */ #define bit_LAHF_LM (1 << 0) #define bit_SSE4a (1 << 6) -#define bit_SSE5 (1 << 11) +#define bit_FMA4 (1 << 16) /* %edx */ #define bit_LM (1 << 29) diff --git a/gcc/config/i386/cygming.h b/gcc/config/i386/cygming.h index 5fc0c1d41c5..43003cc5cad 100644 --- a/gcc/config/i386/cygming.h +++ b/gcc/config/i386/cygming.h @@ -383,7 +383,7 @@ do { \ /* FIXME: SUPPORTS_WEAK && TARGET_HAVE_NAMED_SECTIONS is true, but for .jcr section to work we also need crtbegin and crtend objects. */ -#define TARGET_USE_JCR_SECTION 0 +#define TARGET_USE_JCR_SECTION 1 /* Decide whether it is safe to use a local alias for a virtual function when constructing thunks. */ diff --git a/gcc/config/i386/cygwin.h b/gcc/config/i386/cygwin.h index d1d00f7a79b..933263769f9 100644 --- a/gcc/config/i386/cygwin.h +++ b/gcc/config/i386/cygwin.h @@ -267,3 +267,7 @@ while (0) #define LIBGCC_EH_EXTN "-sjlj" #endif #define LIBGCC_SONAME "cyggcc_s" LIBGCC_EH_EXTN "-1.dll" + +/* We should find a way to not have to update this manually. */ +#define LIBGCJ_SONAME "cyggcj" /*LIBGCC_EH_EXTN*/ "-11.dll" + diff --git a/gcc/config/i386/fma4intrin.h b/gcc/config/i386/fma4intrin.h new file mode 100644 index 00000000000..42782ade0ed --- /dev/null +++ b/gcc/config/i386/fma4intrin.h @@ -0,0 +1,245 @@ +/* Copyright (C) 2007, 2008, 2009 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +#ifndef _X86INTRIN_H_INCLUDED +# error "Never use <fma4intrin.h> directly; include <x86intrin.h> instead." +#endif + +#ifndef _FMA4INTRIN_H_INCLUDED +#define _FMA4INTRIN_H_INCLUDED + +#ifndef __FMA4__ +# error "FMA4 instruction set not enabled" +#else + +/* We need definitions from the SSE4A, SSE3, SSE2 and SSE header files. */ +#include <ammintrin.h> + +/* Internal data types for implementing the intrinsics. */ +typedef float __v8sf __attribute__ ((__vector_size__ (32))); +typedef double __v4df __attribute__ ((__vector_size__ (32))); + +typedef float __m256 __attribute__ ((__vector_size__ (32), + __may_alias__)); +typedef double __m256d __attribute__ ((__vector_size__ (32), + __may_alias__)); + +/* 128b Floating point multiply/add type instructions. 
*/ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_macc_ps (__m128 __A, __m128 __B, __m128 __C) +{ + return (__m128) __builtin_ia32_vfmaddps ((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_macc_pd (__m128d __A, __m128d __B, __m128d __C) +{ + return (__m128d) __builtin_ia32_vfmaddpd ((__v2df)__A, (__v2df)__B, (__v2df)__C); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_macc_ss (__m128 __A, __m128 __B, __m128 __C) +{ + return (__m128) __builtin_ia32_vfmaddss ((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_macc_sd (__m128d __A, __m128d __B, __m128d __C) +{ + return (__m128d) __builtin_ia32_vfmaddsd ((__v2df)__A, (__v2df)__B, (__v2df)__C); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_msub_ps (__m128 __A, __m128 __B, __m128 __C) + +{ + return (__m128) __builtin_ia32_vfmsubps ((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_msub_pd (__m128d __A, __m128d __B, __m128d __C) +{ + return (__m128d) __builtin_ia32_vfmsubpd ((__v2df)__A, (__v2df)__B, (__v2df)__C); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_msub_ss (__m128 __A, __m128 __B, __m128 __C) +{ + return (__m128) __builtin_ia32_vfmsubss ((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_msub_sd (__m128d __A, __m128d __B, __m128d __C) +{ + return (__m128d) __builtin_ia32_vfmsubsd ((__v2df)__A, (__v2df)__B, (__v2df)__C); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_nmacc_ps (__m128 __A, 
__m128 __B, __m128 __C) +{ + return (__m128) __builtin_ia32_vfnmaddps ((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_nmacc_pd (__m128d __A, __m128d __B, __m128d __C) +{ + return (__m128d) __builtin_ia32_vfnmaddpd ((__v2df)__A, (__v2df)__B, (__v2df)__C); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_nmacc_ss (__m128 __A, __m128 __B, __m128 __C) +{ + return (__m128) __builtin_ia32_vfnmaddss ((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_nmacc_sd (__m128d __A, __m128d __B, __m128d __C) +{ + return (__m128d) __builtin_ia32_vfnmaddsd ((__v2df)__A, (__v2df)__B, (__v2df)__C); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_nmsub_ps (__m128 __A, __m128 __B, __m128 __C) +{ + return (__m128) __builtin_ia32_vfnmsubps ((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_nmsub_pd (__m128d __A, __m128d __B, __m128d __C) +{ + return (__m128d) __builtin_ia32_vfnmsubpd ((__v2df)__A, (__v2df)__B, (__v2df)__C); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_nmsub_ss (__m128 __A, __m128 __B, __m128 __C) +{ + return (__m128) __builtin_ia32_vfnmsubss ((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_nmsub_sd (__m128d __A, __m128d __B, __m128d __C) +{ + return (__m128d) __builtin_ia32_vfnmsubsd ((__v2df)__A, (__v2df)__B, (__v2df)__C); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maddsub_ps (__m128 __A, __m128 __B, __m128 __C) +{ + return (__m128) __builtin_ia32_vfmaddsubps ((__v4sf)__A, (__v4sf)__B, 
(__v4sf)__C); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maddsub_pd (__m128d __A, __m128d __B, __m128d __C) +{ + return (__m128d) __builtin_ia32_vfmaddsubpd ((__v2df)__A, (__v2df)__B, (__v2df)__C); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_msubadd_ps (__m128 __A, __m128 __B, __m128 __C) +{ + return (__m128) __builtin_ia32_vfmsubaddps ((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_msubadd_pd (__m128d __A, __m128d __B, __m128d __C) +{ + return (__m128d) __builtin_ia32_vfmsubaddpd ((__v2df)__A, (__v2df)__B, (__v2df)__C); +} + +/* 256b Floating point multiply/add type instructions. */ +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_macc_ps (__m256 __A, __m256 __B, __m256 __C) +{ + return (__m256) __builtin_ia32_vfmaddps256 ((__v8sf)__A, (__v8sf)__B, (__v8sf)__C); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_macc_pd (__m256d __A, __m256d __B, __m256d __C) +{ + return (__m256d) __builtin_ia32_vfmaddpd256 ((__v4df)__A, (__v4df)__B, (__v4df)__C); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_msub_ps (__m256 __A, __m256 __B, __m256 __C) + +{ + return (__m256) __builtin_ia32_vfmsubps256 ((__v8sf)__A, (__v8sf)__B, (__v8sf)__C); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_msub_pd (__m256d __A, __m256d __B, __m256d __C) +{ + return (__m256d) __builtin_ia32_vfmsubpd256 ((__v4df)__A, (__v4df)__B, (__v4df)__C); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_nmacc_ps (__m256 __A, __m256 __B, __m256 __C) +{ + return (__m256) __builtin_ia32_vfnmaddps256 ((__v8sf)__A, (__v8sf)__B, 
(__v8sf)__C); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_nmacc_pd (__m256d __A, __m256d __B, __m256d __C) +{ + return (__m256d) __builtin_ia32_vfnmaddpd256 ((__v4df)__A, (__v4df)__B, (__v4df)__C); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_nmsub_ps (__m256 __A, __m256 __B, __m256 __C) +{ + return (__m256) __builtin_ia32_vfnmsubps256 ((__v8sf)__A, (__v8sf)__B, (__v8sf)__C); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_nmsub_pd (__m256d __A, __m256d __B, __m256d __C) +{ + return (__m256d) __builtin_ia32_vfnmsubpd256 ((__v4df)__A, (__v4df)__B, (__v4df)__C); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maddsub_ps (__m256 __A, __m256 __B, __m256 __C) +{ + return (__m256) __builtin_ia32_vfmaddsubps256 ((__v8sf)__A, (__v8sf)__B, (__v8sf)__C); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maddsub_pd (__m256d __A, __m256d __B, __m256d __C) +{ + return (__m256d) __builtin_ia32_vfmaddsubpd256 ((__v4df)__A, (__v4df)__B, (__v4df)__C); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_msubadd_ps (__m256 __A, __m256 __B, __m256 __C) +{ + return (__m256) __builtin_ia32_vfmsubaddps256 ((__v8sf)__A, (__v8sf)__B, (__v8sf)__C); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_msubadd_pd (__m256d __A, __m256d __B, __m256d __C) +{ + return (__m256d) __builtin_ia32_vfmsubaddpd256 ((__v4df)__A, (__v4df)__B, (__v4df)__C); +} + +#endif + +#endif diff --git a/gcc/config/i386/i386-c.c b/gcc/config/i386/i386-c.c index 4c960e7d444..12a3f1759a8 100644 --- a/gcc/config/i386/i386-c.c +++ b/gcc/config/i386/i386-c.c @@ -230,6 +230,8 @@ ix86_target_macros_internal (int isa_flag, def_or_undef 
(parse_in, "__FMA__"); if (isa_flag & OPTION_MASK_ISA_SSE4A) def_or_undef (parse_in, "__SSE4A__"); + if (isa_flag & OPTION_MASK_ISA_FMA4) + def_or_undef (parse_in, "__FMA4__"); if ((fpmath & FPMATH_SSE) && (isa_flag & OPTION_MASK_ISA_SSE)) def_or_undef (parse_in, "__SSE_MATH__"); if ((fpmath & FPMATH_SSE) && (isa_flag & OPTION_MASK_ISA_SSE2)) diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h index 389fc3cddd7..58da13168de 100644 --- a/gcc/config/i386/i386-protos.h +++ b/gcc/config/i386/i386-protos.h @@ -214,6 +214,9 @@ extern void ix86_expand_vector_set (bool, rtx, rtx, int); extern void ix86_expand_vector_extract (bool, rtx, rtx, int); extern void ix86_expand_reduc_v4sf (rtx (*)(rtx, rtx, rtx), rtx, rtx); +extern bool ix86_fma4_valid_op_p (rtx [], rtx, int, bool, int, bool); +extern void ix86_expand_fma4_multiple_memory (rtx [], int, enum machine_mode); + /* In i386-c.c */ extern void ix86_target_macros (void); extern void ix86_register_pragmas (void); diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 335a5260bd1..9df01ba23dc 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -1879,6 +1879,7 @@ static bool ext_80387_constants_init = 0; static struct machine_function * ix86_init_machine_status (void); static rtx ix86_function_value (const_tree, const_tree, bool); +static rtx ix86_static_chain (const_tree, bool); static int ix86_function_regparm (const_tree, const_tree); static void ix86_compute_frame_layout (struct ix86_frame *); static bool ix86_expand_vector_init_one_nonzero (bool, enum machine_mode, @@ -1954,6 +1955,9 @@ static int ix86_isa_flags_explicit; #define OPTION_MASK_ISA_SSE4A_SET \ (OPTION_MASK_ISA_SSE4A | OPTION_MASK_ISA_SSE3_SET) +#define OPTION_MASK_ISA_FMA4_SET \ + (OPTION_MASK_ISA_FMA4 | OPTION_MASK_ISA_SSE4A_SET \ + | OPTION_MASK_ISA_AVX_SET) /* AES and PCLMUL need SSE2 because they use xmm registers */ #define OPTION_MASK_ISA_AES_SET \ @@ -1994,7 +1998,8 @@ static int 
ix86_isa_flags_explicit; #define OPTION_MASK_ISA_SSE4_2_UNSET \ (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_AVX_UNSET ) #define OPTION_MASK_ISA_AVX_UNSET \ - (OPTION_MASK_ISA_AVX | OPTION_MASK_ISA_FMA_UNSET) + (OPTION_MASK_ISA_AVX | OPTION_MASK_ISA_FMA_UNSET \ + | OPTION_MASK_ISA_FMA4_UNSET) #define OPTION_MASK_ISA_FMA_UNSET OPTION_MASK_ISA_FMA /* SSE4 includes both SSE4.1 and SSE4.2. -mno-sse4 should the same @@ -2002,7 +2007,10 @@ static int ix86_isa_flags_explicit; #define OPTION_MASK_ISA_SSE4_UNSET OPTION_MASK_ISA_SSE4_1_UNSET #define OPTION_MASK_ISA_SSE4A_UNSET \ - (OPTION_MASK_ISA_SSE4A) + (OPTION_MASK_ISA_SSE4A | OPTION_MASK_ISA_FMA4_UNSET) + +#define OPTION_MASK_ISA_FMA4_UNSET OPTION_MASK_ISA_FMA4 + #define OPTION_MASK_ISA_AES_UNSET OPTION_MASK_ISA_AES #define OPTION_MASK_ISA_PCLMUL_UNSET OPTION_MASK_ISA_PCLMUL #define OPTION_MASK_ISA_ABM_UNSET OPTION_MASK_ISA_ABM @@ -2236,6 +2244,19 @@ ix86_handle_option (size_t code, const char *arg ATTRIBUTE_UNUSED, int value) } return true; + case OPT_mfma4: + if (value) + { + ix86_isa_flags |= OPTION_MASK_ISA_FMA4_SET; + ix86_isa_flags_explicit |= OPTION_MASK_ISA_FMA4_SET; + } + else + { + ix86_isa_flags &= ~OPTION_MASK_ISA_FMA4_UNSET; + ix86_isa_flags_explicit |= OPTION_MASK_ISA_FMA4_UNSET; + } + return true; + case OPT_mabm: if (value) { @@ -2363,6 +2384,7 @@ ix86_target_string (int isa, int flags, const char *arch, const char *tune, static struct ix86_target_opts isa_opts[] = { { "-m64", OPTION_MASK_ISA_64BIT }, + { "-mfma4", OPTION_MASK_ISA_FMA4 }, { "-msse4a", OPTION_MASK_ISA_SSE4A }, { "-msse4.2", OPTION_MASK_ISA_SSE4_2 }, { "-msse4.1", OPTION_MASK_ISA_SSE4_1 }, @@ -2592,7 +2614,8 @@ override_options (bool main_args_p) PTA_PCLMUL = 1 << 17, PTA_AVX = 1 << 18, PTA_FMA = 1 << 19, - PTA_MOVBE = 1 << 20 + PTA_MOVBE = 1 << 20, + PTA_FMA4 = 1 << 21 }; static struct pta @@ -2935,6 +2958,9 @@ override_options (bool main_args_p) if (processor_alias_table[i].flags & PTA_SSE4A && !(ix86_isa_flags_explicit & 
OPTION_MASK_ISA_SSE4A)) ix86_isa_flags |= OPTION_MASK_ISA_SSE4A; + if (processor_alias_table[i].flags & PTA_FMA4 + && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4)) + ix86_isa_flags |= OPTION_MASK_ISA_FMA4; if (processor_alias_table[i].flags & PTA_ABM && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM)) ix86_isa_flags |= OPTION_MASK_ISA_ABM; @@ -3618,6 +3644,7 @@ ix86_valid_target_attribute_inner_p (tree args, char *p_strings[]) IX86_ATTR_ISA ("sse4.2", OPT_msse4_2), IX86_ATTR_ISA ("sse4a", OPT_msse4a), IX86_ATTR_ISA ("ssse3", OPT_mssse3), + IX86_ATTR_ISA ("fma4", OPT_mfma4), /* string options */ IX86_ATTR_STR ("arch=", IX86_FUNCTION_SPECIFIC_ARCH), @@ -4472,8 +4499,6 @@ ix86_function_regparm (const_tree type, const_tree decl) tree attr; int regparm; - static bool error_issued; - if (TARGET_64BIT) return (ix86_function_type_abi (type) == SYSV_ABI ? X86_64_REGPARM_MAX : X86_64_MS_REGPARM_MAX); @@ -4482,23 +4507,7 @@ ix86_function_regparm (const_tree type, const_tree decl) attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type)); if (attr) { - regparm - = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))); - - if (decl && TREE_CODE (decl) == FUNCTION_DECL) - { - /* We can't use regparm(3) for nested functions because - these pass static chain pointer in %ecx register. */ - if (!error_issued && regparm == 3 - && decl_function_context (decl) - && !DECL_NO_STATIC_CHAIN (decl)) - { - error ("nested functions are limited to 2 register parameters"); - error_issued = true; - return 0; - } - } - + regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))); return regparm; } @@ -4512,7 +4521,7 @@ ix86_function_regparm (const_tree type, const_tree decl) && !profile_flag) { /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. 
*/ - struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE(decl)); + struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE (decl)); if (i && i->local) { int local_regparm, globals = 0, regno; @@ -4523,11 +4532,9 @@ ix86_function_regparm (const_tree type, const_tree decl) if (fixed_regs[local_regparm]) break; - /* We can't use regparm(3) for nested functions as these use - static chain pointer in third argument. */ - if (local_regparm == 3 - && decl_function_context (decl) - && !DECL_NO_STATIC_CHAIN (decl)) + /* We don't want to use regparm(3) for nested functions as + these use a static chain pointer in the third argument. */ + if (local_regparm == 3 && DECL_STATIC_CHAIN (decl)) local_regparm = 2; /* Each fixed register usage increases register pressure, @@ -7873,9 +7880,16 @@ ix86_compute_frame_layout (struct ix86_frame *frame) else frame->save_regs_using_mov = false; + /* Skip return address. */ + offset = UNITS_PER_WORD; - /* Skip return address and saved base pointer. */ - offset = frame_pointer_needed ? UNITS_PER_WORD * 2 : UNITS_PER_WORD; + /* Skip pushed static chain. */ + if (ix86_static_chain_on_stack) + offset += UNITS_PER_WORD; + + /* Skip saved base pointer. */ + if (frame_pointer_needed) + offset += UNITS_PER_WORD; frame->hard_frame_pointer_offset = offset; @@ -8138,9 +8152,7 @@ find_drap_reg (void) Since function with tail call may use any caller-saved registers in epilogue, DRAP must not use caller-saved register in such case. */ - if ((decl_function_context (decl) - && !DECL_NO_STATIC_CHAIN (decl)) - || crtl->tail_call_emit) + if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit) return R13_REG; return R10_REG; @@ -8151,9 +8163,7 @@ find_drap_reg (void) Since function with tail call may use any caller-saved registers in epilogue, DRAP must not use caller-saved register in such case. 
*/ - if ((decl_function_context (decl) - && !DECL_NO_STATIC_CHAIN (decl)) - || crtl->tail_call_emit) + if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit) return DI_REG; /* Reuse static chain register if it isn't used for parameter @@ -8297,20 +8307,42 @@ ix86_expand_prologue (void) ix86_compute_frame_layout (&frame); + /* The first insn of a function that accepts its static chain on the + stack is to push the register that would be filled in by a direct + call. This insn will be skipped by the trampoline. */ + if (ix86_static_chain_on_stack) + { + rtx t; + + insn = emit_insn (gen_push (ix86_static_chain (cfun->decl, false))); + emit_insn (gen_blockage ()); + + /* We don't want to interpret this push insn as a register save, + only as a stack adjustment. The real copy of the register as + a save will be done later, if needed. */ + t = plus_constant (stack_pointer_rtx, -UNITS_PER_WORD); + t = gen_rtx_SET (VOIDmode, stack_pointer_rtx, t); + add_reg_note (insn, REG_CFA_ADJUST_CFA, t); + RTX_FRAME_RELATED_P (insn) = 1; + } + /* Emit prologue code to adjust stack alignment and setup DRAP, in case of DRAP is needed and stack realignment is really needed after reload */ if (crtl->drap_reg && crtl->stack_realign_needed) { rtx x, y; int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT; - int param_ptr_offset = (call_used_regs[REGNO (crtl->drap_reg)] - ? 0 : UNITS_PER_WORD); + int param_ptr_offset = UNITS_PER_WORD; + + if (ix86_static_chain_on_stack) + param_ptr_offset += UNITS_PER_WORD; + if (!call_used_regs[REGNO (crtl->drap_reg)]) + param_ptr_offset += UNITS_PER_WORD; gcc_assert (stack_realign_drap); /* Grab the argument pointer. 
*/ - x = plus_constant (stack_pointer_rtx, - (UNITS_PER_WORD + param_ptr_offset)); + x = plus_constant (stack_pointer_rtx, param_ptr_offset); y = crtl->drap_reg; /* Only need to push parameter pointer reg if it is caller @@ -8519,14 +8551,18 @@ ix86_expand_prologue (void) /* vDRAP is setup but after reload it turns out stack realign isn't necessary, here we will emit prologue to setup DRAP without stack realign adjustment */ + rtx x; int drap_bp_offset = UNITS_PER_WORD * 2; - rtx x = plus_constant (hard_frame_pointer_rtx, drap_bp_offset); + + if (ix86_static_chain_on_stack) + drap_bp_offset += UNITS_PER_WORD; + x = plus_constant (hard_frame_pointer_rtx, drap_bp_offset); insn = emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, x)); } /* Prevent instructions from being scheduled into register save push sequence when access to the redzone area is done through frame pointer. - The offset betweeh the frame pointer and the stack pointer is calculated + The offset between the frame pointer and the stack pointer is calculated relative to the value of the stack pointer at the end of the function prologue, and moving instructions that access redzone area via frame pointer inside push sequence violates this assumption. 
*/ @@ -8575,11 +8611,11 @@ ix86_emit_restore_reg_using_pop (rtx reg, HOST_WIDE_INT red_offset) && reg == hard_frame_pointer_rtx) { ix86_cfa_state->reg = stack_pointer_rtx; - ix86_cfa_state->offset = UNITS_PER_WORD; + ix86_cfa_state->offset -= UNITS_PER_WORD; add_reg_note (insn, REG_CFA_DEF_CFA, gen_rtx_PLUS (Pmode, stack_pointer_rtx, - GEN_INT (UNITS_PER_WORD))); + GEN_INT (ix86_cfa_state->offset))); RTX_FRAME_RELATED_P (insn) = 1; } @@ -8613,6 +8649,9 @@ ix86_emit_leave (HOST_WIDE_INT red_offset) if (ix86_cfa_state->reg == hard_frame_pointer_rtx) { + ix86_cfa_state->reg = stack_pointer_rtx; + ix86_cfa_state->offset -= UNITS_PER_WORD; + add_reg_note (insn, REG_CFA_ADJUST_CFA, copy_rtx (XVECEXP (PATTERN (insn), 0, 0))); RTX_FRAME_RELATED_P (insn) = 1; @@ -8757,6 +8796,8 @@ ix86_expand_epilogue (int style) else if (stack_realign_fp) red_offset -= crtl->stack_alignment_needed / BITS_PER_UNIT - UNITS_PER_WORD; + if (ix86_static_chain_on_stack) + red_offset -= UNITS_PER_WORD; if (frame_pointer_needed) red_offset -= UNITS_PER_WORD; @@ -8829,6 +8870,8 @@ ix86_expand_epilogue (int style) /* Stack align doesn't work with eh_return. */ gcc_assert (!crtl->stack_realign_needed); + /* Neither does regparm nested functions. */ + gcc_assert (!ix86_static_chain_on_stack); if (frame_pointer_needed) { @@ -8961,29 +9004,50 @@ ix86_expand_epilogue (int style) if (using_drap) { - int param_ptr_offset = (call_used_regs[REGNO (crtl->drap_reg)] - ? 
0 : UNITS_PER_WORD); + int param_ptr_offset = UNITS_PER_WORD; rtx insn; gcc_assert (stack_realign_drap); + if (ix86_static_chain_on_stack) + param_ptr_offset += UNITS_PER_WORD; + if (!call_used_regs[REGNO (crtl->drap_reg)]) + param_ptr_offset += UNITS_PER_WORD; + insn = emit_insn ((*ix86_gen_add3) (stack_pointer_rtx, crtl->drap_reg, - GEN_INT (-(UNITS_PER_WORD - + param_ptr_offset)))); + GEN_INT (-param_ptr_offset))); ix86_cfa_state->reg = stack_pointer_rtx; - ix86_cfa_state->offset = UNITS_PER_WORD + param_ptr_offset; + ix86_cfa_state->offset = param_ptr_offset; add_reg_note (insn, REG_CFA_DEF_CFA, gen_rtx_PLUS (Pmode, ix86_cfa_state->reg, GEN_INT (ix86_cfa_state->offset))); RTX_FRAME_RELATED_P (insn) = 1; - if (param_ptr_offset) + if (!call_used_regs[REGNO (crtl->drap_reg)]) ix86_emit_restore_reg_using_pop (crtl->drap_reg, -UNITS_PER_WORD); } + /* Remove the saved static chain from the stack. The use of ECX is + merely as a scratch register, not as the actual static chain. */ + if (ix86_static_chain_on_stack) + { + rtx r, insn; + + gcc_assert (ix86_cfa_state->reg == stack_pointer_rtx); + ix86_cfa_state->offset += UNITS_PER_WORD; + + r = gen_rtx_REG (Pmode, CX_REG); + insn = emit_insn (ix86_gen_pop1 (r)); + + r = plus_constant (stack_pointer_rtx, UNITS_PER_WORD); + r = gen_rtx_SET (VOIDmode, stack_pointer_rtx, r); + add_reg_note (insn, REG_CFA_ADJUST_CFA, r); + RTX_FRAME_RELATED_P (insn) = 1; + } + /* Sibcall epilogues don't want a return instruction. */ if (style == 0) { @@ -19759,66 +19823,162 @@ ix86_minimum_alignment (tree exp, enum machine_mode mode, return align; } +/* Find a location for the static chain incoming to a nested function. + This is a register, unless all free registers are used by arguments. */ + +static rtx +ix86_static_chain (const_tree fndecl, bool incoming_p) +{ + unsigned regno; + + if (!DECL_STATIC_CHAIN (fndecl)) + return NULL; + + if (TARGET_64BIT) + { + /* We always use R10 in 64-bit mode. 
*/ + regno = R10_REG; + } + else + { + tree fntype; + /* By default in 32-bit mode we use ECX to pass the static chain. */ + regno = CX_REG; + + fntype = TREE_TYPE (fndecl); + if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype))) + { + /* Fastcall functions use ecx/edx for arguments, which leaves + us with EAX for the static chain. */ + regno = AX_REG; + } + else if (ix86_function_regparm (fntype, fndecl) == 3) + { + /* For regparm 3, we have no free call-clobbered registers in + which to store the static chain. In order to implement this, + we have the trampoline push the static chain to the stack. + However, we can't push a value below the return address when + we call the nested function directly, so we have to use an + alternate entry point. For this we use ESI, and have the + alternate entry point push ESI, so that things appear the + same once we're executing the nested function. */ + if (incoming_p) + { + if (fndecl == current_function_decl) + ix86_static_chain_on_stack = true; + return gen_frame_mem (SImode, + plus_constant (arg_pointer_rtx, -8)); + } + regno = SI_REG; + } + } + + return gen_rtx_REG (Pmode, regno); +} + /* Emit RTL insns to initialize the variable parts of a trampoline. - FNADDR is an RTX for the address of the function's pure code. - CXT is an RTX for the static chain value for the function. */ -void -x86_initialize_trampoline (rtx tramp, rtx fnaddr, rtx cxt) + FNDECL is the decl of the target address; M_TRAMP is a MEM for + the trampoline, and CHAIN_VALUE is an RTX for the static chain + to be passed to the target function. */ + +static void +ix86_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value) { + rtx mem, fnaddr; + + fnaddr = XEXP (DECL_RTL (fndecl), 0); + if (!TARGET_64BIT) { - /* Compute offset from the end of the jmp to the target function. 
*/ - rtx disp = expand_binop (SImode, sub_optab, fnaddr, - plus_constant (tramp, 10), - NULL_RTX, 1, OPTAB_DIRECT); - emit_move_insn (gen_rtx_MEM (QImode, tramp), - gen_int_mode (0xb9, QImode)); - emit_move_insn (gen_rtx_MEM (SImode, plus_constant (tramp, 1)), cxt); - emit_move_insn (gen_rtx_MEM (QImode, plus_constant (tramp, 5)), - gen_int_mode (0xe9, QImode)); - emit_move_insn (gen_rtx_MEM (SImode, plus_constant (tramp, 6)), disp); + rtx disp, chain; + int opcode; + + /* Depending on the static chain location, either load a register + with a constant, or push the constant to the stack. All of the + instructions are the same size. */ + chain = ix86_static_chain (fndecl, true); + if (REG_P (chain)) + { + if (REGNO (chain) == CX_REG) + opcode = 0xb9; + else if (REGNO (chain) == AX_REG) + opcode = 0xb8; + else + gcc_unreachable (); + } + else + opcode = 0x68; + + mem = adjust_address (m_tramp, QImode, 0); + emit_move_insn (mem, gen_int_mode (opcode, QImode)); + + mem = adjust_address (m_tramp, SImode, 1); + emit_move_insn (mem, chain_value); + + /* Compute offset from the end of the jmp to the target function. + In the case in which the trampoline stores the static chain on + the stack, we need to skip the first insn which pushes the + (call-saved) register static chain; this push is 1 byte. */ + disp = expand_binop (SImode, sub_optab, fnaddr, + plus_constant (XEXP (m_tramp, 0), + MEM_P (chain) ? 9 : 10), + NULL_RTX, 1, OPTAB_DIRECT); + + mem = adjust_address (m_tramp, QImode, 5); + emit_move_insn (mem, gen_int_mode (0xe9, QImode)); + + mem = adjust_address (m_tramp, SImode, 6); + emit_move_insn (mem, disp); } else { int offset = 0; - /* Try to load address using shorter movl instead of movabs. - We may want to support movq for kernel mode, but kernel does not use - trampolines at the moment. */ + + /* Load the function address to r11. Try to load address using + the shorter movl instead of movabs. 
We may want to support + movq for kernel mode, but kernel does not use trampolines at + the moment. */ if (x86_64_zext_immediate_operand (fnaddr, VOIDmode)) { fnaddr = copy_to_mode_reg (DImode, fnaddr); - emit_move_insn (gen_rtx_MEM (HImode, plus_constant (tramp, offset)), - gen_int_mode (0xbb41, HImode)); - emit_move_insn (gen_rtx_MEM (SImode, plus_constant (tramp, offset + 2)), - gen_lowpart (SImode, fnaddr)); + + mem = adjust_address (m_tramp, HImode, offset); + emit_move_insn (mem, gen_int_mode (0xbb41, HImode)); + + mem = adjust_address (m_tramp, SImode, offset + 2); + emit_move_insn (mem, gen_lowpart (SImode, fnaddr)); offset += 6; } else { - emit_move_insn (gen_rtx_MEM (HImode, plus_constant (tramp, offset)), - gen_int_mode (0xbb49, HImode)); - emit_move_insn (gen_rtx_MEM (DImode, plus_constant (tramp, offset + 2)), - fnaddr); + mem = adjust_address (m_tramp, HImode, offset); + emit_move_insn (mem, gen_int_mode (0xbb49, HImode)); + + mem = adjust_address (m_tramp, DImode, offset + 2); + emit_move_insn (mem, fnaddr); offset += 10; } + /* Load static chain using movabs to r10. */ - emit_move_insn (gen_rtx_MEM (HImode, plus_constant (tramp, offset)), - gen_int_mode (0xba49, HImode)); - emit_move_insn (gen_rtx_MEM (DImode, plus_constant (tramp, offset + 2)), - cxt); + mem = adjust_address (m_tramp, HImode, offset); + emit_move_insn (mem, gen_int_mode (0xba49, HImode)); + + mem = adjust_address (m_tramp, DImode, offset + 2); + emit_move_insn (mem, chain_value); offset += 10; - /* Jump to the r11 */ - emit_move_insn (gen_rtx_MEM (HImode, plus_constant (tramp, offset)), - gen_int_mode (0xff49, HImode)); - emit_move_insn (gen_rtx_MEM (QImode, plus_constant (tramp, offset+2)), - gen_int_mode (0xe3, QImode)); - offset += 3; + + /* Jump to r11; the last (unused) byte is a nop, only there to + pad the write out to a single 32-bit store. 
*/ + mem = adjust_address (m_tramp, SImode, offset); + emit_move_insn (mem, gen_int_mode (0x90e3ff49, SImode)); + offset += 4; + gcc_assert (offset <= TRAMPOLINE_SIZE); } #ifdef ENABLE_EXECUTE_STACK emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"), - LCT_NORMAL, VOIDmode, 1, tramp, Pmode); + LCT_NORMAL, VOIDmode, 1, XEXP (m_tramp, 0), Pmode); #endif } @@ -20552,6 +20712,39 @@ enum ix86_builtins IX86_BUILTIN_CVTUDQ2PS, + /* FMA4 instructions. */ + IX86_BUILTIN_VFMADDSS, + IX86_BUILTIN_VFMADDSD, + IX86_BUILTIN_VFMADDPS, + IX86_BUILTIN_VFMADDPD, + IX86_BUILTIN_VFMSUBSS, + IX86_BUILTIN_VFMSUBSD, + IX86_BUILTIN_VFMSUBPS, + IX86_BUILTIN_VFMSUBPD, + IX86_BUILTIN_VFMADDSUBPS, + IX86_BUILTIN_VFMADDSUBPD, + IX86_BUILTIN_VFMSUBADDPS, + IX86_BUILTIN_VFMSUBADDPD, + IX86_BUILTIN_VFNMADDSS, + IX86_BUILTIN_VFNMADDSD, + IX86_BUILTIN_VFNMADDPS, + IX86_BUILTIN_VFNMADDPD, + IX86_BUILTIN_VFNMSUBSS, + IX86_BUILTIN_VFNMSUBSD, + IX86_BUILTIN_VFNMSUBPS, + IX86_BUILTIN_VFNMSUBPD, + IX86_BUILTIN_VFMADDPS256, + IX86_BUILTIN_VFMADDPD256, + IX86_BUILTIN_VFMSUBPS256, + IX86_BUILTIN_VFMSUBPD256, + IX86_BUILTIN_VFMADDSUBPS256, + IX86_BUILTIN_VFMADDSUBPD256, + IX86_BUILTIN_VFMSUBADDPS256, + IX86_BUILTIN_VFMSUBADDPD256, + IX86_BUILTIN_VFNMADDPS256, + IX86_BUILTIN_VFNMADDPD256, + IX86_BUILTIN_VFNMSUBPS256, + IX86_BUILTIN_VFNMSUBPD256, IX86_BUILTIN_MAX }; @@ -21625,6 +21818,56 @@ static const struct builtin_description bdesc_args[] = { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskps256, "__builtin_ia32_movmskps256", IX86_BUILTIN_MOVMSKPS256, UNKNOWN, (int) INT_FTYPE_V8SF }, }; +/* FMA4. 
*/ +enum multi_arg_type { + MULTI_ARG_UNKNOWN, + MULTI_ARG_3_SF, + MULTI_ARG_3_DF, + MULTI_ARG_3_SF2, + MULTI_ARG_3_DF2 +}; + +static const struct builtin_description bdesc_multi_arg[] = +{ + { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmaddv4sf4, "__builtin_ia32_vfmaddss", IX86_BUILTIN_VFMADDSS, UNKNOWN, (int)MULTI_ARG_3_SF }, + { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmaddv2df4, "__builtin_ia32_vfmaddsd", IX86_BUILTIN_VFMADDSD, UNKNOWN, (int)MULTI_ARG_3_DF }, + { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmaddv4sf4, "__builtin_ia32_vfmaddps", IX86_BUILTIN_VFMADDPS, UNKNOWN, (int)MULTI_ARG_3_SF }, + { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmaddv2df4, "__builtin_ia32_vfmaddpd", IX86_BUILTIN_VFMADDPD, UNKNOWN, (int)MULTI_ARG_3_DF }, + { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmsubv4sf4, "__builtin_ia32_vfmsubss", IX86_BUILTIN_VFMSUBSS, UNKNOWN, (int)MULTI_ARG_3_SF }, + { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmsubv2df4, "__builtin_ia32_vfmsubsd", IX86_BUILTIN_VFMSUBSD, UNKNOWN, (int)MULTI_ARG_3_DF }, + { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmsubv4sf4, "__builtin_ia32_vfmsubps", IX86_BUILTIN_VFMSUBPS, UNKNOWN, (int)MULTI_ARG_3_SF }, + { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmsubv2df4, "__builtin_ia32_vfmsubpd", IX86_BUILTIN_VFMSUBPD, UNKNOWN, (int)MULTI_ARG_3_DF }, + + { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfnmaddv4sf4, "__builtin_ia32_vfnmaddss", IX86_BUILTIN_VFNMADDSS, UNKNOWN, (int)MULTI_ARG_3_SF }, + { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfnmaddv2df4, "__builtin_ia32_vfnmaddsd", IX86_BUILTIN_VFNMADDSD, UNKNOWN, (int)MULTI_ARG_3_DF }, + { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fnmaddv4sf4, "__builtin_ia32_vfnmaddps", IX86_BUILTIN_VFNMADDPS, UNKNOWN, (int)MULTI_ARG_3_SF }, + { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fnmaddv2df4, "__builtin_ia32_vfnmaddpd", IX86_BUILTIN_VFNMADDPD, UNKNOWN, (int)MULTI_ARG_3_DF }, + { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfnmsubv4sf4, "__builtin_ia32_vfnmsubss", IX86_BUILTIN_VFNMSUBSS, UNKNOWN, (int)MULTI_ARG_3_SF }, + { 
OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfnmsubv2df4, "__builtin_ia32_vfnmsubsd", IX86_BUILTIN_VFNMSUBSD, UNKNOWN, (int)MULTI_ARG_3_DF }, + { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fnmsubv4sf4, "__builtin_ia32_vfnmsubps", IX86_BUILTIN_VFNMSUBPS, UNKNOWN, (int)MULTI_ARG_3_SF }, + { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fnmsubv2df4, "__builtin_ia32_vfnmsubpd", IX86_BUILTIN_VFNMSUBPD, UNKNOWN, (int)MULTI_ARG_3_DF }, + + { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmaddsubv4sf4, "__builtin_ia32_vfmaddsubps", IX86_BUILTIN_VFMADDSUBPS, UNKNOWN, (int)MULTI_ARG_3_SF }, + { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmaddsubv2df4, "__builtin_ia32_vfmaddsubpd", IX86_BUILTIN_VFMADDSUBPD, UNKNOWN, (int)MULTI_ARG_3_DF }, + { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmsubaddv4sf4, "__builtin_ia32_vfmsubaddps", IX86_BUILTIN_VFMSUBADDPS, UNKNOWN, (int)MULTI_ARG_3_SF }, + { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmsubaddv2df4, "__builtin_ia32_vfmsubaddpd", IX86_BUILTIN_VFMSUBADDPD, UNKNOWN, (int)MULTI_ARG_3_DF }, + + { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmaddv8sf4256, "__builtin_ia32_vfmaddps256", IX86_BUILTIN_VFMADDPS256, UNKNOWN, (int)MULTI_ARG_3_SF2 }, + { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmaddv4df4256, "__builtin_ia32_vfmaddpd256", IX86_BUILTIN_VFMADDPD256, UNKNOWN, (int)MULTI_ARG_3_DF2 }, + { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmsubv8sf4256, "__builtin_ia32_vfmsubps256", IX86_BUILTIN_VFMSUBPS256, UNKNOWN, (int)MULTI_ARG_3_SF2 }, + { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmsubv4df4256, "__builtin_ia32_vfmsubpd256", IX86_BUILTIN_VFMSUBPD256, UNKNOWN, (int)MULTI_ARG_3_DF2 }, + + { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fnmaddv8sf4256, "__builtin_ia32_vfnmaddps256", IX86_BUILTIN_VFNMADDPS256, UNKNOWN, (int)MULTI_ARG_3_SF2 }, + { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fnmaddv4df4256, "__builtin_ia32_vfnmaddpd256", IX86_BUILTIN_VFNMADDPD256, UNKNOWN, (int)MULTI_ARG_3_DF2 }, + { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fnmsubv8sf4256, "__builtin_ia32_vfnmsubps256", IX86_BUILTIN_VFNMSUBPS256, UNKNOWN, 
(int)MULTI_ARG_3_SF2 }, + { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fnmsubv4df4256, "__builtin_ia32_vfnmsubpd256", IX86_BUILTIN_VFNMSUBPD256, UNKNOWN, (int)MULTI_ARG_3_DF2 }, + + { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmaddsubv8sf4, "__builtin_ia32_vfmaddsubps256", IX86_BUILTIN_VFMADDSUBPS256, UNKNOWN, (int)MULTI_ARG_3_SF2 }, + { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmaddsubv4df4, "__builtin_ia32_vfmaddsubpd256", IX86_BUILTIN_VFMADDSUBPD256, UNKNOWN, (int)MULTI_ARG_3_DF2 }, + { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmsubaddv8sf4, "__builtin_ia32_vfmsubaddps256", IX86_BUILTIN_VFMSUBADDPS256, UNKNOWN, (int)MULTI_ARG_3_SF2 }, + { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmsubaddv4df4, "__builtin_ia32_vfmsubaddpd256", IX86_BUILTIN_VFMSUBADDPD256, UNKNOWN, (int)MULTI_ARG_3_DF2 } + +}; /* Set up all the MMX/SSE builtins, even builtins for instructions that are not in the current target ISA to allow the user to compile particular modules @@ -23058,6 +23301,29 @@ ix86_init_mmx_sse_builtins (void) intQI_type_node, integer_type_node, NULL_TREE); def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v16qi", ftype, IX86_BUILTIN_VEC_SET_V16QI); + /* Add FMA4 multi-arg argument instructions */ + for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++) + { + tree mtype = NULL_TREE; + + if (d->name == 0) + continue; + + switch ((enum multi_arg_type)d->flag) + { + case MULTI_ARG_3_SF: mtype = v4sf_ftype_v4sf_v4sf_v4sf; break; + case MULTI_ARG_3_DF: mtype = v2df_ftype_v2df_v2df_v2df; break; + case MULTI_ARG_3_SF2: mtype = v8sf_ftype_v8sf_v8sf_v8sf; break; + case MULTI_ARG_3_DF2: mtype = v4df_ftype_v4df_v4df_v4df; break; + + case MULTI_ARG_UNKNOWN: + default: + gcc_unreachable (); + } + + if (mtype) + def_builtin_const (d->mask, d->name, mtype, d->code); + } } /* Internal method for ix86_init_builtins. 
*/ @@ -23230,6 +23496,122 @@ ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target) return target; } +/* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */ + +static rtx +ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target, + enum multi_arg_type m_type, + enum rtx_code sub_code) +{ + rtx pat; + int i; + int nargs; + bool comparison_p = false; + bool tf_p = false; + bool last_arg_constant = false; + int num_memory = 0; + struct { + rtx op; + enum machine_mode mode; + } args[4]; + + enum machine_mode tmode = insn_data[icode].operand[0].mode; + + switch (m_type) + { + case MULTI_ARG_3_SF: + case MULTI_ARG_3_DF: + case MULTI_ARG_3_SF2: + case MULTI_ARG_3_DF2: + nargs = 3; + break; + + case MULTI_ARG_UNKNOWN: + default: + gcc_unreachable (); + } + + if (optimize || !target + || GET_MODE (target) != tmode + || ! (*insn_data[icode].operand[0].predicate) (target, tmode)) + target = gen_reg_rtx (tmode); + + gcc_assert (nargs <= 4); + + for (i = 0; i < nargs; i++) + { + tree arg = CALL_EXPR_ARG (exp, i); + rtx op = expand_normal (arg); + int adjust = (comparison_p) ? 1 : 0; + enum machine_mode mode = insn_data[icode].operand[i+adjust+1].mode; + + if (last_arg_constant && i == nargs-1) + { + if (!CONST_INT_P (op)) + { + error ("last argument must be an immediate"); + return gen_reg_rtx (tmode); + } + } + else + { + if (VECTOR_MODE_P (mode)) + op = safe_vector_operand (op, mode); + + /* If we aren't optimizing, only allow one memory operand to be + generated. */ + if (memory_operand (op, mode)) + num_memory++; + + gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode); + + if (optimize + || ! 
(*insn_data[icode].operand[i+adjust+1].predicate) (op, mode) + || num_memory > 1) + op = force_reg (mode, op); + } + + args[i].op = op; + args[i].mode = mode; + } + + switch (nargs) + { + case 1: + pat = GEN_FCN (icode) (target, args[0].op); + break; + + case 2: + if (tf_p) + pat = GEN_FCN (icode) (target, args[0].op, args[1].op, + GEN_INT ((int)sub_code)); + else if (! comparison_p) + pat = GEN_FCN (icode) (target, args[0].op, args[1].op); + else + { + rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target), + args[0].op, + args[1].op); + + pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op); + } + break; + + case 3: + pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op); + break; + + default: + gcc_unreachable (); + } + + if (! pat) + return 0; + + emit_insn (pat); + return target; +} + /* Subroutine of ix86_expand_args_builtin to take care of scalar unop insns with vec_merge. */ @@ -24499,6 +24881,12 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED, if (d->code == fcode) return ix86_expand_sse_pcmpistr (d, exp, target); + for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++) + if (d->code == fcode) + return ix86_expand_multi_arg_builtin (d->icode, exp, target, + (enum multi_arg_type)d->flag, + d->comparison); + gcc_unreachable (); } @@ -27926,7 +28314,7 @@ static bool ix86_scalar_mode_supported_p (enum machine_mode mode) { if (DECIMAL_FLOAT_MODE_P (mode)) - return true; + return default_decimal_float_supported_p (); else if (mode == TFmode) return true; else @@ -28881,6 +29269,200 @@ ix86_expand_round (rtx operand0, rtx operand1) emit_move_insn (operand0, res); } +/* Validate whether a FMA4 instruction is valid or not. + OPERANDS is the array of operands. + NUM is the number of operands. + USES_OC0 is true if the instruction uses OC0 and provides 4 variants. + NUM_MEMORY is the maximum number of memory operands to accept. 
+ NUM_MEMORY less than zero is a special case to allow an operand + of an instruction to be memory operation. + when COMMUTATIVE is set, operand 1 and 2 can be swapped. */ + +bool +ix86_fma4_valid_op_p (rtx operands[], rtx insn ATTRIBUTE_UNUSED, int num, + bool uses_oc0, int num_memory, bool commutative) +{ + int mem_mask; + int mem_count; + int i; + + /* Count the number of memory arguments */ + mem_mask = 0; + mem_count = 0; + for (i = 0; i < num; i++) + { + enum machine_mode mode = GET_MODE (operands[i]); + if (register_operand (operands[i], mode)) + ; + + else if (memory_operand (operands[i], mode)) + { + mem_mask |= (1 << i); + mem_count++; + } + + else + { + rtx pattern = PATTERN (insn); + + /* allow 0 for pcmov */ + if (GET_CODE (pattern) != SET + || GET_CODE (SET_SRC (pattern)) != IF_THEN_ELSE + || i < 2 + || operands[i] != CONST0_RTX (mode)) + return false; + } + } + + /* Special case pmacsdq{l,h} where we allow the 3rd argument to be + a memory operation. */ + if (num_memory < 0) + { + num_memory = -num_memory; + if ((mem_mask & (1 << (num-1))) != 0) + { + mem_mask &= ~(1 << (num-1)); + mem_count--; + } + } + + /* If there were no memory operations, allow the insn */ + if (mem_mask == 0) + return true; + + /* Do not allow the destination register to be a memory operand. */ + else if (mem_mask & (1 << 0)) + return false; + + /* If there are too many memory operations, disallow the instruction. While + the hardware only allows 1 memory reference, before register allocation + for some insns, we allow two memory operations sometimes in order to allow + code like the following to be optimized: + + float fmadd (float *a, float *b, float *c) { return (*a * *b) + *c; } + + or similar cases that are vectorized into using the vfmaddss + instruction. */ + else if (mem_count > num_memory) + return false; + + /* Don't allow more than one memory operation if not optimizing. 
*/ + else if (mem_count > 1 && !optimize) + return false; + + else if (num == 4 && mem_count == 1) + { + /* formats (destination is the first argument), example vfmaddss: + xmm1, xmm1, xmm2, xmm3/mem + xmm1, xmm1, xmm2/mem, xmm3 + xmm1, xmm2, xmm3/mem, xmm1 + xmm1, xmm2/mem, xmm3, xmm1 */ + if (uses_oc0) + return ((mem_mask == (1 << 1)) + || (mem_mask == (1 << 2)) + || (mem_mask == (1 << 3))); + + /* format, example vpmacsdd: + xmm1, xmm2, xmm3/mem, xmm1 */ + if (commutative) + return (mem_mask == (1 << 2) || mem_mask == (1 << 1)); + else + return (mem_mask == (1 << 2)); + } + + else if (num == 4 && num_memory == 2) + { + /* If there are two memory operations, we can load one of the memory ops + into the destination register. This is for optimizing the + multiply/add ops, which the combiner has optimized both the multiply + and the add insns to have a memory operation. We have to be careful + that the destination doesn't overlap with the inputs. */ + rtx op0 = operands[0]; + + if (reg_mentioned_p (op0, operands[1]) + || reg_mentioned_p (op0, operands[2]) + || reg_mentioned_p (op0, operands[3])) + return false; + + /* formats (destination is the first argument), example vfmaddss: + xmm1, xmm1, xmm2, xmm3/mem + xmm1, xmm1, xmm2/mem, xmm3 + xmm1, xmm2, xmm3/mem, xmm1 + xmm1, xmm2/mem, xmm3, xmm1 + + For the oc0 case, we will load either operands[1] or operands[3] into + operands[0], so any combination of 2 memory operands is ok. */ + if (uses_oc0) + return true; + + /* format, example vpmacsdd: + xmm1, xmm2, xmm3/mem, xmm1 + + For the integer multiply/add instructions be more restrictive and + require operands[2] and operands[3] to be the memory operands. 
*/ + if (commutative) + return (mem_mask == ((1 << 1) | (1 << 3)) || ((1 << 2) | (1 << 3))); + else + return (mem_mask == ((1 << 2) | (1 << 3))); + } + + else if (num == 3 && num_memory == 1) + { + /* formats, example vprotb: + xmm1, xmm2, xmm3/mem + xmm1, xmm2/mem, xmm3 */ + if (uses_oc0) + return ((mem_mask == (1 << 1)) || (mem_mask == (1 << 2))); + + /* format, example vpcomeq: + xmm1, xmm2, xmm3/mem */ + else + return (mem_mask == (1 << 2)); + } + + else + gcc_unreachable (); + + return false; +} + + +/* Fixup an FMA4 instruction that has 2 memory input references into a form the + hardware will allow by using the destination register to load one of the + memory operations. Presently this is used by the multiply/add routines to + allow 2 memory references. */ + +void +ix86_expand_fma4_multiple_memory (rtx operands[], + int num, + enum machine_mode mode) +{ + rtx op0 = operands[0]; + if (num != 4 + || memory_operand (op0, mode) + || reg_mentioned_p (op0, operands[1]) + || reg_mentioned_p (op0, operands[2]) + || reg_mentioned_p (op0, operands[3])) + gcc_unreachable (); + + /* For 2 memory operands, pick either operands[1] or operands[3] to move into + the destination register. */ + if (memory_operand (operands[1], mode)) + { + emit_move_insn (op0, operands[1]); + operands[1] = op0; + } + else if (memory_operand (operands[3], mode)) + { + emit_move_insn (op0, operands[3]); + operands[3] = op0; + } + else + gcc_unreachable (); + + return; +} + /* Table of valid machine attributes. 
*/ static const struct attribute_spec ix86_attribute_table[] = { @@ -29229,6 +29811,10 @@ ix86_enum_va_list (int idx, const char **pname, tree *ptree) #define TARGET_GET_DRAP_RTX ix86_get_drap_rtx #undef TARGET_STRICT_ARGUMENT_NAMING #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true +#undef TARGET_STATIC_CHAIN +#define TARGET_STATIC_CHAIN ix86_static_chain +#undef TARGET_TRAMPOLINE_INIT +#define TARGET_TRAMPOLINE_INIT ix86_trampoline_init #undef TARGET_GIMPLIFY_VA_ARG_EXPR #define TARGET_GIMPLIFY_VA_ARG_EXPR ix86_gimplify_va_arg diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index e898a651fc9..8d525727eec 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -54,6 +54,7 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see #define TARGET_AVX OPTION_ISA_AVX #define TARGET_FMA OPTION_ISA_FMA #define TARGET_SSE4A OPTION_ISA_SSE4A +#define TARGET_FMA4 OPTION_ISA_FMA4 #define TARGET_ROUND OPTION_ISA_ROUND #define TARGET_ABM OPTION_ISA_ABM #define TARGET_POPCNT OPTION_ISA_POPCNT @@ -65,8 +66,8 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see #define TARGET_CMPXCHG16B OPTION_ISA_CX16 -/* SSE4.1 define round instructions */ -#define OPTION_MASK_ISA_ROUND (OPTION_MASK_ISA_SSE4_1) +/* SSE4.1 defines round instructions */ +#define OPTION_MASK_ISA_ROUND OPTION_MASK_ISA_SSE4_1 #define OPTION_ISA_ROUND ((ix86_isa_flags & OPTION_MASK_ISA_ROUND) != 0) #include "config/vxworks-dummy.h" @@ -1131,11 +1132,6 @@ enum target_cpu_default /* Base register for access to arguments of the function. */ #define ARG_POINTER_REGNUM 16 -/* Register in which static-chain is passed to a function. - We do use ECX as static chain register for 32 bit ABI. On the - 64bit ABI, ECX is an argument register, so we use R10 instead. */ -#define STATIC_CHAIN_REGNUM (TARGET_64BIT ? R10_REG : CX_REG) - /* Register to hold the addressing base for position independent code access to data items. 
We don't use PIC pointer for 64bit mode. Define the regnum to dummy value to prevent gcc from @@ -1356,6 +1352,10 @@ enum reg_class (TARGET_AVX && ((MODE) == V4SFmode || (MODE) == V2DFmode \ || (MODE) == V8SFmode || (MODE) == V4DFmode)) +#define FMA4_VEC_FLOAT_MODE_P(MODE) \ + (TARGET_FMA4 && ((MODE) == V4SFmode || (MODE) == V2DFmode \ + || (MODE) == V8SFmode || (MODE) == V4DFmode)) + #define MMX_REG_P(XOP) (REG_P (XOP) && MMX_REGNO_P (REGNO (XOP))) #define MMX_REGNO_P(N) IN_RANGE ((N), FIRST_MMX_REG, LAST_MMX_REG) @@ -1659,14 +1659,7 @@ typedef struct ix86_args { /* Length in units of the trampoline for entering a nested function. */ -#define TRAMPOLINE_SIZE (TARGET_64BIT ? 23 : 10) - -/* Emit RTL insns to initialize the variable parts of a trampoline. - FNADDR is an RTX for the address of the function's pure code. - CXT is an RTX for the static chain value for the function. */ - -#define INITIALIZE_TRAMPOLINE(TRAMP, FNADDR, CXT) \ - x86_initialize_trampoline ((TRAMP), (FNADDR), (CXT)) +#define TRAMPOLINE_SIZE (TARGET_64BIT ? 24 : 10) /* Definitions for register eliminations. @@ -2365,15 +2358,29 @@ struct GTY(()) machine_function { const char *some_ld_name; int varargs_gpr_size; int varargs_fpr_size; - int accesses_prev_frame; int optimize_mode_switching[MAX_386_ENTITIES]; - int needs_cld; + + /* Number of saved registers USE_FAST_PROLOGUE_EPILOGUE + has been computed for. */ + int use_fast_prologue_epilogue_nregs; + + /* The CFA state at the end of the prologue. */ + struct machine_cfa_state cfa; + + /* This value is used for amd64 targets and specifies the current abi + to be used. MS_ABI means ms abi. Otherwise SYSV_ABI means sysv abi. */ + enum calling_abi call_abi; + + /* Nonzero if the function accesses a previous frame. */ + BOOL_BITFIELD accesses_prev_frame : 1; + + /* Nonzero if the function requires a CLD in the prologue. 
*/ + BOOL_BITFIELD needs_cld : 1; + /* Set by ix86_compute_frame_layout and used by prologue/epilogue expander to determine the style used. */ - int use_fast_prologue_epilogue; - /* Number of saved registers USE_FAST_PROLOGUE_EPILOGUE has been computed - for. */ - int use_fast_prologue_epilogue_nregs; + BOOL_BITFIELD use_fast_prologue_epilogue : 1; + /* If true, the current function needs the default PIC register, not an alternate register (on x86) and must not use the red zone (on x86_64), even if it's a leaf function. We don't want the @@ -2383,11 +2390,11 @@ struct GTY(()) machine_function { if all such instructions are optimized away. Use the ix86_current_function_calls_tls_descriptor macro for a better approximation. */ - int tls_descriptor_call_expanded_p; - /* This value is used for amd64 targets and specifies the current abi - to be used. MS_ABI means ms abi. Otherwise SYSV_ABI means sysv abi. */ - enum calling_abi call_abi; - struct machine_cfa_state cfa; + BOOL_BITFIELD tls_descriptor_call_expanded_p : 1; + + /* If true, the current function has a STATIC_CHAIN is placed on the + stack below the return address. */ + BOOL_BITFIELD static_chain_on_stack : 1; }; #endif @@ -2406,6 +2413,7 @@ struct GTY(()) machine_function { #define ix86_current_function_calls_tls_descriptor \ (ix86_tls_descriptor_calls_expanded_in_cfun && df_regs_ever_live_p (SP_REG)) #define ix86_cfa_state (&cfun->machine->cfa) +#define ix86_static_chain_on_stack (cfun->machine->static_chain_on_stack) /* Control behavior of x86_file_start. 
*/ #define X86_FILE_START_VERSION_DIRECTIVE false diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 26bbc9a5234..5c2564e2734 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -195,6 +195,10 @@ (UNSPEC_PCMPESTR 144) (UNSPEC_PCMPISTR 145) + ; For FMA4 support + (UNSPEC_FMA4_INTRINSIC 150) + (UNSPEC_FMA4_FMADDSUB 151) + (UNSPEC_FMA4_FMSUBADD 152) ; For AES support (UNSPEC_AESENC 159) (UNSPEC_AESENCLAST 160) diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt index f23763b8cf6..9668ff6504d 100644 --- a/gcc/config/i386/i386.opt +++ b/gcc/config/i386/i386.opt @@ -310,6 +310,10 @@ msse4a Target Report Mask(ISA_SSE4A) Var(ix86_isa_flags) VarExists Save Support MMX, SSE, SSE2, SSE3 and SSE4A built-in functions and code generation +mfma4 +Target Report Mask(ISA_FMA4) Var(ix86_isa_flags) VarExists Save +Support FMA4 built-in functions and code generation + mabm Target Report Mask(ISA_ABM) Var(ix86_isa_flags) VarExists Save Support code generation of Advanced Bit Manipulation (ABM) instructions. diff --git a/gcc/config/i386/mingw32.h b/gcc/config/i386/mingw32.h index e856ecdbc97..9dcc5ba1f67 100644 --- a/gcc/config/i386/mingw32.h +++ b/gcc/config/i386/mingw32.h @@ -221,3 +221,7 @@ __enable_execute_stack (void *addr) \ #define LIBGCC_EH_EXTN "_sjlj" #endif #define LIBGCC_SONAME "libgcc_s" LIBGCC_EH_EXTN "-1.dll" + +/* We should find a way to not have to update this manually. */ +#define LIBGCJ_SONAME "cyggcj" /*LIBGCC_EH_EXTN*/ "-11.dll" + diff --git a/gcc/config/i386/mmintrin-common.h b/gcc/config/i386/mmintrin-common.h deleted file mode 100644 index 25fd6aa6392..00000000000 --- a/gcc/config/i386/mmintrin-common.h +++ /dev/null @@ -1,154 +0,0 @@ -/* Copyright (C) 2007, 2008, 2009 Free Software Foundation, Inc. - - This file is part of GCC. 
- - GCC is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 3, or (at your option) - any later version. - - GCC is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - Under Section 7 of GPL version 3, you are granted additional - permissions described in the GCC Runtime Library Exception, version - 3.1, as published by the Free Software Foundation. - - You should have received a copy of the GNU General Public License and - a copy of the GCC Runtime Library Exception along with this program; - see the files COPYING3 and COPYING.RUNTIME respectively. If not, see - <http://www.gnu.org/licenses/>. */ - -/* Common definition of the ROUND and PTEST intrinsics, SSE4.1. */ - -#ifndef _MMINTRIN_COMMON_H_INCLUDED -#define _MMINTRIN_COMMON_H_INCLUDED - -#if !defined(__SSE4_1__) -# error "SSE4.1 instruction set not enabled" -#else - -/* Rounding mode macros. */ -#define _MM_FROUND_TO_NEAREST_INT 0x00 -#define _MM_FROUND_TO_NEG_INF 0x01 -#define _MM_FROUND_TO_POS_INF 0x02 -#define _MM_FROUND_TO_ZERO 0x03 -#define _MM_FROUND_CUR_DIRECTION 0x04 - -#define _MM_FROUND_RAISE_EXC 0x00 -#define _MM_FROUND_NO_EXC 0x08 - -#define _MM_FROUND_NINT \ - (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC) -#define _MM_FROUND_FLOOR \ - (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC) -#define _MM_FROUND_CEIL \ - (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC) -#define _MM_FROUND_TRUNC \ - (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC) -#define _MM_FROUND_RINT \ - (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC) -#define _MM_FROUND_NEARBYINT \ - (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC) - -/* Test Instruction */ -/* Packed integer 128-bit bitwise comparison. Return 1 if - (__V & __M) == 0. 
*/ -extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_testz_si128 (__m128i __M, __m128i __V) -{ - return __builtin_ia32_ptestz128 ((__v2di)__M, (__v2di)__V); -} - -/* Packed integer 128-bit bitwise comparison. Return 1 if - (__V & ~__M) == 0. */ -extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_testc_si128 (__m128i __M, __m128i __V) -{ - return __builtin_ia32_ptestc128 ((__v2di)__M, (__v2di)__V); -} - -/* Packed integer 128-bit bitwise comparison. Return 1 if - (__V & __M) != 0 && (__V & ~__M) != 0. */ -extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_testnzc_si128 (__m128i __M, __m128i __V) -{ - return __builtin_ia32_ptestnzc128 ((__v2di)__M, (__v2di)__V); -} - -/* Macros for packed integer 128-bit comparison intrinsics. */ -#define _mm_test_all_zeros(M, V) _mm_testz_si128 ((M), (V)) - -#define _mm_test_all_ones(V) \ - _mm_testc_si128 ((V), _mm_cmpeq_epi32 ((V), (V))) - -#define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128 ((M), (V)) - -/* Packed/scalar double precision floating point rounding. */ - -#ifdef __OPTIMIZE__ -extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_round_pd (__m128d __V, const int __M) -{ - return (__m128d) __builtin_ia32_roundpd ((__v2df)__V, __M); -} - -extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_round_sd(__m128d __D, __m128d __V, const int __M) -{ - return (__m128d) __builtin_ia32_roundsd ((__v2df)__D, - (__v2df)__V, - __M); -} -#else -#define _mm_round_pd(V, M) \ - ((__m128d) __builtin_ia32_roundpd ((__v2df)(__m128d)(V), (int)(M))) - -#define _mm_round_sd(D, V, M) \ - ((__m128d) __builtin_ia32_roundsd ((__v2df)(__m128d)(D), \ - (__v2df)(__m128d)(V), (int)(M))) -#endif - -/* Packed/scalar single precision floating point rounding. 
*/ - -#ifdef __OPTIMIZE__ -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_round_ps (__m128 __V, const int __M) -{ - return (__m128) __builtin_ia32_roundps ((__v4sf)__V, __M); -} - -extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) -_mm_round_ss (__m128 __D, __m128 __V, const int __M) -{ - return (__m128) __builtin_ia32_roundss ((__v4sf)__D, - (__v4sf)__V, - __M); -} -#else -#define _mm_round_ps(V, M) \ - ((__m128) __builtin_ia32_roundps ((__v4sf)(__m128)(V), (int)(M))) - -#define _mm_round_ss(D, V, M) \ - ((__m128) __builtin_ia32_roundss ((__v4sf)(__m128)(D), \ - (__v4sf)(__m128)(V), (int)(M))) -#endif - -/* Macros for ceil/floor intrinsics. */ -#define _mm_ceil_pd(V) _mm_round_pd ((V), _MM_FROUND_CEIL) -#define _mm_ceil_sd(D, V) _mm_round_sd ((D), (V), _MM_FROUND_CEIL) - -#define _mm_floor_pd(V) _mm_round_pd((V), _MM_FROUND_FLOOR) -#define _mm_floor_sd(D, V) _mm_round_sd ((D), (V), _MM_FROUND_FLOOR) - -#define _mm_ceil_ps(V) _mm_round_ps ((V), _MM_FROUND_CEIL) -#define _mm_ceil_ss(D, V) _mm_round_ss ((D), (V), _MM_FROUND_CEIL) - -#define _mm_floor_ps(V) _mm_round_ps ((V), _MM_FROUND_FLOOR) -#define _mm_floor_ss(D, V) _mm_round_ss ((D), (V), _MM_FROUND_FLOOR) - -#endif /* __SSE4_1__ */ - -#endif /* _MMINTRIN_COMMON_H_INCLUDED */ diff --git a/gcc/config/i386/openbsdelf.h b/gcc/config/i386/openbsdelf.h index c76d26e2684..d3728352b58 100644 --- a/gcc/config/i386/openbsdelf.h +++ b/gcc/config/i386/openbsdelf.h @@ -55,10 +55,10 @@ along with GCC; see the file COPYING3. If not see /* This must agree with <machine/ansi.h> */ #undef SIZE_TYPE -#define SIZE_TYPE "unsigned int" +#define SIZE_TYPE "long unsigned int" #undef PTRDIFF_TYPE -#define PTRDIFF_TYPE "int" +#define PTRDIFF_TYPE "long int" #undef WCHAR_TYPE #define WCHAR_TYPE "int" @@ -66,6 +66,9 @@ along with GCC; see the file COPYING3. 
If not see #undef WCHAR_TYPE_SIZE #define WCHAR_TYPE_SIZE BITS_PER_WORD +#undef WINT_TYPE +#define WINT_TYPE "int" + /* Assembler format: overall framework. */ #undef ASM_APP_ON diff --git a/gcc/config/i386/smmintrin.h b/gcc/config/i386/smmintrin.h index 1a299894960..8fbb35c9b7a 100644 --- a/gcc/config/i386/smmintrin.h +++ b/gcc/config/i386/smmintrin.h @@ -35,7 +35,125 @@ /* We need definitions from the SSSE3, SSE3, SSE2 and SSE header files. */ #include <tmmintrin.h> -#include <mmintrin-common.h> + +/* Rounding mode macros. */ +#define _MM_FROUND_TO_NEAREST_INT 0x00 +#define _MM_FROUND_TO_NEG_INF 0x01 +#define _MM_FROUND_TO_POS_INF 0x02 +#define _MM_FROUND_TO_ZERO 0x03 +#define _MM_FROUND_CUR_DIRECTION 0x04 + +#define _MM_FROUND_RAISE_EXC 0x00 +#define _MM_FROUND_NO_EXC 0x08 + +#define _MM_FROUND_NINT \ + (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC) +#define _MM_FROUND_FLOOR \ + (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC) +#define _MM_FROUND_CEIL \ + (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC) +#define _MM_FROUND_TRUNC \ + (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC) +#define _MM_FROUND_RINT \ + (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC) +#define _MM_FROUND_NEARBYINT \ + (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC) + +/* Test Instruction */ +/* Packed integer 128-bit bitwise comparison. Return 1 if + (__V & __M) == 0. */ +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_testz_si128 (__m128i __M, __m128i __V) +{ + return __builtin_ia32_ptestz128 ((__v2di)__M, (__v2di)__V); +} + +/* Packed integer 128-bit bitwise comparison. Return 1 if + (__V & ~__M) == 0. */ +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_testc_si128 (__m128i __M, __m128i __V) +{ + return __builtin_ia32_ptestc128 ((__v2di)__M, (__v2di)__V); +} + +/* Packed integer 128-bit bitwise comparison. Return 1 if + (__V & __M) != 0 && (__V & ~__M) != 0. 
*/ +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_testnzc_si128 (__m128i __M, __m128i __V) +{ + return __builtin_ia32_ptestnzc128 ((__v2di)__M, (__v2di)__V); +} + +/* Macros for packed integer 128-bit comparison intrinsics. */ +#define _mm_test_all_zeros(M, V) _mm_testz_si128 ((M), (V)) + +#define _mm_test_all_ones(V) \ + _mm_testc_si128 ((V), _mm_cmpeq_epi32 ((V), (V))) + +#define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128 ((M), (V)) + +/* Packed/scalar double precision floating point rounding. */ + +#ifdef __OPTIMIZE__ +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_round_pd (__m128d __V, const int __M) +{ + return (__m128d) __builtin_ia32_roundpd ((__v2df)__V, __M); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_round_sd(__m128d __D, __m128d __V, const int __M) +{ + return (__m128d) __builtin_ia32_roundsd ((__v2df)__D, + (__v2df)__V, + __M); +} +#else +#define _mm_round_pd(V, M) \ + ((__m128d) __builtin_ia32_roundpd ((__v2df)(__m128d)(V), (int)(M))) + +#define _mm_round_sd(D, V, M) \ + ((__m128d) __builtin_ia32_roundsd ((__v2df)(__m128d)(D), \ + (__v2df)(__m128d)(V), (int)(M))) +#endif + +/* Packed/scalar single precision floating point rounding. 
*/ + +#ifdef __OPTIMIZE__ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_round_ps (__m128 __V, const int __M) +{ + return (__m128) __builtin_ia32_roundps ((__v4sf)__V, __M); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_round_ss (__m128 __D, __m128 __V, const int __M) +{ + return (__m128) __builtin_ia32_roundss ((__v4sf)__D, + (__v4sf)__V, + __M); +} +#else +#define _mm_round_ps(V, M) \ + ((__m128) __builtin_ia32_roundps ((__v4sf)(__m128)(V), (int)(M))) + +#define _mm_round_ss(D, V, M) \ + ((__m128) __builtin_ia32_roundss ((__v4sf)(__m128)(D), \ + (__v4sf)(__m128)(V), (int)(M))) +#endif + +/* Macros for ceil/floor intrinsics. */ +#define _mm_ceil_pd(V) _mm_round_pd ((V), _MM_FROUND_CEIL) +#define _mm_ceil_sd(D, V) _mm_round_sd ((D), (V), _MM_FROUND_CEIL) + +#define _mm_floor_pd(V) _mm_round_pd((V), _MM_FROUND_FLOOR) +#define _mm_floor_sd(D, V) _mm_round_sd ((D), (V), _MM_FROUND_FLOOR) + +#define _mm_ceil_ps(V) _mm_round_ps ((V), _MM_FROUND_CEIL) +#define _mm_ceil_ss(D, V) _mm_round_ss ((D), (V), _MM_FROUND_CEIL) + +#define _mm_floor_ps(V) _mm_round_ps ((V), _MM_FROUND_FLOOR) +#define _mm_floor_ss(D, V) _mm_round_ss ((D), (V), _MM_FROUND_FLOOR) /* SSE4.1 */ diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 2ddbbf551de..e90296512ad 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -49,6 +49,7 @@ (define_mode_iterator SSEMODE248 [V8HI V4SI V2DI]) (define_mode_iterator SSEMODE1248 [V16QI V8HI V4SI V2DI]) (define_mode_iterator SSEMODEF4 [SF DF V4SF V2DF]) +(define_mode_iterator FMA4MODEF4 [V8SF V4DF]) (define_mode_iterator SSEMODEF2P [V4SF V2DF]) (define_mode_iterator AVX256MODEF2P [V8SF V4DF]) @@ -74,6 +75,11 @@ ;; Mapping from integer vector mode to mnemonic suffix (define_mode_attr ssevecsize [(V16QI "b") (V8HI "w") (V4SI "d") (V2DI "q")]) +;; Mapping of the fma4 suffix +(define_mode_attr fma4modesuffixf4 [(V8SF "ps") (V4DF 
"pd")]) +(define_mode_attr ssemodesuffixf2s [(SF "ss") (DF "sd") + (V4SF "ss") (V2DF "sd")]) + ;; Mapping of the avx suffix (define_mode_attr ssemodesuffixf4 [(SF "ss") (DF "sd") (V4SF "ps") (V2DF "pd")]) @@ -1661,6 +1667,936 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; +;; FMA4 floating point multiply/accumulate instructions This includes the +;; scalar version of the instructions as well as the vector +;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; In order to match (*a * *b) + *c, particularly when vectorizing, allow +;; combine to generate a multiply/add with two memory references. We then +;; split this insn, into loading up the destination register with one of the +;; memory operations. If we don't manage to split the insn, reload will +;; generate the appropriate moves. The reason this is needed, is that combine +;; has already folded one of the memory references into both the multiply and +;; add insns, and it can't generate a new pseudo. I.e.: +;; (set (reg1) (mem (addr1))) +;; (set (reg2) (mult (reg1) (mem (addr2)))) +;; (set (reg3) (plus (reg2) (mem (addr3)))) + +(define_insn "fma4_fmadd<mode>4256" + [(set (match_operand:FMA4MODEF4 0 "register_operand" "=x,x,x") + (plus:FMA4MODEF4 + (mult:FMA4MODEF4 + (match_operand:FMA4MODEF4 1 "nonimmediate_operand" "x,x,xm") + (match_operand:FMA4MODEF4 2 "nonimmediate_operand" "x,xm,x")) + (match_operand:FMA4MODEF4 3 "nonimmediate_operand" "xm,x,x")))] + "TARGET_FMA4 + && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, true)" + "vfmadd<fma4modesuffixf4>\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "<MODE>")]) + +;; Split fmadd with two memory operands into a load and the fmadd. 
+(define_split + [(set (match_operand:FMA4MODEF4 0 "register_operand" "") + (plus:FMA4MODEF4 + (mult:FMA4MODEF4 + (match_operand:FMA4MODEF4 1 "nonimmediate_operand" "") + (match_operand:FMA4MODEF4 2 "nonimmediate_operand" "")) + (match_operand:FMA4MODEF4 3 "nonimmediate_operand" "")))] + "TARGET_FMA4 + && !ix86_fma4_valid_op_p (operands, insn, 4, true, 1, true) + && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, true) + && !reg_mentioned_p (operands[0], operands[1]) + && !reg_mentioned_p (operands[0], operands[2]) + && !reg_mentioned_p (operands[0], operands[3])" + [(const_int 0)] +{ + ix86_expand_fma4_multiple_memory (operands, 4, <MODE>mode); + emit_insn (gen_fma4_fmadd<mode>4256 (operands[0], operands[1], + operands[2], operands[3])); + DONE; +}) + +;; Floating multiply and subtract +;; Allow two memory operands the same as fmadd +(define_insn "fma4_fmsub<mode>4256" + [(set (match_operand:FMA4MODEF4 0 "register_operand" "=x,x,x") + (minus:FMA4MODEF4 + (mult:FMA4MODEF4 + (match_operand:FMA4MODEF4 1 "nonimmediate_operand" "x,x,xm") + (match_operand:FMA4MODEF4 2 "nonimmediate_operand" "x,xm,x")) + (match_operand:FMA4MODEF4 3 "nonimmediate_operand" "xm,x,x")))] + "TARGET_FMA4 + && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, true)" + "vfmsub<fma4modesuffixf4>\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "<MODE>")]) + +;; Split fmsub with two memory operands into a load and the fmsub. 
+(define_split + [(set (match_operand:FMA4MODEF4 0 "register_operand" "") + (minus:FMA4MODEF4 + (mult:FMA4MODEF4 + (match_operand:FMA4MODEF4 1 "nonimmediate_operand" "") + (match_operand:FMA4MODEF4 2 "nonimmediate_operand" "")) + (match_operand:FMA4MODEF4 3 "nonimmediate_operand" "")))] + "TARGET_FMA4 + && !ix86_fma4_valid_op_p (operands, insn, 4, true, 1, true) + && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, true) + && !reg_mentioned_p (operands[0], operands[1]) + && !reg_mentioned_p (operands[0], operands[2]) + && !reg_mentioned_p (operands[0], operands[3])" + [(const_int 0)] +{ + ix86_expand_fma4_multiple_memory (operands, 4, <MODE>mode); + emit_insn (gen_fma4_fmsub<mode>4256 (operands[0], operands[1], + operands[2], operands[3])); + DONE; +}) + +;; Floating point negative multiply and add +;; Rewrite (- (a * b) + c) into the canonical form: c - (a * b) +;; Note operands are out of order to simplify call to ix86_fma4_valid_p +;; Allow two memory operands to help in optimizing. +(define_insn "fma4_fnmadd<mode>4256" + [(set (match_operand:FMA4MODEF4 0 "register_operand" "=x,x,x") + (minus:FMA4MODEF4 + (match_operand:FMA4MODEF4 3 "nonimmediate_operand" "xm,x,x") + (mult:FMA4MODEF4 + (match_operand:FMA4MODEF4 1 "nonimmediate_operand" "x,x,xm") + (match_operand:FMA4MODEF4 2 "nonimmediate_operand" "x,xm,x"))))] + "TARGET_FMA4 + && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, true)" + "vfnmadd<fma4modesuffixf4>\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "<MODE>")]) + +;; Split fnmadd with two memory operands into a load and the fnmadd. 
+(define_split + [(set (match_operand:FMA4MODEF4 0 "register_operand" "") + (minus:FMA4MODEF4 + (match_operand:FMA4MODEF4 3 "nonimmediate_operand" "") + (mult:FMA4MODEF4 + (match_operand:FMA4MODEF4 1 "nonimmediate_operand" "") + (match_operand:FMA4MODEF4 2 "nonimmediate_operand" ""))))] + "TARGET_FMA4 + && !ix86_fma4_valid_op_p (operands, insn, 4, true, 1, true) + && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, true) + && !reg_mentioned_p (operands[0], operands[1]) + && !reg_mentioned_p (operands[0], operands[2]) + && !reg_mentioned_p (operands[0], operands[3])" + [(const_int 0)] +{ + ix86_expand_fma4_multiple_memory (operands, 4, <MODE>mode); + emit_insn (gen_fma4_fnmadd<mode>4256 (operands[0], operands[1], + operands[2], operands[3])); + DONE; +}) + +;; Floating point negative multiply and subtract +;; Rewrite (- (a * b) - c) into the canonical form: ((-a) * b) - c +;; Allow 2 memory operands to help with optimization +(define_insn "fma4_fnmsub<mode>4256" + [(set (match_operand:FMA4MODEF4 0 "register_operand" "=x,x") + (minus:FMA4MODEF4 + (mult:FMA4MODEF4 + (neg:FMA4MODEF4 + (match_operand:FMA4MODEF4 1 "nonimmediate_operand" "x,x")) + (match_operand:FMA4MODEF4 2 "nonimmediate_operand" "x,xm")) + (match_operand:FMA4MODEF4 3 "nonimmediate_operand" "xm,x")))] + "TARGET_FMA4 + && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, false)" + "vfnmsub<fma4modesuffixf4>\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "<MODE>")]) + +;; Split fnmsub with two memory operands into a load and the fmsub. 
+(define_split + [(set (match_operand:FMA4MODEF4 0 "register_operand" "") + (minus:FMA4MODEF4 + (mult:FMA4MODEF4 + (neg:FMA4MODEF4 + (match_operand:FMA4MODEF4 1 "nonimmediate_operand" "")) + (match_operand:FMA4MODEF4 2 "nonimmediate_operand" "")) + (match_operand:FMA4MODEF4 3 "nonimmediate_operand" "")))] + "TARGET_FMA4 + && !ix86_fma4_valid_op_p (operands, insn, 4, true, 1, false) + && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, false) + && !reg_mentioned_p (operands[0], operands[1]) + && !reg_mentioned_p (operands[0], operands[2]) + && !reg_mentioned_p (operands[0], operands[3])" + [(const_int 0)] +{ + ix86_expand_fma4_multiple_memory (operands, 4, <MODE>mode); + emit_insn (gen_fma4_fnmsub<mode>4256 (operands[0], operands[1], + operands[2], operands[3])); + DONE; +}) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +(define_insn "fma4_fmadd<mode>4" + [(set (match_operand:SSEMODEF4 0 "register_operand" "=x,x,x") + (plus:SSEMODEF4 + (mult:SSEMODEF4 + (match_operand:SSEMODEF4 1 "nonimmediate_operand" "x,x,xm") + (match_operand:SSEMODEF4 2 "nonimmediate_operand" "x,xm,x")) + (match_operand:SSEMODEF4 3 "nonimmediate_operand" "xm,x,x")))] + "TARGET_FMA4 + && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, true)" + "vfmadd<ssemodesuffixf4>\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "<MODE>")]) + +;; Split fmadd with two memory operands into a load and the fmadd. 
+(define_split + [(set (match_operand:SSEMODEF4 0 "register_operand" "") + (plus:SSEMODEF4 + (mult:SSEMODEF4 + (match_operand:SSEMODEF4 1 "nonimmediate_operand" "") + (match_operand:SSEMODEF4 2 "nonimmediate_operand" "")) + (match_operand:SSEMODEF4 3 "nonimmediate_operand" "")))] + "TARGET_FMA4 + && !ix86_fma4_valid_op_p (operands, insn, 4, true, 1, true) + && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, true) + && !reg_mentioned_p (operands[0], operands[1]) + && !reg_mentioned_p (operands[0], operands[2]) + && !reg_mentioned_p (operands[0], operands[3])" + [(const_int 0)] +{ + ix86_expand_fma4_multiple_memory (operands, 4, <MODE>mode); + emit_insn (gen_fma4_fmadd<mode>4 (operands[0], operands[1], + operands[2], operands[3])); + DONE; +}) + +;; For the scalar operations, use operand1 for the upper words that aren't +;; modified, so restrict the forms that are generated. +;; Scalar version of fmadd +(define_insn "fma4_vmfmadd<mode>4" + [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x,x") + (vec_merge:SSEMODEF2P + (plus:SSEMODEF2P + (mult:SSEMODEF2P + (match_operand:SSEMODEF2P 1 "nonimmediate_operand" "x,x") + (match_operand:SSEMODEF2P 2 "nonimmediate_operand" "x,xm")) + (match_operand:SSEMODEF2P 3 "nonimmediate_operand" "xm,x")) + (match_dup 0) + (const_int 1)))] + "TARGET_FMA4 + && ix86_fma4_valid_op_p (operands, insn, 4, true, 1, true)" + "vfmadd<ssemodesuffixf2s>\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "<MODE>")]) + +;; Floating multiply and subtract +;; Allow two memory operands the same as fmadd +(define_insn "fma4_fmsub<mode>4" + [(set (match_operand:SSEMODEF4 0 "register_operand" "=x,x,x") + (minus:SSEMODEF4 + (mult:SSEMODEF4 + (match_operand:SSEMODEF4 1 "nonimmediate_operand" "x,x,xm") + (match_operand:SSEMODEF4 2 "nonimmediate_operand" "x,xm,x")) + (match_operand:SSEMODEF4 3 "nonimmediate_operand" "xm,x,x")))] + "TARGET_FMA4 + && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, true)" + 
"vfmsub<ssemodesuffixf4>\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "<MODE>")]) + +;; Split fmsub with two memory operands into a load and the fmsub. +(define_split + [(set (match_operand:SSEMODEF4 0 "register_operand" "") + (minus:SSEMODEF4 + (mult:SSEMODEF4 + (match_operand:SSEMODEF4 1 "nonimmediate_operand" "") + (match_operand:SSEMODEF4 2 "nonimmediate_operand" "")) + (match_operand:SSEMODEF4 3 "nonimmediate_operand" "")))] + "TARGET_FMA4 + && !ix86_fma4_valid_op_p (operands, insn, 4, true, 1, true) + && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, true) + && !reg_mentioned_p (operands[0], operands[1]) + && !reg_mentioned_p (operands[0], operands[2]) + && !reg_mentioned_p (operands[0], operands[3])" + [(const_int 0)] +{ + ix86_expand_fma4_multiple_memory (operands, 4, <MODE>mode); + emit_insn (gen_fma4_fmsub<mode>4 (operands[0], operands[1], + operands[2], operands[3])); + DONE; +}) + +;; For the scalar operations, use operand1 for the upper words that aren't +;; modified, so restrict the forms that are generated. +;; Scalar version of fmsub +(define_insn "fma4_vmfmsub<mode>4" + [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x,x") + (vec_merge:SSEMODEF2P + (minus:SSEMODEF2P + (mult:SSEMODEF2P + (match_operand:SSEMODEF2P 1 "nonimmediate_operand" "x,x") + (match_operand:SSEMODEF2P 2 "nonimmediate_operand" "x,xm")) + (match_operand:SSEMODEF2P 3 "nonimmediate_operand" "xm,x")) + (match_dup 0) + (const_int 1)))] + "TARGET_FMA4 + && ix86_fma4_valid_op_p (operands, insn, 4, true, 1, false)" + "vfmsub<ssemodesuffixf2s>\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssemuladd") + (set_attr "mode" "<MODE>")]) + +;; Floating point negative multiply and add +;; Rewrite (- (a * b) + c) into the canonical form: c - (a * b) +;; Note operands are out of order to simplify call to ix86_fma4_valid_p +;; Allow two memory operands to help in optimizing. 
(define_insn "fma4_fnmadd<mode>4"
  [(set (match_operand:SSEMODEF4 0 "register_operand" "=x,x,x")
	(minus:SSEMODEF4
	 (match_operand:SSEMODEF4 3 "nonimmediate_operand" "xm,x,x")
	 (mult:SSEMODEF4
	  (match_operand:SSEMODEF4 1 "nonimmediate_operand" "x,x,xm")
	  (match_operand:SSEMODEF4 2 "nonimmediate_operand" "x,xm,x"))))]
  "TARGET_FMA4
   && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, true)"
  "vfnmadd<ssemodesuffixf4>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
  [(set_attr "type" "ssemuladd")
   (set_attr "mode" "<MODE>")])

;; Split fnmadd with two memory operands into a load and the fnmadd.
(define_split
  [(set (match_operand:SSEMODEF4 0 "register_operand" "")
	(minus:SSEMODEF4
	 (match_operand:SSEMODEF4 3 "nonimmediate_operand" "")
	 (mult:SSEMODEF4
	  (match_operand:SSEMODEF4 1 "nonimmediate_operand" "")
	  (match_operand:SSEMODEF4 2 "nonimmediate_operand" ""))))]
  "TARGET_FMA4
   && !ix86_fma4_valid_op_p (operands, insn, 4, true, 1, true)
   && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, true)
   && !reg_mentioned_p (operands[0], operands[1])
   && !reg_mentioned_p (operands[0], operands[2])
   && !reg_mentioned_p (operands[0], operands[3])"
  [(const_int 0)]
{
  ix86_expand_fma4_multiple_memory (operands, 4, <MODE>mode);
  emit_insn (gen_fma4_fnmadd<mode>4 (operands[0], operands[1],
				     operands[2], operands[3]));
  DONE;
})

;; For the scalar operations, use operand1 for the upper words that aren't
;; modified, so restrict the forms that are generated.
;; Scalar version of fnmadd.
(define_insn "fma4_vmfnmadd<mode>4"
  [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x,x")
	(vec_merge:SSEMODEF2P
	 (minus:SSEMODEF2P
	  (match_operand:SSEMODEF2P 3 "nonimmediate_operand" "xm,x")
	  (mult:SSEMODEF2P
	   (match_operand:SSEMODEF2P 1 "nonimmediate_operand" "x,x")
	   (match_operand:SSEMODEF2P 2 "nonimmediate_operand" "x,xm")))
	 (match_dup 0)
	 (const_int 1)))]
  "TARGET_FMA4
   && ix86_fma4_valid_op_p (operands, insn, 4, true, 1, true)"
  "vfnmadd<ssemodesuffixf2s>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
  [(set_attr "type" "ssemuladd")
   (set_attr "mode" "<MODE>")])

;; Floating point negative multiply and subtract.
;; Rewrite (- (a * b) - c) into the canonical form: ((-a) * b) - c.
;; Allow 2 memory operands to help with optimization.
(define_insn "fma4_fnmsub<mode>4"
  [(set (match_operand:SSEMODEF4 0 "register_operand" "=x,x")
	(minus:SSEMODEF4
	 (mult:SSEMODEF4
	  (neg:SSEMODEF4
	   (match_operand:SSEMODEF4 1 "nonimmediate_operand" "x,x"))
	  (match_operand:SSEMODEF4 2 "nonimmediate_operand" "x,xm"))
	 (match_operand:SSEMODEF4 3 "nonimmediate_operand" "xm,x")))]
  "TARGET_FMA4
   && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, false)"
  "vfnmsub<ssemodesuffixf4>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
  [(set_attr "type" "ssemuladd")
   (set_attr "mode" "<MODE>")])

;; Split fnmsub with two memory operands into a load and the fnmsub.
(define_split
  [(set (match_operand:SSEMODEF4 0 "register_operand" "")
	(minus:SSEMODEF4
	 (mult:SSEMODEF4
	  (neg:SSEMODEF4
	   (match_operand:SSEMODEF4 1 "nonimmediate_operand" ""))
	  (match_operand:SSEMODEF4 2 "nonimmediate_operand" ""))
	 (match_operand:SSEMODEF4 3 "nonimmediate_operand" "")))]
  "TARGET_FMA4
   && !ix86_fma4_valid_op_p (operands, insn, 4, true, 1, false)
   && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, false)
   && !reg_mentioned_p (operands[0], operands[1])
   && !reg_mentioned_p (operands[0], operands[2])
   && !reg_mentioned_p (operands[0], operands[3])"
  [(const_int 0)]
{
  ix86_expand_fma4_multiple_memory (operands, 4, <MODE>mode);
  emit_insn (gen_fma4_fnmsub<mode>4 (operands[0], operands[1],
				     operands[2], operands[3]));
  DONE;
})

;; For the scalar operations, use operand1 for the upper words that aren't
;; modified, so restrict the forms that are generated.
;; Scalar version of fnmsub.
(define_insn "fma4_vmfnmsub<mode>4"
  [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x,x")
	(vec_merge:SSEMODEF2P
	 (minus:SSEMODEF2P
	  (mult:SSEMODEF2P
	   (neg:SSEMODEF2P
	    (match_operand:SSEMODEF2P 1 "nonimmediate_operand" "x,x"))
	   (match_operand:SSEMODEF2P 2 "nonimmediate_operand" "x,xm"))
	  (match_operand:SSEMODEF2P 3 "nonimmediate_operand" "xm,x"))
	 (match_dup 0)
	 (const_int 1)))]
  "TARGET_FMA4
   && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, false)"
  "vfnmsub<ssemodesuffixf2s>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
  [(set_attr "type" "ssemuladd")
   (set_attr "mode" "<MODE>")])

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; 256-bit intrinsic (UNSPEC-wrapped) forms of the FMA4 insns.
(define_insn "fma4i_fmadd<mode>4256"
  [(set (match_operand:FMA4MODEF4 0 "register_operand" "=x,x")
	(unspec:FMA4MODEF4
	 [(plus:FMA4MODEF4
	   (mult:FMA4MODEF4
	    (match_operand:FMA4MODEF4 1 "nonimmediate_operand" "x,x")
	    (match_operand:FMA4MODEF4 2 "nonimmediate_operand" "x,xm"))
	   (match_operand:FMA4MODEF4 3 "nonimmediate_operand" "xm,x"))]
	 UNSPEC_FMA4_INTRINSIC))]
  "TARGET_FMA4 && ix86_fma4_valid_op_p (operands, insn, 4, true, 1, true)"
  "vfmadd<fma4modesuffixf4>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
  [(set_attr "type" "ssemuladd")
   (set_attr "mode" "<MODE>")])

(define_insn "fma4i_fmsub<mode>4256"
  [(set (match_operand:FMA4MODEF4 0 "register_operand" "=x,x")
	(unspec:FMA4MODEF4
	 [(minus:FMA4MODEF4
	   (mult:FMA4MODEF4
	    (match_operand:FMA4MODEF4 1 "nonimmediate_operand" "x,x")
	    (match_operand:FMA4MODEF4 2 "nonimmediate_operand" "x,xm"))
	   (match_operand:FMA4MODEF4 3 "nonimmediate_operand" "xm,x"))]
	 UNSPEC_FMA4_INTRINSIC))]
  "TARGET_FMA4 && ix86_fma4_valid_op_p (operands, insn, 4, true, 1, true)"
  "vfmsub<fma4modesuffixf4>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
  [(set_attr "type" "ssemuladd")
   (set_attr "mode" "<MODE>")])

(define_insn "fma4i_fnmadd<mode>4256"
  [(set (match_operand:FMA4MODEF4 0 "register_operand" "=x,x")
	(unspec:FMA4MODEF4
	 [(minus:FMA4MODEF4
	   (match_operand:FMA4MODEF4 3 "nonimmediate_operand" "xm,x")
	   (mult:FMA4MODEF4
	    (match_operand:FMA4MODEF4 1 "nonimmediate_operand" "x,x")
	    (match_operand:FMA4MODEF4 2 "nonimmediate_operand" "x,xm")))]
	 UNSPEC_FMA4_INTRINSIC))]
  "TARGET_FMA4 && ix86_fma4_valid_op_p (operands, insn, 4, true, 1, true)"
  "vfnmadd<fma4modesuffixf4>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
  [(set_attr "type" "ssemuladd")
   (set_attr "mode" "<MODE>")])

(define_insn "fma4i_fnmsub<mode>4256"
  [(set (match_operand:FMA4MODEF4 0 "register_operand" "=x,x")
	(unspec:FMA4MODEF4
	 [(minus:FMA4MODEF4
	   (mult:FMA4MODEF4
	    (neg:FMA4MODEF4
	     (match_operand:FMA4MODEF4 1 "nonimmediate_operand" "x,x"))
	    (match_operand:FMA4MODEF4 2 "nonimmediate_operand" "x,xm"))
	   (match_operand:FMA4MODEF4 3 "nonimmediate_operand" "xm,x"))]
	 UNSPEC_FMA4_INTRINSIC))]
  "TARGET_FMA4 && ix86_fma4_valid_op_p (operands, insn, 4, true, 1, false)"
  "vfnmsub<fma4modesuffixf4>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
  [(set_attr "type" "ssemuladd")
   (set_attr "mode" "<MODE>")])
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; 128-bit intrinsic (UNSPEC-wrapped) forms of the FMA4 insns.
(define_insn "fma4i_fmadd<mode>4"
  [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x,x")
	(unspec:SSEMODEF2P
	 [(plus:SSEMODEF2P
	   (mult:SSEMODEF2P
	    (match_operand:SSEMODEF2P 1 "nonimmediate_operand" "x,x")
	    (match_operand:SSEMODEF2P 2 "nonimmediate_operand" "x,xm"))
	   (match_operand:SSEMODEF2P 3 "nonimmediate_operand" "xm,x"))]
	 UNSPEC_FMA4_INTRINSIC))]
  "TARGET_FMA4 && ix86_fma4_valid_op_p (operands, insn, 4, true, 1, true)"
  "vfmadd<ssemodesuffixf4>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
  [(set_attr "type" "ssemuladd")
   (set_attr "mode" "<MODE>")])

(define_insn "fma4i_fmsub<mode>4"
  [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x,x")
	(unspec:SSEMODEF2P
	 [(minus:SSEMODEF2P
	   (mult:SSEMODEF2P
	    (match_operand:SSEMODEF2P 1 "nonimmediate_operand" "x,x")
	    (match_operand:SSEMODEF2P 2 "nonimmediate_operand" "x,xm"))
	   (match_operand:SSEMODEF2P 3 "nonimmediate_operand" "xm,x"))]
	 UNSPEC_FMA4_INTRINSIC))]
  "TARGET_FMA4 && ix86_fma4_valid_op_p (operands, insn, 4, true, 1, true)"
  "vfmsub<ssemodesuffixf4>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
  [(set_attr "type" "ssemuladd")
   (set_attr "mode" "<MODE>")])

(define_insn "fma4i_fnmadd<mode>4"
  [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x,x")
	(unspec:SSEMODEF2P
	 [(minus:SSEMODEF2P
	   (match_operand:SSEMODEF2P 3 "nonimmediate_operand" "xm,x")
	   (mult:SSEMODEF2P
	    (match_operand:SSEMODEF2P 1 "nonimmediate_operand" "x,x")
	    (match_operand:SSEMODEF2P 2 "nonimmediate_operand" "x,xm")))]
	 UNSPEC_FMA4_INTRINSIC))]
  "TARGET_FMA4 && ix86_fma4_valid_op_p (operands, insn, 4, true, 1, true)"
  "vfnmadd<ssemodesuffixf4>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
  [(set_attr "type" "ssemuladd")
   (set_attr "mode" "<MODE>")])

(define_insn "fma4i_fnmsub<mode>4"
  [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x,x")
	(unspec:SSEMODEF2P
	 [(minus:SSEMODEF2P
	   (mult:SSEMODEF2P
	    (neg:SSEMODEF2P
	     (match_operand:SSEMODEF2P 1 "nonimmediate_operand" "x,x"))
	    (match_operand:SSEMODEF2P 2 "nonimmediate_operand" "x,xm"))
	   (match_operand:SSEMODEF2P 3 "nonimmediate_operand" "xm,x"))]
	 UNSPEC_FMA4_INTRINSIC))]
  "TARGET_FMA4 && ix86_fma4_valid_op_p (operands, insn, 4, true, 1, false)"
  "vfnmsub<ssemodesuffixf4>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
  [(set_attr "type" "ssemuladd")
   (set_attr "mode" "<MODE>")])

;; For the scalar operations, use operand1 for the upper words that aren't
;; modified, so restrict the forms that are accepted.
(define_insn "fma4i_vmfmadd<mode>4"
  [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x,x")
	(unspec:SSEMODEF2P
	 [(vec_merge:SSEMODEF2P
	   (plus:SSEMODEF2P
	    (mult:SSEMODEF2P
	     (match_operand:SSEMODEF2P 1 "register_operand" "x,x")
	     (match_operand:SSEMODEF2P 2 "nonimmediate_operand" "x,xm"))
	    (match_operand:SSEMODEF2P 3 "nonimmediate_operand" "xm,x"))
	   (match_dup 0)
	   (const_int 1))]
	 UNSPEC_FMA4_INTRINSIC))]
  "TARGET_FMA4 && ix86_fma4_valid_op_p (operands, insn, 4, true, 1, false)"
  "vfmadd<ssemodesuffixf2s>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
  [(set_attr "type" "ssemuladd")
   (set_attr "mode" "<ssescalarmode>")])

(define_insn "fma4i_vmfmsub<mode>4"
  [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x,x")
	(unspec:SSEMODEF2P
	 [(vec_merge:SSEMODEF2P
	   (minus:SSEMODEF2P
	    (mult:SSEMODEF2P
	     (match_operand:SSEMODEF2P 1 "register_operand" "x,x")
	     (match_operand:SSEMODEF2P 2 "nonimmediate_operand" "x,xm"))
	    (match_operand:SSEMODEF2P 3 "nonimmediate_operand" "xm,x"))
	   (match_dup 0)
	   (const_int 1))]
	 UNSPEC_FMA4_INTRINSIC))]
  "TARGET_FMA4 && ix86_fma4_valid_op_p (operands, insn, 4, true, 1, false)"
  "vfmsub<ssemodesuffixf2s>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
  [(set_attr "type" "ssemuladd")
   (set_attr "mode" "<ssescalarmode>")])

(define_insn "fma4i_vmfnmadd<mode>4"
  [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x,x")
	(unspec:SSEMODEF2P
	 [(vec_merge:SSEMODEF2P
	   (minus:SSEMODEF2P
	    (match_operand:SSEMODEF2P 3 "nonimmediate_operand" "xm,x")
	    (mult:SSEMODEF2P
	     (match_operand:SSEMODEF2P 1 "nonimmediate_operand" "x,x")
	     (match_operand:SSEMODEF2P 2 "nonimmediate_operand" "x,xm")))
	   (match_dup 0)
	   (const_int 1))]
	 UNSPEC_FMA4_INTRINSIC))]
  "TARGET_FMA4 && ix86_fma4_valid_op_p (operands, insn, 4, true, 1, true)"
  "vfnmadd<ssemodesuffixf2s>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
  [(set_attr "type" "ssemuladd")
   (set_attr "mode" "<ssescalarmode>")])

(define_insn "fma4i_vmfnmsub<mode>4"
  [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x,x")
	(unspec:SSEMODEF2P
	 [(vec_merge:SSEMODEF2P
	   (minus:SSEMODEF2P
	    (mult:SSEMODEF2P
	     (neg:SSEMODEF2P
	      (match_operand:SSEMODEF2P 1 "register_operand" "x,x"))
	     (match_operand:SSEMODEF2P 2 "nonimmediate_operand" "x,xm"))
	    (match_operand:SSEMODEF2P 3 "nonimmediate_operand" "xm,x"))
	   (match_dup 0)
	   (const_int 1))]
	 UNSPEC_FMA4_INTRINSIC))]
  "TARGET_FMA4 && ix86_fma4_valid_op_p (operands, insn, 4, true, 1, false)"
  "vfnmsub<ssemodesuffixf2s>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
  [(set_attr "type" "ssemuladd")
   (set_attr "mode" "<ssescalarmode>")])

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;
;; FMA4 Parallel floating point multiply addsub and subadd operations
;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(define_insn "fma4_fmaddsubv8sf4"
  [(set (match_operand:V8SF 0 "register_operand" "=x,x")
	(vec_merge:V8SF
	 (plus:V8SF
	  (mult:V8SF
	   (match_operand:V8SF 1 "nonimmediate_operand" "x,x")
	   (match_operand:V8SF 2 "nonimmediate_operand" "x,xm"))
	  (match_operand:V8SF 3 "nonimmediate_operand" "xm,x"))
	 (minus:V8SF
	  (mult:V8SF
	   (match_dup 1)
	   (match_dup 2))
	  (match_dup 3))
	 (const_int 170)))]
  "TARGET_FMA4
   && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, true)"
  "vfmaddsubps\t{%3, %2, %1, %0|%0, %1, %2, %3}"
  [(set_attr "type" "ssemuladd")
   (set_attr "mode" "V8SF")])

(define_insn "fma4_fmaddsubv4df4"
  [(set (match_operand:V4DF 0 "register_operand" "=x,x")
	(vec_merge:V4DF
	 (plus:V4DF
	  (mult:V4DF
	   (match_operand:V4DF 1 "nonimmediate_operand" "x,x")
	   (match_operand:V4DF 2 "nonimmediate_operand" "x,xm"))
	  (match_operand:V4DF 3 "nonimmediate_operand" "xm,x"))
	 (minus:V4DF
	  (mult:V4DF
	   (match_dup 1)
	   (match_dup 2))
	  (match_dup 3))
	 (const_int 10)))]
  "TARGET_FMA4
   && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, true)"
  "vfmaddsubpd\t{%3, %2, %1, %0|%0, %1, %2, %3}"
  [(set_attr "type" "ssemuladd")
   (set_attr "mode" "V4DF")])

(define_insn "fma4_fmaddsubv4sf4"
  [(set (match_operand:V4SF 0 "register_operand" "=x,x")
	(vec_merge:V4SF
	 (plus:V4SF
	  (mult:V4SF
	   (match_operand:V4SF 1 "nonimmediate_operand" "x,x")
	   (match_operand:V4SF 2 "nonimmediate_operand" "x,xm"))
	  (match_operand:V4SF 3 "nonimmediate_operand" "xm,x"))
	 (minus:V4SF
	  (mult:V4SF
	   (match_dup 1)
	   (match_dup 2))
	  (match_dup 3))
	 (const_int 10)))]
  "TARGET_FMA4
   && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, true)"
  "vfmaddsubps\t{%3, %2, %1, %0|%0, %1, %2, %3}"
  [(set_attr "type" "ssemuladd")
   (set_attr "mode" "V4SF")])

(define_insn "fma4_fmaddsubv2df4"
  [(set (match_operand:V2DF 0 "register_operand" "=x,x")
	(vec_merge:V2DF
	 (plus:V2DF
	  (mult:V2DF
	   (match_operand:V2DF 1 "nonimmediate_operand" "x,x")
	   (match_operand:V2DF 2 "nonimmediate_operand" "x,xm"))
	  (match_operand:V2DF 3 "nonimmediate_operand" "xm,x"))
	 (minus:V2DF
	  (mult:V2DF
	   (match_dup 1)
	   (match_dup 2))
	  (match_dup 3))
	 (const_int 2)))]
  "TARGET_FMA4
   && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, true)"
  "vfmaddsubpd\t{%3, %2, %1, %0|%0, %1, %2, %3}"
  [(set_attr "type" "ssemuladd")
   (set_attr "mode" "V2DF")])

(define_insn "fma4_fmsubaddv8sf4"
  [(set (match_operand:V8SF 0 "register_operand" "=x,x")
	(vec_merge:V8SF
	 (plus:V8SF
	  (mult:V8SF
	   (match_operand:V8SF 1 "nonimmediate_operand" "x,x")
	   (match_operand:V8SF 2 "nonimmediate_operand" "x,xm"))
	  (match_operand:V8SF 3 "nonimmediate_operand" "xm,x"))
	 (minus:V8SF
	  (mult:V8SF
	   (match_dup 1)
	   (match_dup 2))
	  (match_dup 3))
	 (const_int 85)))]
  "TARGET_FMA4
   && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, true)"
  "vfmsubaddps\t{%3, %2, %1, %0|%0, %1, %2, %3}"
  [(set_attr "type" "ssemuladd")
   (set_attr "mode" "V8SF")])

(define_insn "fma4_fmsubaddv4df4"
  [(set (match_operand:V4DF 0 "register_operand" "=x,x")
	(vec_merge:V4DF
	 (plus:V4DF
	  (mult:V4DF
	   (match_operand:V4DF 1 "nonimmediate_operand" "x,x")
	   (match_operand:V4DF 2 "nonimmediate_operand" "x,xm"))
	  (match_operand:V4DF 3 "nonimmediate_operand" "xm,x"))
	 (minus:V4DF
	  (mult:V4DF
	   (match_dup 1)
	   (match_dup 2))
	  (match_dup 3))
	 (const_int 5)))]
  "TARGET_FMA4
   && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, true)"
  "vfmsubaddpd\t{%3, %2, %1, %0|%0, %1, %2, %3}"
  [(set_attr "type" "ssemuladd")
   (set_attr "mode" "V4DF")])

(define_insn "fma4_fmsubaddv4sf4"
  [(set (match_operand:V4SF 0 "register_operand" "=x,x")
	(vec_merge:V4SF
	 (plus:V4SF
	  (mult:V4SF
	   (match_operand:V4SF 1 "nonimmediate_operand" "x,x")
	   (match_operand:V4SF 2 "nonimmediate_operand" "x,xm"))
	  (match_operand:V4SF 3 "nonimmediate_operand" "xm,x"))
	 (minus:V4SF
	  (mult:V4SF
	   (match_dup 1)
	   (match_dup 2))
	  (match_dup 3))
	 (const_int 5)))]
  "TARGET_FMA4
   && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, true)"
  "vfmsubaddps\t{%3, %2, %1, %0|%0, %1, %2, %3}"
  [(set_attr "type" "ssemuladd")
   (set_attr "mode" "V4SF")])

(define_insn "fma4_fmsubaddv2df4"
  [(set (match_operand:V2DF 0 "register_operand" "=x,x")
	(vec_merge:V2DF
	 (plus:V2DF
	  (mult:V2DF
	   (match_operand:V2DF 1 "nonimmediate_operand" "x,x")
	   (match_operand:V2DF 2 "nonimmediate_operand" "x,xm"))
	  (match_operand:V2DF 3 "nonimmediate_operand" "xm,x"))
	 (minus:V2DF
	  (mult:V2DF
	   (match_dup 1)
	   (match_dup 2))
	  (match_dup 3))
	 (const_int 1)))]
  "TARGET_FMA4
   && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, true)"
  "vfmsubaddpd\t{%3, %2, %1, %0|%0, %1, %2, %3}"
  [(set_attr "type" "ssemuladd")
   (set_attr "mode" "V2DF")])

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(define_insn "fma4i_fmaddsubv8sf4"
  [(set (match_operand:V8SF 0 "register_operand" "=x,x")
	(unspec:V8SF
	 [(vec_merge:V8SF
	   (plus:V8SF
	    (mult:V8SF
	     (match_operand:V8SF 1 "nonimmediate_operand" "x,x")
	     (match_operand:V8SF 2 "nonimmediate_operand" "x,xm"))
	    (match_operand:V8SF 3 "nonimmediate_operand" "xm,x"))
	   (minus:V8SF
	    (mult:V8SF
	     (match_dup 1)
	     (match_dup 2))
	    (match_dup 3))
	   (const_int 170))]
	 UNSPEC_FMA4_INTRINSIC))]
  "TARGET_FMA4
   && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, true)"
  "vfmaddsubps\t{%3, %2, %1, %0|%0, %1, %2, %3}"
  [(set_attr "type" "ssemuladd")
   (set_attr "mode" "V8SF")])

(define_insn "fma4i_fmaddsubv4df4"
  [(set (match_operand:V4DF 0 "register_operand" "=x,x")
	(unspec:V4DF
	 [(vec_merge:V4DF
	   (plus:V4DF
	    (mult:V4DF
	     (match_operand:V4DF 1 "nonimmediate_operand" "x,x")
	     (match_operand:V4DF 2 "nonimmediate_operand" "x,xm"))
	    (match_operand:V4DF 3 "nonimmediate_operand" "xm,x"))
	   (minus:V4DF
	    (mult:V4DF
	     (match_dup 1)
	     (match_dup 2))
	    (match_dup 3))
	   (const_int 10))]
	 UNSPEC_FMA4_INTRINSIC))]
  "TARGET_FMA4
   && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, true)"
  "vfmaddsubpd\t{%3, %2, %1, %0|%0, %1, %2, %3}"
  [(set_attr "type" "ssemuladd")
   (set_attr "mode" "V4DF")])

(define_insn "fma4i_fmaddsubv4sf4"
  [(set (match_operand:V4SF 0 "register_operand" "=x,x")
	(unspec:V4SF
	 [(vec_merge:V4SF
	   (plus:V4SF
	    (mult:V4SF
	     (match_operand:V4SF 1 "nonimmediate_operand" "x,x")
	     (match_operand:V4SF 2 "nonimmediate_operand" "x,xm"))
	    (match_operand:V4SF 3 "nonimmediate_operand" "xm,x"))
	   (minus:V4SF
	    (mult:V4SF
	     (match_dup 1)
	     (match_dup 2))
	    (match_dup 3))
	   (const_int 10))]
	 UNSPEC_FMA4_INTRINSIC))]
  "TARGET_FMA4
   && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, true)"
  "vfmaddsubps\t{%3, %2, %1, %0|%0, %1, %2, %3}"
  [(set_attr "type" "ssemuladd")
   (set_attr "mode" "V4SF")])

(define_insn "fma4i_fmaddsubv2df4"
  [(set (match_operand:V2DF 0 "register_operand" "=x,x")
	(unspec:V2DF
	 [(vec_merge:V2DF
	   (plus:V2DF
	    (mult:V2DF
	     (match_operand:V2DF 1 "nonimmediate_operand" "x,x")
	     (match_operand:V2DF 2 "nonimmediate_operand" "x,xm"))
	    (match_operand:V2DF 3 "nonimmediate_operand" "xm,x"))
	   (minus:V2DF
	    (mult:V2DF
	     (match_dup 1)
	     (match_dup 2))
	    (match_dup 3))
	   (const_int 2))]
	 UNSPEC_FMA4_INTRINSIC))]
  "TARGET_FMA4
   && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, true)"
  "vfmaddsubpd\t{%3, %2, %1, %0|%0, %1, %2, %3}"
  [(set_attr "type" "ssemuladd")
   (set_attr "mode" "V2DF")])

(define_insn "fma4i_fmsubaddv8sf4"
  [(set (match_operand:V8SF 0 "register_operand" "=x,x")
	(unspec:V8SF
	 [(vec_merge:V8SF
	   (plus:V8SF
	    (mult:V8SF
	     (match_operand:V8SF 1 "nonimmediate_operand" "x,x")
	     (match_operand:V8SF 2 "nonimmediate_operand" "x,xm"))
	    (match_operand:V8SF 3 "nonimmediate_operand" "xm,x"))
	   (minus:V8SF
	    (mult:V8SF
	     (match_dup 1)
	     (match_dup 2))
	    (match_dup 3))
	   (const_int 85))]
	 UNSPEC_FMA4_INTRINSIC))]
  "TARGET_FMA4
   && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, true)"
  "vfmsubaddps\t{%3, %2, %1, %0|%0, %1, %2, %3}"
  [(set_attr "type" "ssemuladd")
   (set_attr "mode" "V8SF")])

(define_insn "fma4i_fmsubaddv4df4"
  [(set (match_operand:V4DF 0 "register_operand" "=x,x")
	(unspec:V4DF
	 [(vec_merge:V4DF
	   (plus:V4DF
	    (mult:V4DF
	     (match_operand:V4DF 1 "nonimmediate_operand" "x,x")
	     (match_operand:V4DF 2 "nonimmediate_operand" "x,xm"))
	    (match_operand:V4DF 3 "nonimmediate_operand" "xm,x"))
	   (minus:V4DF
	    (mult:V4DF
	     (match_dup 1)
	     (match_dup 2))
	    (match_dup 3))
	   (const_int 5))]
	 UNSPEC_FMA4_INTRINSIC))]
  "TARGET_FMA4
   && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, true)"
  "vfmsubaddpd\t{%3, %2, %1, %0|%0, %1, %2, %3}"
  [(set_attr "type" "ssemuladd")
   (set_attr "mode" "V4DF")])

(define_insn "fma4i_fmsubaddv4sf4"
  [(set (match_operand:V4SF 0 "register_operand" "=x,x")
	(unspec:V4SF
	 [(vec_merge:V4SF
	   (plus:V4SF
	    (mult:V4SF
	     (match_operand:V4SF 1 "nonimmediate_operand" "x,x")
	     (match_operand:V4SF 2 "nonimmediate_operand" "x,xm"))
	    (match_operand:V4SF 3 "nonimmediate_operand" "xm,x"))
	   (minus:V4SF
	    (mult:V4SF
	     (match_dup 1)
	     (match_dup 2))
	    (match_dup 3))
	   (const_int 5))]
	 UNSPEC_FMA4_INTRINSIC))]
  "TARGET_FMA4
   && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, true)"
  "vfmsubaddps\t{%3, %2, %1, %0|%0, %1, %2, %3}"
  [(set_attr "type" "ssemuladd")
   (set_attr "mode" "V4SF")])

(define_insn "fma4i_fmsubaddv2df4"
  [(set (match_operand:V2DF 0 "register_operand" "=x,x")
	(unspec:V2DF
	 [(vec_merge:V2DF
	   (plus:V2DF
	    (mult:V2DF
	     (match_operand:V2DF 1 "nonimmediate_operand" "x,x")
	     (match_operand:V2DF 2 "nonimmediate_operand" "x,xm"))
	    (match_operand:V2DF 3 "nonimmediate_operand" "xm,x"))
	   (minus:V2DF
	    (mult:V2DF
	     (match_dup 1)
	     (match_dup 2))
	    (match_dup 3))
	   (const_int 1))]
	 UNSPEC_FMA4_INTRINSIC))]
  "TARGET_FMA4
   && ix86_fma4_valid_op_p (operands, insn, 4, true, 2, true)"
  "vfmsubaddpd\t{%3, %2, %1, %0|%0, %1, %2, %3}"
  [(set_attr "type" "ssemuladd")
   (set_attr "mode" "V2DF")])

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;
;; Parallel single-precision floating point conversion operations
;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
diff --git a/gcc/config/i386/x86intrin.h b/gcc/config/i386/x86intrin.h
index 705300c23eb..7bc47f8f15d 100644
--- a/gcc/config/i386/x86intrin.h
+++ b/gcc/config/i386/x86intrin.h
@@ -46,7 +46,7 @@
 #include <tmmintrin.h>
 #endif

-#ifdef __SSE4a__
+#ifdef __SSE4A__
 #include <ammintrin.h>
 #endif
@@ -54,8 +54,8 @@
 #include <smmintrin.h>
 #endif

-#ifdef __SSE5__
-#include <bmmintrin.h>
+#ifdef __FMA4__
+#include <fma4intrin.h>
 #endif

 #if defined (__AES__) || defined (__PCLMUL__)