Diffstat (limited to 'gcc/config')
-rw-r--r--  gcc/config/sh/crt1.asm             | 134
-rw-r--r--  gcc/config/sh/divcost-analysis     |   7
-rw-r--r--  gcc/config/sh/divtab-sh4-300.c     |  81
-rw-r--r--  gcc/config/sh/embed-elf.h          |   1
-rw-r--r--  gcc/config/sh/lib1funcs-4-300.asm  | 938
-rw-r--r--  gcc/config/sh/sh-protos.h          |   6
-rw-r--r--  gcc/config/sh/sh.c                 | 447
-rw-r--r--  gcc/config/sh/sh.h                 |  20
-rw-r--r--  gcc/config/sh/sh.md                | 212
-rw-r--r--  gcc/config/sh/sh.opt               |  57
-rw-r--r--  gcc/config/sh/sh1.md               |   6
-rw-r--r--  gcc/config/sh/sh4-300.md           | 288
-rw-r--r--  gcc/config/sh/sh4.md               |  28
-rw-r--r--  gcc/config/sh/sh4a.md              |  14
-rw-r--r--  gcc/config/sh/superh.h             |  10
-rw-r--r--  gcc/config/sh/t-sh                 |  17
16 files changed, 2037 insertions(+), 229 deletions(-)
diff --git a/gcc/config/sh/crt1.asm b/gcc/config/sh/crt1.asm
index c110fa07427..7aa684434d7 100644
--- a/gcc/config/sh/crt1.asm
+++ b/gcc/config/sh/crt1.asm
@@ -1,4 +1,5 @@
-/* Copyright (C) 2000, 2001, 2003, 2004, 2005 Free Software Foundation, Inc.
+/* Copyright (C) 2000, 2001, 2003, 2004, 2005, 2006
+ Free Software Foundation, Inc.
This file was pretty much copied from newlib.
This file is part of GCC.
@@ -894,25 +895,12 @@ ___main:
nop
#ifdef VBR_SETUP
! Exception handlers
- .balign 256
+ .section .text.vbr, "ax"
vbr_start:
- mov.l 2f, r0 ! load the old vbr setting (if any)
- mov.l @r0, r0
- cmp/eq #0, r0
- bf 1f
- ! no previous vbr - jump to own generic handler
- bra handler
- nop
-1: ! there was a previous handler - chain them
- jmp @r0
- nop
- .balign 4
-2:
- .long old_vbr
- .balign 256
+ .org 0x100
vbr_100:
- #ifdef PROFILE
+#ifdef PROFILE
! Note on register usage.
! we use r0..r3 as scratch in this code. If we are here due to a trapa for profiling
! then this is OK as we are just before executing any function code.
@@ -1017,50 +1005,7 @@ handler_100:
2:
.long old_vbr
- .balign 256
-vbr_200:
- mov.l 2f, r0 ! load the old vbr setting (if any)
- mov.l @r0, r0
- cmp/eq #0, r0
- bf 1f
- ! no previous vbr - jump to own generic handler
- bra handler
- nop
-1: ! there was a previous handler - chain them
- add #0x7f, r0 ! 0x7f
- add #0x7f, r0 ! 0xfe
- add #0x7f, r0 ! 0x17d
- add #0x7f, r0 ! 0x1fc
- add #0x4, r0 ! add 0x200 without corrupting another register
- jmp @r0
- nop
- .balign 4
-2:
- .long old_vbr
-
- .balign 256
-vbr_300:
- mov.l 2f, r0 ! load the old vbr setting (if any)
- mov.l @r0, r0
- cmp/eq #0, r0
- bf 1f
- ! no previous vbr - jump to own generic handler
- bra handler
- nop
-1: ! there was a previous handler - chain them
- rotcr r0
- rotcr r0
- add #0x7f, r0 ! 0x1fc
- add #0x41, r0 ! 0x300
- rotcl r0
- rotcl r0 ! Add 0x300 without corrupting another register
- jmp @r0
- nop
- .balign 4
-2:
- .long old_vbr
-
- .balign 256
+ .org 0x400
vbr_400: ! Should be at vbr+0x400
mov.l 2f, r0 ! load the old vbr setting (if any)
mov.l @r0, r0
@@ -1103,28 +1048,7 @@ handler:
jmp @r2
nop
- .balign 256
-vbr_500:
- mov.l 2f, r0 ! load the old vbr setting (if any)
- mov.l @r0, r0
- cmp/eq #0, r0
- ! no previous vbr - jump to own generic handler
- bt handler
- ! there was a previous handler - chain them
- rotcr r0
- rotcr r0
- add #0x7f, r0 ! 0x1fc
- add #0x7f, r0 ! 0x3f8
- add #0x42, r0 ! 0x500
- rotcl r0
- rotcl r0 ! Add 0x500 without corrupting another register
- jmp @r0
- nop
- .balign 4
-2:
- .long old_vbr
-
- .balign 256
+ .org 0x600
vbr_600:
#ifdef PROFILE
! Should be at vbr+0x600
@@ -1140,11 +1064,48 @@ vbr_600:
mov.l r6,@-r15
mov.l r7,@-r15
sts.l pr,@-r15
+ sts.l mach,@-r15
+ sts.l macl,@-r15
+#if defined(__SH_FPU_ANY__)
+ ! Save fpul and fpscr, save fr0-fr7 in 64 bit mode
+ ! and set the pervading precision for the timer_handler
+ mov #0,r0
+ sts.l fpul,@-r15
+ sts.l fpscr,@-r15
+ lds r0,fpscr ! Clear fpscr
+ fmov fr0,@-r15
+ fmov fr1,@-r15
+ fmov fr2,@-r15
+ fmov fr3,@-r15
+ mov.l pervading_precision_k,r0
+ fmov fr4,@-r15
+ fmov fr5,@-r15
+ mov.l @r0,r0
+ fmov fr6,@-r15
+ fmov fr7,@-r15
+ lds r0,fpscr
+#endif /* __SH_FPU_ANY__ */
! Pass interrupted pc to timer_handler as first parameter (r4).
stc spc, r4
mov.l timer_handler_k, r0
jsr @r0
nop
+#if defined(__SH_FPU_ANY__)
+ mov #0,r0
+ lds r0,fpscr ! Clear the fpscr
+ fmov @r15+,fr7
+ fmov @r15+,fr6
+ fmov @r15+,fr5
+ fmov @r15+,fr4
+ fmov @r15+,fr3
+ fmov @r15+,fr2
+ fmov @r15+,fr1
+ fmov @r15+,fr0
+ lds.l @r15+,fpscr
+ lds.l @r15+,fpul
+#endif /* __SH_FPU_ANY__ */
+ lds.l @r15+,macl
+ lds.l @r15+,mach
lds.l @r15+,pr
mov.l @r15+,r7
mov.l @r15+,r6
@@ -1157,6 +1118,13 @@ vbr_600:
stc sgr, r15 ! Restore r15, destroyed by this sequence.
rte
nop
+#if defined(__SH_FPU_ANY__)
+ .balign 4
+pervading_precision_k:
+#define CONCAT1(A,B) A##B
+#define CONCAT(A,B) CONCAT1(A,B)
+ .long CONCAT(__USER_LABEL_PREFIX__,__fpscr_values)+4
+#endif
#else
mov.l 2f, r0 ! Load the old vbr setting (if any).
mov.l @r0, r0
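The vbr_100/vbr_400/vbr_600 stubs that remain all chain to any pre-existing vector table the same way; the deleted vbr_200/vbr_300/vbr_500 variants only differed in the register juggling needed to add their offset to the old VBR base without a second scratch register. A rough C rendering of the common stub logic (illustrative only; old_vbr and generic_handler stand in for the assembly labels above):

    /* Sketch of what each remaining vector stub does; OFF is the stub's
       fixed offset (0x100, 0x400 or 0x600) that .org now guarantees.  */
    extern unsigned long old_vbr;
    extern void generic_handler (void);

    static void
    vector_stub (unsigned long off)
    {
      if (old_vbr == 0)
        generic_handler ();                       /* no previous VBR table */
      else
        ((void (*) (void)) (old_vbr + off)) ();   /* chain to the old handler */
    }

With .org placing each stub at its architectural offset in the new .text.vbr section, the deleted sequences that built 0x200, 0x300 and 0x500 out of small add/rotate steps are no longer needed.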
diff --git a/gcc/config/sh/divcost-analysis b/gcc/config/sh/divcost-analysis
index 541e31324b3..0296269bb52 100644
--- a/gcc/config/sh/divcost-analysis
+++ b/gcc/config/sh/divcost-analysis
@@ -38,12 +38,17 @@ div_r8_neg -> div_r8_neg_end: 18
div_le128_neg -> div_by_1_neg: 4
div_le128_neg -> rts 18
- absolute divisor range:
+ sh4-200 absolute divisor range:
                   1    [2..128]  [129..64K)  [64K..|dividend|/256]   >=64K,>|dividend/256|
udiv 18 22 38 32 30
sdiv pos: 20 24 41 35 32
sdiv neg: 15 25 42 36 33
+ sh4-300 absolute divisor range:
+ 8 bit 16 bit 24 bit > 24 bit
+udiv 15 35 28 25
+sdiv 14 36 34 31
+
fp-based:
diff --git a/gcc/config/sh/divtab-sh4-300.c b/gcc/config/sh/divtab-sh4-300.c
new file mode 100644
index 00000000000..448b0b8af8e
--- /dev/null
+++ b/gcc/config/sh/divtab-sh4-300.c
@@ -0,0 +1,81 @@
+/* Copyright (C) 2004, 2006 Free Software Foundation, Inc.
+
+This file is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation; either version 2, or (at your option) any
+later version.
+
+In addition to the permissions in the GNU General Public License, the
+Free Software Foundation gives you unlimited permission to link the
+compiled version of this file into combinations with other programs,
+and to distribute those combinations without any restriction coming
+from the use of this file. (The General Public License restrictions
+do apply in other respects; for example, they cover modification of
+the file, and distribution when not linked into a combine
+executable.)
+
+This file is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; see the file COPYING. If not, write to
+the Free Software Foundation, 51 Franklin Street, Fifth Floor,
+Boston, MA 02110-1301, USA. */
+
+/* Calculate division table for ST40-300 integer division
+ Contributed by Joern Rennecke
+ joern.rennecke@st.com */
+
+#include <stdio.h>
+#include <math.h>
+
+int
+main ()
+{
+ int i, j;
+ double q, r, err, max_err = 0, max_s_err = 0;
+
+ puts("/* This table has been generated by divtab-sh4.c. */");
+ puts ("\t.balign 4");
+ for (i = -128; i < 128; i++)
+ {
+ int n = 0;
+ if (i == 0)
+ {
+ /* output some dummy number for 1/0. */
+ puts ("LOCAL(div_table_clz):\n\t.byte\t0");
+ continue;
+ }
+ for (j = i < 0 ? -i : i; j < 128; j += j)
+ n++;
+ printf ("\t.byte\t%d\n", n - 7);
+ }
+ puts("\
+/* 1/-128 .. 1/127, normalized. There is an implicit leading 1 in bit 32,\n\
+ or in bit 33 for powers of two. */\n\
+ .balign 4");
+ for (i = -128; i < 128; i++)
+ {
+ if (i == 0)
+ {
+ puts ("LOCAL(div_table_inv):\n\t.long\t0x0");
+ continue;
+ }
+ j = i < 0 ? -i : i;
+ while (j < 64)
+ j += j;
+ q = 4.*(1<<30)*128/j;
+ r = ceil (q);
+ printf ("\t.long\t0x%X\n", (unsigned) r);
+ err = r - q;
+ if (err > max_err)
+ max_err = err;
+ err = err * j / 128;
+ if (err > max_s_err)
+ max_s_err = err;
+ }
+ printf ("\t/* maximum error: %f scaled: %f*/\n", max_err, max_s_err);
+  return 0;
+}
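For a concrete reading of the two tables this generator emits, take divisor 3: the first loop doubles 3 up to 96 in six steps, so the emitted clz byte is 6 - 7 = -1, and the normalized reciprocal is ceil(2^39/96) = 0x155555556, stored as 0x55555556 with the leading one left implicit in bit 32. A stand-alone check of that encoding (not part of the generator; the variable names are ad hoc):

    #include <assert.h>
    #include <stdio.h>

    int
    main (void)
    {
      int d = 3, n = 0, j;
      unsigned long long q;

      for (j = d; j < 128; j += j)  /* 3,6,12,24,48,96 -> n == 6 */
        n++;
      assert (n - 7 == -1);         /* the .byte emitted for divisor 3 */

      for (j = d; j < 64; j += j)
        ;                           /* normalize into [64,128): j == 96 */
      /* 2^39/96 is not exact, so ceil == floor + 1 here.  */
      q = (1ULL << 39) / j + 1;
      printf ("0x%llX -> .long 0x%X\n", q, (unsigned) q);
      return 0;
    }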
diff --git a/gcc/config/sh/embed-elf.h b/gcc/config/sh/embed-elf.h
index 4497cf34636..0d817cacf85 100644
--- a/gcc/config/sh/embed-elf.h
+++ b/gcc/config/sh/embed-elf.h
@@ -32,6 +32,7 @@ Boston, MA 02110-1301, USA. */
#define LIBGCC_SPEC "%{!shared: \
%{m4-100*:-lic_invalidate_array_4-100} \
%{m4-200*:-lic_invalidate_array_4-200} \
+  %{m4-300*|m4-340:-lic_invalidate_array_4a %{!Os: -lgcc-4-300}} \
%{m4a*:-lic_invalidate_array_4a}} \
%{Os: -lgcc-Os-4-200} \
-lgcc \
diff --git a/gcc/config/sh/lib1funcs-4-300.asm b/gcc/config/sh/lib1funcs-4-300.asm
new file mode 100644
index 00000000000..b07912425af
--- /dev/null
+++ b/gcc/config/sh/lib1funcs-4-300.asm
@@ -0,0 +1,938 @@
+/* Copyright (C) 2004, 2006 Free Software Foundation, Inc.
+
+This file is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation; either version 2, or (at your option) any
+later version.
+
+In addition to the permissions in the GNU General Public License, the
+Free Software Foundation gives you unlimited permission to link the
+compiled version of this file into combinations with other programs,
+and to distribute those combinations without any restriction coming
+from the use of this file. (The General Public License restrictions
+do apply in other respects; for example, they cover modification of
+the file, and distribution when not linked into a combine
+executable.)
+
+This file is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; see the file COPYING. If not, write to
+the Free Software Foundation, 51 Franklin Street, Fifth Floor,
+Boston, MA 02110-1301, USA. */
+
+/* libgcc routines for the STMicroelectronics ST40-300 CPU.
+   Contributed by Joern Rennecke joern.rennecke@st.com.  */
+
+#include "lib1funcs.h"
+
+#ifdef L_div_table
+#if defined (__SH3__) || defined (__SH3E__) || defined (__SH4__) || defined (__SH4_SINGLE__) || defined (__SH4_SINGLE_ONLY__) || defined (__SH4_NOFPU__)
+/* This code uses shld, thus is not suitable for SH1 / SH2.  */
+
+/* Signed / unsigned division without use of FPU, optimized for SH4-300.
+ Uses a lookup table for divisors in the range -128 .. +127, and
+ div1 with case distinction for larger divisors in three more ranges.
+ The code is lumped together with the table to allow the use of mova. */
+#ifdef __LITTLE_ENDIAN__
+#define L_LSB 0
+#define L_LSWMSB 1
+#define L_MSWLSB 2
+#else
+#define L_LSB 3
+#define L_LSWMSB 2
+#define L_MSWLSB 1
+#endif
+
+ .global GLOBAL(udivsi3_i4i)
+ .global GLOBAL(sdivsi3_i4i)
+ FUNC(GLOBAL(udivsi3_i4i))
+ FUNC(GLOBAL(sdivsi3_i4i))
+
+ .balign 4
+LOCAL(div_ge8m): ! 10 cycles up to here
+ rotcr r1 ! signed shift must use original sign from r4
+ div0s r5,r4
+ mov #24,r7
+ shld r7,r6
+ shad r0,r1
+ rotcl r6
+ div1 r5,r1
+ swap.w r5,r0 ! detect -0x80000000 : 0x800000
+ rotcl r6
+ swap.w r4,r7
+ div1 r5,r1
+ swap.b r7,r7
+ rotcl r6
+ or r7,r0
+ div1 r5,r1
+ swap.w r0,r7
+ rotcl r6
+ or r7,r0
+ div1 r5,r1
+ add #-0x80,r0
+ rotcl r6
+ extu.w r0,r0
+ div1 r5,r1
+ neg r0,r0
+ rotcl r6
+ swap.w r0,r0
+ div1 r5,r1
+ mov.l @r15+,r7
+ and r6,r0
+ rotcl r6
+ div1 r5,r1
+ shll2 r0
+ rotcl r6
+ exts.b r0,r0
+ div1 r5,r1
+ swap.w r0,r0
+ exts.w r0,r1
+ exts.b r6,r0
+ mov.l @r15+,r6
+ rotcl r0
+ rts
+ sub r1,r0
+ ! 31 cycles up to here
+
+ .balign 4
+LOCAL(udiv_ge64k): ! 3 cycles up to here
+ mov r4,r0
+ shlr8 r0
+ div0u
+ cmp/hi r0,r5
+ bt LOCAL(udiv_r8)
+ mov.l r5,@-r15
+ shll8 r5
+ ! 7 cycles up to here
+ .rept 8
+ div1 r5,r0
+ .endr
+ extu.b r4,r1 ! 15 cycles up to here
+ extu.b r0,r6
+ xor r1,r0
+ xor r6,r0
+ swap.b r6,r6
+ .rept 8
+ div1 r5,r0
+ .endr ! 25 cycles up to here
+ extu.b r0,r0
+ mov.l @r15+,r5
+ or r6,r0
+ mov.l @r15+,r6
+ rts
+ rotcl r0 ! 28 cycles up to here
+
+ .balign 4
+LOCAL(udiv_r8): ! 6 cycles up to here
+ mov.l r4,@-r15
+ shll16 r4
+ shll8 r4
+ !
+ shll r4
+ mov r0,r1
+ div1 r5,r1
+ mov r4,r0
+ rotcl r0
+ mov.l @r15+,r4
+ div1 r5,r1
+ ! 12 cycles up to here
+ .rept 6
+ rotcl r0; div1 r5,r1
+ .endr
+ mov.l @r15+,r6 ! 24 cycles up to here
+ rts
+ rotcl r0
+
+ .balign 4
+LOCAL(div_ge32k): ! 6 cycles up to here
+ mov.l r7,@-r15
+ swap.w r5,r6
+ exts.b r6,r7
+ exts.w r6,r6
+ cmp/eq r6,r7
+ extu.b r1,r6
+ bf/s LOCAL(div_ge8m)
+ cmp/hi r1,r4 ! copy sign bit of r4 into T
+ rotcr r1 ! signed shift must use original sign from r4
+ div0s r5,r4
+ shad r0,r1
+ shll8 r5
+ div1 r5,r1
+ mov r5,r7 ! detect r4 == 0x80000000 && r5 == 0x8000(00)
+ div1 r5,r1
+ shlr8 r7
+ div1 r5,r1
+ swap.w r4,r0
+ div1 r5,r1
+ swap.b r0,r0
+ div1 r5,r1
+ or r0,r7
+ div1 r5,r1
+ add #-80,r7
+ div1 r5,r1
+ swap.w r7,r0
+ div1 r5,r1
+ or r0,r7
+ extu.b r1,r0
+ xor r6,r1
+ xor r0,r1
+ exts.b r0,r0
+ div1 r5,r1
+ extu.w r7,r7
+ div1 r5,r1
+ neg r7,r7 ! upper 16 bit of r7 == 0 if r4 == 0x80000000 && r5 == 0x8000
+ div1 r5,r1
+ and r0,r7
+ div1 r5,r1
+ swap.w r7,r7 ! 26 cycles up to here.
+ div1 r5,r1
+ shll8 r0
+ div1 r5,r1
+ exts.w r7,r7
+ div1 r5,r1
+ add r0,r0
+ div1 r5,r1
+ sub r7,r0
+ extu.b r1,r1
+ mov.l @r15+,r7
+ rotcl r1
+ mov.l @r15+,r6
+ add r1,r0
+ mov #-8,r1
+ rts
+ shad r1,r5 ! 34 cycles up to here
+
+ .balign 4
+GLOBAL(udivsi3_i4i):
+ mov.l r6,@-r15
+ extu.w r5,r6
+ cmp/eq r5,r6
+ mov #0x7f,r0
+ bf LOCAL(udiv_ge64k)
+ cmp/hi r0,r5
+ bf LOCAL(udiv_le128)
+ mov r4,r1
+ shlr8 r1
+ div0u
+ shlr r1
+ shll16 r6
+ div1 r6,r1
+ extu.b r4,r0 ! 7 cycles up to here
+ .rept 8
+ div1 r6,r1
+ .endr ! 15 cycles up to here
+ xor r1,r0 ! xor dividend with result lsb
+ .rept 6
+ div1 r6,r1
+ .endr
+ mov.l r7,@-r15 ! 21 cycles up to here
+ div1 r6,r1
+ extu.b r0,r7
+ div1 r6,r1
+ shll8 r7
+ extu.w r1,r0
+ xor r7,r1 ! replace lsb of result with lsb of dividend
+ div1 r6,r1
+ mov #0,r7
+ div1 r6,r1
+ !
+ div1 r6,r1
+ bra LOCAL(div_end)
+ div1 r6,r1 ! 28 cycles up to here
+
+ /* This is link-compatible with a GLOBAL(sdivsi3) call,
+ but we effectively clobber only r1, macl and mach */
+ /* Because negative quotients are calculated as one's complements,
+ -0x80000000 divided by the smallest positive number of a number
+ range (0x80, 0x8000, 0x800000) causes saturation in the one's
+ complement representation, and we have to suppress the
+ one's -> two's complement adjustment. Since positive numbers
+ don't get such an adjustment, it's OK to also compute one's -> two's
+ complement adjustment suppression for a dividend of 0. */
+ .balign 4
+GLOBAL(sdivsi3_i4i):
+ mov.l r6,@-r15
+ exts.b r5,r6
+ cmp/eq r5,r6
+ mov #-1,r1
+ bt/s LOCAL(div_le128)
+ cmp/pz r4
+ addc r4,r1
+ exts.w r5,r6
+ cmp/eq r5,r6
+ mov #-7,r0
+ bf/s LOCAL(div_ge32k)
+ cmp/hi r1,r4 ! copy sign bit of r4 into T
+ rotcr r1
+ shll16 r6 ! 7 cycles up to here
+ shad r0,r1
+ div0s r5,r4
+ div1 r6,r1
+ mov.l r7,@-r15
+ div1 r6,r1
+ mov r4,r0 ! re-compute adjusted dividend
+ div1 r6,r1
+ mov #-31,r7
+ div1 r6,r1
+ shad r7,r0
+ div1 r6,r1
+ add r4,r0 ! adjusted dividend
+ div1 r6,r1
+ mov.l r8,@-r15
+ div1 r6,r1
+ swap.w r4,r8 ! detect special case r4 = 0x80000000, r5 = 0x80
+ div1 r6,r1
+ swap.b r8,r8
+ xor r1,r0 ! xor dividend with result lsb
+ div1 r6,r1
+ div1 r6,r1
+ or r5,r8
+ div1 r6,r1
+ add #-0x80,r8 ! r8 is 0 iff there is a match
+ div1 r6,r1
+ swap.w r8,r7 ! or upper 16 bits...
+ div1 r6,r1
+ or r7,r8 !...into lower 16 bits
+ div1 r6,r1
+ extu.w r8,r8
+ div1 r6,r1
+ extu.b r0,r7
+ div1 r6,r1
+ shll8 r7
+ exts.w r1,r0
+ xor r7,r1 ! replace lsb of result with lsb of dividend
+ div1 r6,r1
+ neg r8,r8 ! upper 16 bits of r8 are now 0xffff iff we want end adjm.
+ div1 r6,r1
+ and r0,r8
+ div1 r6,r1
+ swap.w r8,r7
+ div1 r6,r1
+ mov.l @r15+,r8 ! 58 insns, 29 cycles up to here
+LOCAL(div_end):
+ div1 r6,r1
+ shll8 r0
+ div1 r6,r1
+ exts.w r7,r7
+ div1 r6,r1
+ add r0,r0
+ div1 r6,r1
+ sub r7,r0
+ extu.b r1,r1
+ mov.l @r15+,r7
+ rotcl r1
+ mov.l @r15+,r6
+ rts
+ add r1,r0
+
+ .balign 4
+LOCAL(udiv_le128): ! 4 cycles up to here (or 7 for mispredict)
+ mova LOCAL(div_table_inv),r0
+ shll2 r6
+ mov.l @(r0,r6),r1
+ mova LOCAL(div_table_clz),r0
+ lds r4,mach
+ !
+ !
+ !
+ tst r1,r1
+ !
+ bt 0f
+ dmulu.l r1,r4
+0: mov.b @(r0,r5),r1
+ clrt
+ !
+ !
+ sts mach,r0
+ addc r4,r0
+ rotcr r0
+ mov.l @r15+,r6
+ rts
+ shld r1,r0
+
+ .balign 4
+LOCAL(div_le128): ! 3 cycles up to here (or 6 for mispredict)
+ mova LOCAL(div_table_inv),r0
+ shll2 r6
+ mov.l @(r0,r6),r1
+ mova LOCAL(div_table_clz),r0
+ neg r4,r6
+ bf 0f
+ mov r4,r6
+0: lds r6,mach
+ tst r1,r1
+ bt 0f
+ dmulu.l r1,r6
+0: div0s r4,r5
+ mov.b @(r0,r5),r1
+ bt/s LOCAL(le128_neg)
+ clrt
+ !
+ sts mach,r0
+ addc r6,r0
+ rotcr r0
+ mov.l @r15+,r6
+ rts
+ shld r1,r0
+
+/* Could trap divide by zero for the cost of one cycle more mispredict penalty:
+...
+ dmulu.l r1,r6
+0: div0s r4,r5
+ bt/s LOCAL(le128_neg)
+ tst r5,r5
+ bt LOCAL(div_by_zero)
+ mov.b @(r0,r5),r1
+ sts mach,r0
+ addc r6,r0
+...
+LOCAL(div_by_zero):
+ trapa #
+ .balign 4
+LOCAL(le128_neg):
+ bt LOCAL(div_by_zero)
+ mov.b @(r0,r5),r1
+ sts mach,r0
+ addc r6,r0
+... */
+
+ .balign 4
+LOCAL(le128_neg):
+ sts mach,r0
+ addc r6,r0
+ rotcr r0
+ mov.l @r15+,r6
+ shad r1,r0
+ rts
+ neg r0,r0
+ ENDFUNC(GLOBAL(udivsi3_i4i))
+ ENDFUNC(GLOBAL(sdivsi3_i4i))
+
+/* This table has been generated by divtab-sh4-300.c.  */
+ .balign 4
+ .byte -7
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -4
+ .byte -4
+ .byte -4
+ .byte -4
+ .byte -4
+ .byte -4
+ .byte -4
+ .byte -4
+ .byte -4
+ .byte -4
+ .byte -4
+ .byte -4
+ .byte -4
+ .byte -4
+ .byte -4
+ .byte -4
+ .byte -3
+ .byte -3
+ .byte -3
+ .byte -3
+ .byte -3
+ .byte -3
+ .byte -3
+ .byte -3
+ .byte -2
+ .byte -2
+ .byte -2
+ .byte -2
+ .byte -1
+ .byte -1
+ .byte 0
+LOCAL(div_table_clz):
+ .byte 0
+ .byte 0
+ .byte -1
+ .byte -1
+ .byte -2
+ .byte -2
+ .byte -2
+ .byte -2
+ .byte -3
+ .byte -3
+ .byte -3
+ .byte -3
+ .byte -3
+ .byte -3
+ .byte -3
+ .byte -3
+ .byte -4
+ .byte -4
+ .byte -4
+ .byte -4
+ .byte -4
+ .byte -4
+ .byte -4
+ .byte -4
+ .byte -4
+ .byte -4
+ .byte -4
+ .byte -4
+ .byte -4
+ .byte -4
+ .byte -4
+ .byte -4
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+/* 1/-128 .. 1/127, normalized. There is an implicit leading 1 in bit 32,
+ or in bit 33 for powers of two. */
+ .balign 4
+ .long 0x0
+ .long 0x2040811
+ .long 0x4104105
+ .long 0x624DD30
+ .long 0x8421085
+ .long 0xA6810A7
+ .long 0xC9714FC
+ .long 0xECF56BF
+ .long 0x11111112
+ .long 0x135C8114
+ .long 0x15B1E5F8
+ .long 0x18118119
+ .long 0x1A7B9612
+ .long 0x1CF06ADB
+ .long 0x1F7047DD
+ .long 0x21FB7813
+ .long 0x24924925
+ .long 0x27350B89
+ .long 0x29E4129F
+ .long 0x2C9FB4D9
+ .long 0x2F684BDB
+ .long 0x323E34A3
+ .long 0x3521CFB3
+ .long 0x38138139
+ .long 0x3B13B13C
+ .long 0x3E22CBCF
+ .long 0x41414142
+ .long 0x446F8657
+ .long 0x47AE147B
+ .long 0x4AFD6A06
+ .long 0x4E5E0A73
+ .long 0x51D07EAF
+ .long 0x55555556
+ .long 0x58ED2309
+ .long 0x5C9882BA
+ .long 0x60581606
+ .long 0x642C8591
+ .long 0x68168169
+ .long 0x6C16C16D
+ .long 0x702E05C1
+ .long 0x745D1746
+ .long 0x78A4C818
+ .long 0x7D05F418
+ .long 0x81818182
+ .long 0x86186187
+ .long 0x8ACB90F7
+ .long 0x8F9C18FA
+ .long 0x948B0FCE
+ .long 0x9999999A
+ .long 0x9EC8E952
+ .long 0xA41A41A5
+ .long 0xA98EF607
+ .long 0xAF286BCB
+ .long 0xB4E81B4F
+ .long 0xBACF914D
+ .long 0xC0E07039
+ .long 0xC71C71C8
+ .long 0xCD856891
+ .long 0xD41D41D5
+ .long 0xDAE6076C
+ .long 0xE1E1E1E2
+ .long 0xE9131AC0
+ .long 0xF07C1F08
+ .long 0xF81F81F9
+ .long 0x0
+ .long 0x4104105
+ .long 0x8421085
+ .long 0xC9714FC
+ .long 0x11111112
+ .long 0x15B1E5F8
+ .long 0x1A7B9612
+ .long 0x1F7047DD
+ .long 0x24924925
+ .long 0x29E4129F
+ .long 0x2F684BDB
+ .long 0x3521CFB3
+ .long 0x3B13B13C
+ .long 0x41414142
+ .long 0x47AE147B
+ .long 0x4E5E0A73
+ .long 0x55555556
+ .long 0x5C9882BA
+ .long 0x642C8591
+ .long 0x6C16C16D
+ .long 0x745D1746
+ .long 0x7D05F418
+ .long 0x86186187
+ .long 0x8F9C18FA
+ .long 0x9999999A
+ .long 0xA41A41A5
+ .long 0xAF286BCB
+ .long 0xBACF914D
+ .long 0xC71C71C8
+ .long 0xD41D41D5
+ .long 0xE1E1E1E2
+ .long 0xF07C1F08
+ .long 0x0
+ .long 0x8421085
+ .long 0x11111112
+ .long 0x1A7B9612
+ .long 0x24924925
+ .long 0x2F684BDB
+ .long 0x3B13B13C
+ .long 0x47AE147B
+ .long 0x55555556
+ .long 0x642C8591
+ .long 0x745D1746
+ .long 0x86186187
+ .long 0x9999999A
+ .long 0xAF286BCB
+ .long 0xC71C71C8
+ .long 0xE1E1E1E2
+ .long 0x0
+ .long 0x11111112
+ .long 0x24924925
+ .long 0x3B13B13C
+ .long 0x55555556
+ .long 0x745D1746
+ .long 0x9999999A
+ .long 0xC71C71C8
+ .long 0x0
+ .long 0x24924925
+ .long 0x55555556
+ .long 0x9999999A
+ .long 0x0
+ .long 0x55555556
+ .long 0x0
+ .long 0x0
+LOCAL(div_table_inv):
+ .long 0x0
+ .long 0x0
+ .long 0x0
+ .long 0x55555556
+ .long 0x0
+ .long 0x9999999A
+ .long 0x55555556
+ .long 0x24924925
+ .long 0x0
+ .long 0xC71C71C8
+ .long 0x9999999A
+ .long 0x745D1746
+ .long 0x55555556
+ .long 0x3B13B13C
+ .long 0x24924925
+ .long 0x11111112
+ .long 0x0
+ .long 0xE1E1E1E2
+ .long 0xC71C71C8
+ .long 0xAF286BCB
+ .long 0x9999999A
+ .long 0x86186187
+ .long 0x745D1746
+ .long 0x642C8591
+ .long 0x55555556
+ .long 0x47AE147B
+ .long 0x3B13B13C
+ .long 0x2F684BDB
+ .long 0x24924925
+ .long 0x1A7B9612
+ .long 0x11111112
+ .long 0x8421085
+ .long 0x0
+ .long 0xF07C1F08
+ .long 0xE1E1E1E2
+ .long 0xD41D41D5
+ .long 0xC71C71C8
+ .long 0xBACF914D
+ .long 0xAF286BCB
+ .long 0xA41A41A5
+ .long 0x9999999A
+ .long 0x8F9C18FA
+ .long 0x86186187
+ .long 0x7D05F418
+ .long 0x745D1746
+ .long 0x6C16C16D
+ .long 0x642C8591
+ .long 0x5C9882BA
+ .long 0x55555556
+ .long 0x4E5E0A73
+ .long 0x47AE147B
+ .long 0x41414142
+ .long 0x3B13B13C
+ .long 0x3521CFB3
+ .long 0x2F684BDB
+ .long 0x29E4129F
+ .long 0x24924925
+ .long 0x1F7047DD
+ .long 0x1A7B9612
+ .long 0x15B1E5F8
+ .long 0x11111112
+ .long 0xC9714FC
+ .long 0x8421085
+ .long 0x4104105
+ .long 0x0
+ .long 0xF81F81F9
+ .long 0xF07C1F08
+ .long 0xE9131AC0
+ .long 0xE1E1E1E2
+ .long 0xDAE6076C
+ .long 0xD41D41D5
+ .long 0xCD856891
+ .long 0xC71C71C8
+ .long 0xC0E07039
+ .long 0xBACF914D
+ .long 0xB4E81B4F
+ .long 0xAF286BCB
+ .long 0xA98EF607
+ .long 0xA41A41A5
+ .long 0x9EC8E952
+ .long 0x9999999A
+ .long 0x948B0FCE
+ .long 0x8F9C18FA
+ .long 0x8ACB90F7
+ .long 0x86186187
+ .long 0x81818182
+ .long 0x7D05F418
+ .long 0x78A4C818
+ .long 0x745D1746
+ .long 0x702E05C1
+ .long 0x6C16C16D
+ .long 0x68168169
+ .long 0x642C8591
+ .long 0x60581606
+ .long 0x5C9882BA
+ .long 0x58ED2309
+ .long 0x55555556
+ .long 0x51D07EAF
+ .long 0x4E5E0A73
+ .long 0x4AFD6A06
+ .long 0x47AE147B
+ .long 0x446F8657
+ .long 0x41414142
+ .long 0x3E22CBCF
+ .long 0x3B13B13C
+ .long 0x38138139
+ .long 0x3521CFB3
+ .long 0x323E34A3
+ .long 0x2F684BDB
+ .long 0x2C9FB4D9
+ .long 0x29E4129F
+ .long 0x27350B89
+ .long 0x24924925
+ .long 0x21FB7813
+ .long 0x1F7047DD
+ .long 0x1CF06ADB
+ .long 0x1A7B9612
+ .long 0x18118119
+ .long 0x15B1E5F8
+ .long 0x135C8114
+ .long 0x11111112
+ .long 0xECF56BF
+ .long 0xC9714FC
+ .long 0xA6810A7
+ .long 0x8421085
+ .long 0x624DD30
+ .long 0x4104105
+ .long 0x2040811
+ /* maximum error: 0.987342 scaled: 0.921875*/
+
+#endif /* SH3 / SH4 */
+
+#endif /* L_div_table */
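The fast path for divisors in [1,127] (LOCAL(udiv_le128) above) is a multiply-by-reciprocal: the high word of inv * dividend is recombined with the dividend itself (the reciprocal's implicit leading one), halved through the carry by rotcr, and shifted right by the div_table_clz byte via shld with a negative count. A hypothetical C model of that sequence, assuming inv[] and shift[] mirror the table entries above (powers of two store inv == 0 and skip the multiply, exactly as the tst/bt pair does; mach was preloaded with the dividend for that case):

    #include <stdint.h>

    /* Model of LOCAL(udiv_le128); inv[d] / shift[d] stand for the
       div_table_inv and div_table_clz entries for 1 <= d <= 127.  */
    extern const uint32_t inv[128];
    extern const int8_t shift[128];

    static uint32_t
    udiv_le128 (uint32_t n, uint32_t d)
    {
      /* dmulu.l: high 32 bits of inv * n; when inv == 0 the multiply
         is skipped and mach still holds the dividend.  */
      uint64_t hi = inv[d] ? ((uint64_t) inv[d] * n) >> 32 : n;
      /* addc + rotcr: fold in the implicit leading one of the
         reciprocal and halve the 33-bit sum through the carry bit.  */
      uint32_t q = (uint32_t) ((hi + n) >> 1);
      /* shld with a negative count is a logical right shift.  */
      return q >> -shift[d];
    }

For example, n = 100, d = 3 gives hi = 33, q = 133 >> 1 = 66, and 66 >> 1 = 33. The signed entry point additionally applies the one's-complement adjustment described in the sdivsi3_i4i comment and is not modeled here.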
diff --git a/gcc/config/sh/sh-protos.h b/gcc/config/sh/sh-protos.h
index a0661545b56..e142b1cee68 100644
--- a/gcc/config/sh/sh-protos.h
+++ b/gcc/config/sh/sh-protos.h
@@ -1,6 +1,6 @@
/* Definitions of target machine for GNU compiler for Renesas / SuperH SH.
Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2003,
- 2004, 2005
+ 2004, 2005, 2006
Free Software Foundation, Inc.
Contributed by Steve Chamberlain (sac@cygnus.com).
Improved by Jim Wilson (wilson@cygnus.com).
@@ -69,6 +69,10 @@ extern void print_operand (FILE *, rtx, int);
extern void output_pic_addr_const (FILE *, rtx);
extern int expand_block_move (rtx *);
extern int prepare_move_operands (rtx[], enum machine_mode mode);
+extern enum rtx_code prepare_cbranch_operands (rtx *, enum machine_mode mode,
+ enum rtx_code comparison);
+extern void expand_cbranchsi4 (rtx *operands, enum rtx_code comparison, int);
+extern bool expand_cbranchdi4 (rtx *operands, enum rtx_code comparison);
extern void from_compare (rtx *, int);
extern int shift_insns_rtx (rtx);
extern void gen_ashift (int, int, rtx);
diff --git a/gcc/config/sh/sh.c b/gcc/config/sh/sh.c
index 30b87480412..9f733b852f1 100644
--- a/gcc/config/sh/sh.c
+++ b/gcc/config/sh/sh.c
@@ -526,10 +526,15 @@ sh_handle_option (size_t code, const char *arg ATTRIBUTE_UNUSED,
case OPT_m4:
case OPT_m4_100:
case OPT_m4_200:
+ case OPT_m4_300:
target_flags = (target_flags & ~MASK_ARCH) | SELECT_SH4;
return true;
case OPT_m4_nofpu:
+ case OPT_m4_100_nofpu:
+ case OPT_m4_200_nofpu:
+ case OPT_m4_300_nofpu:
+ case OPT_m4_340:
case OPT_m4_400:
case OPT_m4_500:
target_flags = (target_flags & ~MASK_ARCH) | SELECT_SH4_NOFPU;
@@ -538,12 +543,14 @@ sh_handle_option (size_t code, const char *arg ATTRIBUTE_UNUSED,
case OPT_m4_single:
case OPT_m4_100_single:
case OPT_m4_200_single:
+ case OPT_m4_300_single:
target_flags = (target_flags & ~MASK_ARCH) | SELECT_SH4_SINGLE;
return true;
case OPT_m4_single_only:
case OPT_m4_100_single_only:
case OPT_m4_200_single_only:
+ case OPT_m4_300_single_only:
target_flags = (target_flags & ~MASK_ARCH) | SELECT_SH4_SINGLE_ONLY;
return true;
@@ -1341,6 +1348,288 @@ prepare_move_operands (rtx operands[], enum machine_mode mode)
return 0;
}
+enum rtx_code
+prepare_cbranch_operands (rtx *operands, enum machine_mode mode,
+ enum rtx_code comparison)
+{
+ rtx op1;
+ rtx scratch = NULL_RTX;
+
+ if (comparison == CODE_FOR_nothing)
+ comparison = GET_CODE (operands[0]);
+ else
+ scratch = operands[4];
+ if (GET_CODE (operands[1]) == CONST_INT
+ && GET_CODE (operands[2]) != CONST_INT)
+ {
+ rtx tmp = operands[1];
+
+ operands[1] = operands[2];
+ operands[2] = tmp;
+ comparison = swap_condition (comparison);
+ }
+ if (GET_CODE (operands[2]) == CONST_INT)
+ {
+ HOST_WIDE_INT val = INTVAL (operands[2]);
+ if ((val == -1 || val == -0x81)
+ && (comparison == GT || comparison == LE))
+ {
+ comparison = (comparison == GT) ? GE : LT;
+ operands[2] = gen_int_mode (val + 1, mode);
+ }
+ else if ((val == 1 || val == 0x80)
+ && (comparison == GE || comparison == LT))
+ {
+ comparison = (comparison == GE) ? GT : LE;
+ operands[2] = gen_int_mode (val - 1, mode);
+ }
+ else if (val == 1 && (comparison == GEU || comparison == LTU))
+ {
+ comparison = (comparison == GEU) ? NE : EQ;
+ operands[2] = CONST0_RTX (mode);
+ }
+ else if (val == 0x80 && (comparison == GEU || comparison == LTU))
+ {
+ comparison = (comparison == GEU) ? GTU : LEU;
+ operands[2] = gen_int_mode (val - 1, mode);
+ }
+ else if (val == 0 && (comparison == GTU || comparison == LEU))
+ comparison = (comparison == GTU) ? NE : EQ;
+ else if (mode == SImode
+ && ((val == 0x7fffffff
+ && (comparison == GTU || comparison == LEU))
+ || ((unsigned HOST_WIDE_INT) val
+ == (unsigned HOST_WIDE_INT) 0x7fffffff + 1
+ && (comparison == GEU || comparison == LTU))))
+ {
+ comparison = (comparison == GTU || comparison == GEU) ? LT : GE;
+ operands[2] = CONST0_RTX (mode);
+ }
+ }
+ op1 = operands[1];
+ if (!no_new_pseudos)
+ operands[1] = force_reg (mode, op1);
+ /* When we are handling DImode comparisons, we want to keep constants so
+ that we can optimize the component comparisons; however, memory loads
+ are better issued as a whole so that they can be scheduled well.
+ SImode equality comparisons allow I08 constants, but only when they
+ compare r0. Hence, if operands[1] has to be loaded from somewhere else
+ into a register, that register might as well be r0, and we allow the
+ constant. If it is already in a register, this is likely to be
+     allocated to a different hard register, thus we load the constant into
+ a register unless it is zero. */
+ if (!REG_P (operands[2])
+ && (GET_CODE (operands[2]) != CONST_INT
+ || (mode == SImode && operands[2] != CONST0_RTX (SImode)
+ && ((comparison != EQ && comparison != NE)
+ || (REG_P (op1) && REGNO (op1) != R0_REG)
+ || !CONST_OK_FOR_I08 (INTVAL (operands[2]))))))
+ {
+ if (scratch && GET_MODE (scratch) == mode)
+ {
+ emit_move_insn (scratch, operands[2]);
+ operands[2] = scratch;
+ }
+ else if (!no_new_pseudos)
+ operands[2] = force_reg (mode, operands[2]);
+ }
+ return comparison;
+}
+
+void
+expand_cbranchsi4 (rtx *operands, enum rtx_code comparison, int probability)
+{
+ rtx (*branch_expander) (rtx) = gen_branch_true;
+ rtx jump;
+
+ comparison = prepare_cbranch_operands (operands, SImode, comparison);
+ switch (comparison)
+ {
+ case NE: case LT: case LE: case LTU: case LEU:
+ comparison = reverse_condition (comparison);
+ branch_expander = gen_branch_false;
+ default: ;
+ }
+ emit_insn (gen_rtx_SET (VOIDmode, gen_rtx_REG (SImode, T_REG),
+ gen_rtx_fmt_ee (comparison, SImode,
+ operands[1], operands[2])));
+ jump = emit_jump_insn (branch_expander (operands[3]));
+ if (probability >= 0)
+ REG_NOTES (jump)
+ = gen_rtx_EXPR_LIST (REG_BR_PROB, GEN_INT (probability),
+ REG_NOTES (jump));
+
+}
+
+/* ??? How should we distribute probabilities when more than one branch
+   is generated?  So far we only have some ad-hoc observations:
+ - If the operands are random, they are likely to differ in both parts.
+ - If comparing items in a hash chain, the operands are random or equal;
+ operation should be EQ or NE.
+ - If items are searched in an ordered tree from the root, we can expect
+ the highpart to be unequal about half of the time; operation should be
+ an unequality comparison, operands non-constant, and overall probability
+ about 50%. Likewise for quicksort.
+ - Range checks will be often made against constants. Even if we assume for
+ simplicity an even distribution of the non-constant operand over a
+ sub-range here, the same probability could be generated with differently
+ wide sub-ranges - as long as the ratio of the part of the subrange that
+ is before the threshold to the part that comes after the threshold stays
+ the same. Thus, we can't really tell anything here;
+ assuming random distribution is at least simple.
+ */
+
+bool
+expand_cbranchdi4 (rtx *operands, enum rtx_code comparison)
+{
+ enum rtx_code msw_taken, msw_skip, lsw_taken;
+ rtx skip_label;
+ rtx op1h, op1l, op2h, op2l;
+ int num_branches;
+ int prob, rev_prob;
+ int msw_taken_prob = -1, msw_skip_prob = -1, lsw_taken_prob = -1;
+
+ comparison = prepare_cbranch_operands (operands, DImode, comparison);
+ op1h = gen_highpart_mode (SImode, DImode, operands[1]);
+ op2h = gen_highpart_mode (SImode, DImode, operands[2]);
+ op1l = gen_lowpart (SImode, operands[1]);
+ op2l = gen_lowpart (SImode, operands[2]);
+ msw_taken = msw_skip = lsw_taken = CODE_FOR_nothing;
+ prob = split_branch_probability;
+ rev_prob = REG_BR_PROB_BASE - prob;
+ switch (comparison)
+ {
+ /* ??? Should we use the cmpeqdi_t pattern for equality comparisons?
+ That costs 1 cycle more when the first branch can be predicted taken,
+ but saves us mispredicts because only one branch needs prediction.
+ It also enables generating the cmpeqdi_t-1 pattern. */
+ case EQ:
+ if (TARGET_CMPEQDI_T)
+ {
+ emit_insn (gen_cmpeqdi_t (operands[1], operands[2]));
+ emit_jump_insn (gen_branch_true (operands[3]));
+ return true;
+ }
+ msw_skip = NE;
+ lsw_taken = EQ;
+ if (prob >= 0)
+ {
+	  /* If we had more precision, we'd use rev_prob - (rev_prob >> 32).  */
+ msw_skip_prob = rev_prob;
+ if (REG_BR_PROB_BASE <= 65535)
+ lsw_taken_prob = prob ? REG_BR_PROB_BASE : 0;
+ else
+ {
+ gcc_assert (HOST_BITS_PER_WIDEST_INT >= 64);
+ lsw_taken_prob
+ = (prob
+ ? (REG_BR_PROB_BASE
+ - ((HOST_WIDEST_INT) REG_BR_PROB_BASE * rev_prob
+ / ((HOST_WIDEST_INT) prob << 32)))
+ : 0);
+ }
+ }
+ break;
+ case NE:
+ if (TARGET_CMPEQDI_T)
+ {
+ emit_insn (gen_cmpeqdi_t (operands[1], operands[2]));
+ emit_jump_insn (gen_branch_false (operands[3]));
+ return true;
+ }
+ msw_taken = NE;
+      msw_taken_prob = prob;
+ lsw_taken = NE;
+ lsw_taken_prob = 0;
+ break;
+ case GTU: case GT:
+ msw_taken = comparison;
+ if (GET_CODE (op2l) == CONST_INT && INTVAL (op2l) == -1)
+ break;
+ if (comparison != GTU || op2h != CONST0_RTX (SImode))
+ msw_skip = swap_condition (msw_taken);
+ lsw_taken = GTU;
+ break;
+ case GEU: case GE:
+ if (op2l == CONST0_RTX (SImode))
+ msw_taken = comparison;
+ else
+ {
+ msw_taken = comparison == GE ? GT : GTU;
+ msw_skip = swap_condition (msw_taken);
+ lsw_taken = GEU;
+ }
+ break;
+ case LTU: case LT:
+ msw_taken = comparison;
+ if (op2l == CONST0_RTX (SImode))
+ break;
+ msw_skip = swap_condition (msw_taken);
+ lsw_taken = LTU;
+ break;
+ case LEU: case LE:
+ if (GET_CODE (op2l) == CONST_INT && INTVAL (op2l) == -1)
+ msw_taken = comparison;
+ else
+ {
+ lsw_taken = LEU;
+ if (comparison == LE)
+ msw_taken = LT;
+ else if (op2h != CONST0_RTX (SImode))
+ msw_taken = LTU;
+ else
+ break;
+ msw_skip = swap_condition (msw_taken);
+ }
+ break;
+ default: return false;
+ }
+ num_branches = ((msw_taken != CODE_FOR_nothing)
+ + (msw_skip != CODE_FOR_nothing)
+ + (lsw_taken != CODE_FOR_nothing));
+ if (comparison != EQ && comparison != NE && num_branches > 1)
+ {
+ if (!CONSTANT_P (operands[2])
+ && prob >= (int) (REG_BR_PROB_BASE * 3 / 8U)
+ && prob <= (int) (REG_BR_PROB_BASE * 5 / 8U))
+ {
+ msw_taken_prob = prob / 2U;
+ msw_skip_prob
+ = REG_BR_PROB_BASE * rev_prob / (REG_BR_PROB_BASE + rev_prob);
+ lsw_taken_prob = prob;
+ }
+ else
+ {
+ msw_taken_prob = prob;
+ msw_skip_prob = REG_BR_PROB_BASE;
+ /* ??? If we have a constant op2h, should we use that when
+ calculating lsw_taken_prob? */
+ lsw_taken_prob = prob;
+ }
+ }
+ operands[1] = op1h;
+ operands[2] = op2h;
+ operands[4] = NULL_RTX;
+ if (msw_taken != CODE_FOR_nothing)
+ expand_cbranchsi4 (operands, msw_taken, msw_taken_prob);
+ if (msw_skip != CODE_FOR_nothing)
+ {
+ rtx taken_label = operands[3];
+
+ operands[3] = skip_label = gen_label_rtx ();
+ expand_cbranchsi4 (operands, msw_skip, msw_skip_prob);
+ operands[3] = taken_label;
+ }
+ operands[1] = op1l;
+ operands[2] = op2l;
+ if (lsw_taken != CODE_FOR_nothing)
+ expand_cbranchsi4 (operands, lsw_taken, lsw_taken_prob);
+ if (msw_skip != CODE_FOR_nothing)
+ emit_label (skip_label);
+ return true;
+}
+
/* Prepare the operands for an scc instruction; make sure that the
compare has been done. */
rtx
@@ -1723,6 +2012,12 @@ output_branch (int logic, rtx insn, rtx *operands)
}
}
+/* Output a code sequence for INSN using TEMPLATE with OPERANDS; but before,
+   fill in operand 9 as a label to the successor insn.
+   We try to use jump threading where possible.
+   If CODE matches the comparison in the IF_THEN_ELSE of a following jump,
+ we assume the jump is taken. I.e. EQ means follow jmp and bf, NE means
+ follow jmp and bt, if the address is in range. */
const char *
output_branchy_insn (enum rtx_code code, const char *template,
rtx insn, rtx *operands)
@@ -2117,6 +2412,15 @@ sh_rtx_costs (rtx x, int code, int outer_code, int *total)
else if ((outer_code == AND || outer_code == IOR || outer_code == XOR)
&& CONST_OK_FOR_K08 (INTVAL (x)))
*total = 1;
+      /* prepare_cmp_insn will force costly constants into registers before
+	 the cbranch[sd]i4 patterns can see them, so preserve potentially
+ interesting ones not covered by I08 above. */
+ else if (outer_code == COMPARE
+ && ((unsigned HOST_WIDE_INT) INTVAL (x)
+ == (unsigned HOST_WIDE_INT) 0x7fffffff + 1
+ || INTVAL (x) == 0x7fffffff
+ || INTVAL (x) == 0x80 || INTVAL (x) == -0x81))
+ *total = 1;
else
*total = 8;
return true;
@@ -2135,6 +2439,11 @@ sh_rtx_costs (rtx x, int code, int outer_code, int *total)
case CONST_DOUBLE:
if (TARGET_SHMEDIA)
*total = COSTS_N_INSNS (4);
+      /* prepare_cmp_insn will force costly constants into registers before
+	 the cbranchdi4 pattern can see them, so preserve potentially
+ interesting ones. */
+ else if (outer_code == COMPARE && GET_MODE (x) == DImode)
+ *total = 1;
else
*total = 10;
return true;
@@ -8571,23 +8880,32 @@ sh_adjust_cost (rtx insn, rtx link ATTRIBUTE_UNUSED, rtx dep_insn, int cost)
}
else if (REG_NOTE_KIND (link) == 0)
{
- enum attr_type dep_type, type;
+ enum attr_type type;
+ rtx dep_set;
if (recog_memoized (insn) < 0
|| recog_memoized (dep_insn) < 0)
return cost;
- dep_type = get_attr_type (dep_insn);
- if (dep_type == TYPE_FLOAD || dep_type == TYPE_PCFLOAD)
- cost--;
- if ((dep_type == TYPE_LOAD_SI || dep_type == TYPE_PCLOAD_SI)
- && (type = get_attr_type (insn)) != TYPE_CALL
- && type != TYPE_SFUNC)
- cost--;
+ dep_set = single_set (dep_insn);
+ /* The latency that we specify in the scheduling description refers
+ to the actual output, not to an auto-increment register; for that,
+ the latency is one. */
+ if (dep_set && MEM_P (SET_SRC (dep_set)) && cost > 1)
+ {
+ rtx set = single_set (insn);
+
+ if (set
+ && !reg_mentioned_p (SET_DEST (dep_set), SET_SRC (set))
+ && (!MEM_P (SET_DEST (set))
+ || !reg_mentioned_p (SET_DEST (dep_set),
+ XEXP (SET_DEST (set), 0))))
+ cost = 1;
+ }
/* The only input for a call that is timing-critical is the
function's address. */
- if (GET_CODE(insn) == CALL_INSN)
+ if (GET_CODE (insn) == CALL_INSN)
{
rtx call = PATTERN (insn);
@@ -8599,12 +8917,16 @@ sh_adjust_cost (rtx insn, rtx link ATTRIBUTE_UNUSED, rtx dep_insn, int cost)
/* sibcalli_thunk uses a symbol_ref in an unspec. */
&& (GET_CODE (XEXP (XEXP (call, 0), 0)) == UNSPEC
|| ! reg_set_p (XEXP (XEXP (call, 0), 0), dep_insn)))
- cost = 0;
+ cost -= TARGET_SH4_300 ? 3 : 6;
}
/* Likewise, the most timing critical input for an sfuncs call
is the function address. However, sfuncs typically start
using their arguments pretty quickly.
- Assume a four cycle delay before they are needed. */
+ Assume a four cycle delay for SH4 before they are needed.
+ Cached ST40-300 calls are quicker, so assume only a one
+ cycle delay there.
+ ??? Maybe we should encode the delays till input registers
+ are needed by sfuncs into the sfunc call insn. */
/* All sfunc calls are parallels with at least four components.
Exploit this to avoid unnecessary calls to sfunc_uses_reg. */
else if (GET_CODE (PATTERN (insn)) == PARALLEL
@@ -8612,50 +8934,83 @@ sh_adjust_cost (rtx insn, rtx link ATTRIBUTE_UNUSED, rtx dep_insn, int cost)
&& (reg = sfunc_uses_reg (insn)))
{
if (! reg_set_p (reg, dep_insn))
- cost -= 4;
- }
- /* When the preceding instruction loads the shift amount of
- the following SHAD/SHLD, the latency of the load is increased
- by 1 cycle. */
- else if (TARGET_SH4
- && get_attr_type (insn) == TYPE_DYN_SHIFT
- && get_attr_any_int_load (dep_insn) == ANY_INT_LOAD_YES
- && reg_overlap_mentioned_p (SET_DEST (PATTERN (dep_insn)),
- XEXP (SET_SRC (single_set (insn)),
- 1)))
- cost++;
- /* When an LS group instruction with a latency of less than
- 3 cycles is followed by a double-precision floating-point
- instruction, FIPR, or FTRV, the latency of the first
- instruction is increased to 3 cycles. */
- else if (cost < 3
- && get_attr_insn_class (dep_insn) == INSN_CLASS_LS_GROUP
- && get_attr_dfp_comp (insn) == DFP_COMP_YES)
- cost = 3;
- /* The lsw register of a double-precision computation is ready one
- cycle earlier. */
- else if (reload_completed
- && get_attr_dfp_comp (dep_insn) == DFP_COMP_YES
- && (use_pat = single_set (insn))
- && ! regno_use_in (REGNO (SET_DEST (single_set (dep_insn))),
- SET_SRC (use_pat)))
- cost -= 1;
-
- if (get_attr_any_fp_comp (dep_insn) == ANY_FP_COMP_YES
- && get_attr_late_fp_use (insn) == LATE_FP_USE_YES)
- cost -= 1;
+ cost -= TARGET_SH4_300 ? 1 : 4;
+ }
+ if (TARGET_HARD_SH4 && !TARGET_SH4_300)
+ {
+ enum attr_type dep_type = get_attr_type (dep_insn);
+
+ if (dep_type == TYPE_FLOAD || dep_type == TYPE_PCFLOAD)
+ cost--;
+ else if ((dep_type == TYPE_LOAD_SI || dep_type == TYPE_PCLOAD_SI)
+ && (type = get_attr_type (insn)) != TYPE_CALL
+ && type != TYPE_SFUNC)
+ cost--;
+ /* When the preceding instruction loads the shift amount of
+ the following SHAD/SHLD, the latency of the load is increased
+ by 1 cycle. */
+ if (get_attr_type (insn) == TYPE_DYN_SHIFT
+ && get_attr_any_int_load (dep_insn) == ANY_INT_LOAD_YES
+ && reg_overlap_mentioned_p (SET_DEST (PATTERN (dep_insn)),
+ XEXP (SET_SRC (single_set (insn)),
+ 1)))
+ cost++;
+ /* When an LS group instruction with a latency of less than
+ 3 cycles is followed by a double-precision floating-point
+ instruction, FIPR, or FTRV, the latency of the first
+ instruction is increased to 3 cycles. */
+ else if (cost < 3
+ && get_attr_insn_class (dep_insn) == INSN_CLASS_LS_GROUP
+ && get_attr_dfp_comp (insn) == DFP_COMP_YES)
+ cost = 3;
+ /* The lsw register of a double-precision computation is ready one
+ cycle earlier. */
+ else if (reload_completed
+ && get_attr_dfp_comp (dep_insn) == DFP_COMP_YES
+ && (use_pat = single_set (insn))
+ && ! regno_use_in (REGNO (SET_DEST (single_set (dep_insn))),
+ SET_SRC (use_pat)))
+ cost -= 1;
+
+ if (get_attr_any_fp_comp (dep_insn) == ANY_FP_COMP_YES
+ && get_attr_late_fp_use (insn) == LATE_FP_USE_YES)
+ cost -= 1;
+ }
+ else if (TARGET_SH4_300)
+ {
+ /* Stores need their input register two cycles later. */
+ if (dep_set && cost >= 1
+ && ((type = get_attr_type (insn)) == TYPE_STORE
+ || type == TYPE_PSTORE
+ || type == TYPE_FSTORE || type == TYPE_MAC_MEM))
+ {
+ rtx set = single_set (insn);
+
+ if (!reg_mentioned_p (SET_SRC (set), XEXP (SET_DEST (set), 0))
+ && rtx_equal_p (SET_SRC (set), SET_DEST (dep_set)))
+ {
+ cost -= 2;
+ /* But don't reduce the cost below 1 if the address depends
+ on a side effect of dep_insn. */
+ if (cost < 1
+ && modified_in_p (XEXP (SET_DEST (set), 0), dep_insn))
+ cost = 1;
+ }
+ }
+ }
}
/* An anti-dependence penalty of two applies if the first insn is a double
precision fadd / fsub / fmul. */
- else if (REG_NOTE_KIND (link) == REG_DEP_ANTI
+ else if (!TARGET_SH4_300
+ && REG_NOTE_KIND (link) == REG_DEP_ANTI
&& recog_memoized (dep_insn) >= 0
- && get_attr_type (dep_insn) == TYPE_DFP_ARITH
+ && (get_attr_type (dep_insn) == TYPE_DFP_ARITH
+ || get_attr_type (dep_insn) == TYPE_DFP_MUL)
/* A lot of alleged anti-flow dependences are fake,
so check this one is real. */
&& flow_dependent_p (dep_insn, insn))
cost = 2;
-
return cost;
}
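To make the control flow of expand_cbranchdi4 above concrete: for an unsigned 64-bit a > b against a non-constant rhs, it emits the msw_taken / msw_skip / lsw_taken triple so that at most one branch is taken on any input, which is what the sh4-300 branch-prediction comment in sh.md below is about. A hand-written illustration (not compiler output; the labels are ad hoc):

    #include <stdint.h>

    /* How an unsigned DImode "a > b" branch decomposes; a = ah:al, b = bh:bl.  */
    static int
    branch_gtu_di (uint32_t ah, uint32_t al, uint32_t bh, uint32_t bl)
    {
      if (ah > bh) goto taken;   /* msw_taken = GTU on the high words */
      if (ah < bh) goto skip;    /* msw_skip = swap_condition (GTU) = LTU */
      if (al > bl) goto taken;   /* lsw_taken: low words always unsigned */
     skip:
      return 0;
     taken:
      return 1;
    }

For signed GT the high-word tests use GT/LT while the low-word test stays GTU, matching the case GTU/GT arm of the switch.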
diff --git a/gcc/config/sh/sh.h b/gcc/config/sh/sh.h
index fc4e1f282a4..1b659c75135 100644
--- a/gcc/config/sh/sh.h
+++ b/gcc/config/sh/sh.h
@@ -274,6 +274,7 @@ do { \
#endif
#if SUPPORT_SH2
#define SUPPORT_SH3 1
+#define SUPPORT_SH2A_NOFPU 1
#endif
#if SUPPORT_SH3
#define SUPPORT_SH4_NOFPU 1
@@ -281,16 +282,17 @@ do { \
#if SUPPORT_SH4_NOFPU
#define SUPPORT_SH4A_NOFPU 1
#define SUPPORT_SH4AL 1
-#define SUPPORT_SH2A_NOFPU 1
#endif
#if SUPPORT_SH2E
#define SUPPORT_SH3E 1
+#define SUPPORT_SH2A_SINGLE_ONLY 1
#endif
#if SUPPORT_SH3E
#define SUPPORT_SH4_SINGLE_ONLY 1
+#endif
+#if SUPPORT_SH4_SINGLE_ONLY
#define SUPPORT_SH4A_SINGLE_ONLY 1
-#define SUPPORT_SH2A_SINGLE_ONLY 1
#endif
#if SUPPORT_SH4
@@ -469,6 +471,11 @@ do { \
target_flags |= MASK_SMALLCODE; \
sh_div_str = SH_DIV_STR_FOR_SIZE ; \
} \
+ else \
+ { \
+ TARGET_CBRANCHDI4 = 1; \
+ TARGET_EXPAND_CBRANCHDI4 = 1; \
+ } \
/* We can't meaningfully test TARGET_SHMEDIA here, because -m options \
haven't been parsed yet, hence we'd read only the default. \
sh_target_reg_class will return NO_REGS if this is not SHMEDIA, so \
@@ -608,6 +615,7 @@ do { \
else \
sh_div_strategy = SH_DIV_INV; \
} \
+ TARGET_CBRANCHDI4 = 0; \
} \
/* -fprofile-arcs needs a working libgcov . In unified tree \
configurations with newlib, this requires to configure with \
@@ -668,6 +676,9 @@ do { \
sh_divsi3_libfunc = "__sdivsi3_1"; \
else \
sh_divsi3_libfunc = "__sdivsi3"; \
+ if (sh_branch_cost == -1) \
+ sh_branch_cost \
+ = TARGET_SH5 ? 1 : ! TARGET_SH2 || TARGET_HARD_SH4 ? 2 : 1; \
if (TARGET_FMOVD) \
reg_class_from_letter['e' - 'a'] = NO_REGS; \
\
@@ -844,7 +855,7 @@ do { \
((GET_MODE_CLASS (TYPE_MODE (TYPE)) == MODE_COMPLEX_INT \
|| GET_MODE_CLASS (TYPE_MODE (TYPE)) == MODE_COMPLEX_FLOAT) \
? (unsigned) MIN (BIGGEST_ALIGNMENT, GET_MODE_BITSIZE (TYPE_MODE (TYPE))) \
- : (unsigned) ALIGN)
+ : (unsigned) DATA_ALIGNMENT(TYPE, ALIGN))
/* Make arrays of chars word-aligned for the same reasons. */
#define DATA_ALIGNMENT(TYPE, ALIGN) \
@@ -2288,6 +2299,7 @@ struct sh_args {
#define CONSTANT_ADDRESS_P(X) (GET_CODE (X) == LABEL_REF)
/* Nonzero if the constant value X is a legitimate general operand. */
+/* can_store_by_pieces constructs VOIDmode CONST_DOUBLEs. */
#define LEGITIMATE_CONSTANT_P(X) \
(TARGET_SHMEDIA \
@@ -2298,7 +2310,7 @@ struct sh_args {
|| TARGET_SHMEDIA64) \
: (GET_CODE (X) != CONST_DOUBLE \
|| GET_MODE (X) == DFmode || GET_MODE (X) == SFmode \
- || (TARGET_SH2E && (fp_zero_operand (X) || fp_one_operand (X)))))
+ || GET_MODE (X) == DImode || GET_MODE (X) == VOIDmode))
/* The macros REG_OK_FOR..._P assume that the arg is a REG rtx
and check its validity for a certain class.
diff --git a/gcc/config/sh/sh.md b/gcc/config/sh/sh.md
index d091dfe0eff..a37c58308e3 100644
--- a/gcc/config/sh/sh.md
+++ b/gcc/config/sh/sh.md
@@ -204,7 +204,9 @@
;; load_si Likewise, SImode variant for general register.
;; fload Likewise, but load to fp register.
;; store to memory
+;; fstore floating point register to memory
;; move general purpose register to register
+;; movi8 8 bit immediate to general purpose register
;; mt_group other sh4 mt instructions
;; fmove register to register, floating point
;; smpy word precision integer multiply
@@ -221,11 +223,15 @@
;; sfunc special function call with known used registers
;; call function call
;; fp floating point
+;; fpscr_toggle toggle a bit in the fpscr
;; fdiv floating point divide (or square root)
;; gp_fpul move from general purpose register to fpul
;; fpul_gp move from fpul to general purpose register
;; mac_gp move from mac[lh] to general purpose register
-;; dfp_arith, dfp_cmp,dfp_conv
+;; gp_mac move from general purpose register to mac[lh]
+;; mac_mem move from mac[lh] to memory
+;; mem_mac move from memory to mac[lh]
+;; dfp_arith,dfp_mul,fp_cmp,dfp_cmp,dfp_conv
;; ftrc_s fix_truncsfsi2_i4
;; dfdiv double precision floating point divide (or square root)
;; cwb ic_invalidate_line_i
@@ -263,7 +269,7 @@
;; nil no-op move, will be deleted.
(define_attr "type"
- "mt_group,cbranch,jump,jump_ind,arith,arith3,arith3b,dyn_shift,load,load_si,fload,store,move,fmove,smpy,dmpy,return,pload,prset,pstore,prget,pcload,pcload_si,pcfload,rte,sfunc,call,fp,fdiv,ftrc_s,dfp_arith,dfp_cmp,dfp_conv,dfdiv,gp_fpul,fpul_gp,mac_gp,mem_fpscr,gp_fpscr,cwb,movua,fsrra,fsca,tls_load,arith_media,cbranch_media,cmp_media,dfdiv_media,dfmul_media,dfparith_media,dfpconv_media,dmpy_media,fcmp_media,fdiv_media,fload_media,fmove_media,fparith_media,fpconv_media,fstore_media,gettr_media,invalidate_line_media,jump_media,load_media,pt_media,ptabs_media,store_media,mcmp_media,mac_media,d2mpy_media,atrans_media,ustore_media,nil,other"
+ "mt_group,cbranch,jump,jump_ind,arith,arith3,arith3b,dyn_shift,load,load_si,fload,store,fstore,move,movi8,fmove,smpy,dmpy,return,pload,prset,pstore,prget,pcload,pcload_si,pcfload,rte,sfunc,call,fp,fpscr_toggle,fdiv,ftrc_s,dfp_arith,dfp_mul,fp_cmp,dfp_cmp,dfp_conv,dfdiv,gp_fpul,fpul_gp,mac_gp,gp_mac,mac_mem,mem_mac,mem_fpscr,gp_fpscr,cwb,movua,fsrra,fsca,tls_load,arith_media,cbranch_media,cmp_media,dfdiv_media,dfmul_media,dfparith_media,dfpconv_media,dmpy_media,fcmp_media,fdiv_media,fload_media,fmove_media,fparith_media,fpconv_media,fstore_media,gettr_media,invalidate_line_media,jump_media,load_media,pt_media,ptabs_media,store_media,mcmp_media,mac_media,d2mpy_media,atrans_media,ustore_media,nil,other"
(const_string "other"))
;; We define a new attribute namely "insn_class".  We use
@@ -279,12 +285,12 @@
(define_attr "insn_class"
"mt_group,ex_group,ls_group,br_group,fe_group,co_group,none"
(cond [(eq_attr "type" "move,mt_group") (const_string "mt_group")
- (eq_attr "type" "arith,dyn_shift") (const_string "ex_group")
- (eq_attr "type" "fmove,load,pcload,load_si,pcload_si,fload,pcfload,store,gp_fpul,fpul_gp") (const_string "ls_group")
+ (eq_attr "type" "movi8,arith,dyn_shift") (const_string "ex_group")
+ (eq_attr "type" "fmove,load,pcload,load_si,pcload_si,fload,pcfload,store,fstore,gp_fpul,fpul_gp") (const_string "ls_group")
(eq_attr "type" "cbranch,jump") (const_string "br_group")
- (eq_attr "type" "fp,fdiv,ftrc_s,dfp_arith,dfp_conv,dfdiv")
+ (eq_attr "type" "fp,fp_cmp,fdiv,ftrc_s,dfp_arith,dfp_mul,dfp_conv,dfdiv")
(const_string "fe_group")
- (eq_attr "type" "jump_ind,smpy,dmpy,mac_gp,return,pload,prset,pstore,prget,rte,sfunc,call,dfp_cmp,mem_fpscr,gp_fpscr,cwb") (const_string "co_group")]
+ (eq_attr "type" "jump_ind,smpy,dmpy,mac_gp,return,pload,prset,pstore,prget,rte,sfunc,call,dfp_cmp,mem_fpscr,gp_fpscr,cwb,gp_mac,mac_mem,mem_mac") (const_string "co_group")]
(const_string "none")))
;; nil are zero instructions, and arith3 / arith3b are multiple instructions,
;; so these do not belong in an insn group, although they are modeled
@@ -494,14 +500,14 @@
;; SH4 Double-precision computation with double-precision result -
;; the two halves are ready at different times.
(define_attr "dfp_comp" "yes,no"
- (cond [(eq_attr "type" "dfp_arith,dfp_conv,dfdiv") (const_string "yes")]
+ (cond [(eq_attr "type" "dfp_arith,dfp_mul,dfp_conv,dfdiv") (const_string "yes")]
(const_string "no")))
;; Insns for which the latency of a preceding fp insn is decreased by one.
(define_attr "late_fp_use" "yes,no" (const_string "no"))
;; And feeding insns for which this relevant.
(define_attr "any_fp_comp" "yes,no"
- (cond [(eq_attr "type" "fp,fdiv,ftrc_s,dfp_arith,dfp_conv,dfdiv")
+ (cond [(eq_attr "type" "fp,fdiv,ftrc_s,dfp_arith,dfp_mul,dfp_conv,dfdiv")
(const_string "yes")]
(const_string "no")))
@@ -609,15 +615,37 @@
[(set_attr "type" "mt_group")])
;; -------------------------------------------------------------------------
+;; SImode compare and branch
+;; -------------------------------------------------------------------------
+
+(define_expand "cbranchsi4"
+ [(set (pc)
+ (if_then_else (match_operator 0 "comparison_operator"
+ [(match_operand:SI 1 "arith_operand" "")
+ (match_operand:SI 2 "arith_operand" "")])
+ (label_ref (match_operand 3 "" ""))
+ (pc)))
+ (clobber (reg:SI T_REG))]
+ "TARGET_CBRANCHDI4"
+ "expand_cbranchsi4 (operands, CODE_FOR_nothing, -1); DONE;")
+
+;; -------------------------------------------------------------------------
;; SImode unsigned integer comparisons
;; -------------------------------------------------------------------------
-(define_insn "cmpgeusi_t"
+(define_insn_and_split "cmpgeusi_t"
[(set (reg:SI T_REG)
(geu:SI (match_operand:SI 0 "arith_reg_operand" "r")
- (match_operand:SI 1 "arith_reg_operand" "r")))]
+ (match_operand:SI 1 "arith_reg_or_0_operand" "rN")))]
"TARGET_SH1"
"cmp/hs %1,%0"
+ "&& operands[0] == CONST0_RTX (SImode)"
+ [(pc)]
+ "
+{
+ emit_insn (gen_sett ());
+ DONE;
+}"
[(set_attr "type" "mt_group")])
(define_insn "cmpgtusi_t"
@@ -647,12 +675,64 @@
}")
;; -------------------------------------------------------------------------
-;; DImode signed integer comparisons
+;; DImode compare and branch
;; -------------------------------------------------------------------------
-;; ??? Could get better scheduling by splitting the initial test from the
-;; rest of the insn after reload. However, the gain would hardly justify
-;; the sh.md size increase necessary to do that.
+
+;; arith3 patterns don't work well with the sh4-300 branch prediction mechanism.
+;; Therefore, we aim to have a set of three branches that go straight to the
+;; destination, i.e. only one of them is taken at any one time.
+;; This mechanism should also be slightly better for the sh4-200.
+
+(define_expand "cbranchdi4"
+ [(set (pc)
+ (if_then_else (match_operator 0 "comparison_operator"
+ [(match_operand:DI 1 "arith_operand" "")
+ (match_operand:DI 2 "arith_operand" "")])
+ (label_ref (match_operand 3 "" ""))
+ (pc)))
+ (clobber (match_dup 4))
+ (clobber (reg:SI T_REG))]
+ "TARGET_CBRANCHDI4"
+ "
+{
+ enum rtx_code comparison;
+
+ if (TARGET_EXPAND_CBRANCHDI4)
+ {
+ if (expand_cbranchdi4 (operands, CODE_FOR_nothing))
+ DONE;
+ }
+ comparison = prepare_cbranch_operands (operands, DImode, CODE_FOR_nothing);
+ if (comparison != GET_CODE (operands[0]))
+ operands[0]
+      = gen_rtx_fmt_ee (comparison, VOIDmode, operands[1], operands[2]);
+ operands[4] = gen_rtx_SCRATCH (SImode);
+}")
+
+(define_insn_and_split "cbranchdi4_i"
+ [(set (pc)
+ (if_then_else (match_operator 0 "comparison_operator"
+ [(match_operand:DI 1 "arith_operand" "r,r")
+ (match_operand:DI 2 "arith_operand" "rN,i")])
+ (label_ref (match_operand 3 "" ""))
+ (pc)))
+ (clobber (match_scratch:SI 4 "=X,&r"))
+ (clobber (reg:SI T_REG))]
+ "TARGET_CBRANCHDI4"
+ "#"
+ "&& reload_completed"
+ [(pc)]
+ "
+{
+ if (!expand_cbranchdi4 (operands, GET_CODE (operands[0])))
+ FAIL;
+ DONE;
+}")
+
+;; -------------------------------------------------------------------------
+;; DImode signed integer comparisons
+;; -------------------------------------------------------------------------
(define_insn ""
[(set (reg:SI T_REG)
@@ -4736,7 +4816,7 @@ label:
[(set (mem:SF (pre_dec:SI (reg:SI SP_REG))) (reg:SF FPUL_REG))]
"TARGET_SH2E && ! TARGET_SH5"
"sts.l fpul,@-r15"
- [(set_attr "type" "store")
+ [(set_attr "type" "fstore")
(set_attr "late_fp_use" "yes")
(set_attr "hit_stack" "yes")])
@@ -4818,9 +4898,9 @@ label:
;; (made from (set (subreg:SI (reg:QI ###) 0) ) into T.
(define_insn "movsi_i"
[(set (match_operand:SI 0 "general_movdst_operand"
- "=r,r,t,r,r,r,r,m,<,<,x,l,x,l,r")
+ "=r,r,r,t,r,r,r,r,m,<,<,x,l,x,l,r")
(match_operand:SI 1 "general_movsrc_operand"
- "Q,rI08,r,mr,x,l,t,r,x,l,r,r,>,>,i"))]
+ "Q,r,I08,r,mr,x,l,t,r,x,l,r,r,>,>,i"))]
"TARGET_SH1
&& ! TARGET_SH2E
&& ! TARGET_SH2A
@@ -4829,6 +4909,7 @@ label:
"@
mov.l %1,%0
mov %1,%0
+ mov %1,%0
cmp/pl %1
mov.l %1,%0
sts %1,%0
@@ -4842,8 +4923,8 @@ label:
lds.l %1,%0
lds.l %1,%0
fake %1,%0"
- [(set_attr "type" "pcload_si,move,mt_group,load_si,mac_gp,prget,move,store,store,pstore,move,prset,load,pload,pcload_si")
- (set_attr "length" "*,*,*,*,*,*,*,*,*,*,*,*,*,*,*")])
+ [(set_attr "type" "pcload_si,move,movi8,mt_group,load_si,mac_gp,prget,arith,mac_mem,store,pstore,gp_mac,prset,mem_mac,pload,pcload_si")
+ (set_attr "length" "*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*")])
;; t/r must come after r/r, lest reload will try to reload stuff like
;; (subreg:SI (reg:SF FR14_REG) 0) into T (compiling stdlib/strtod.c -m3e -O2)
@@ -4853,15 +4934,16 @@ label:
;; TARGET_FMOVD is in effect, and mode switching is done before reload.
(define_insn "movsi_ie"
[(set (match_operand:SI 0 "general_movdst_operand"
- "=r,r,r,t,r,r,r,r,m,<,<,x,l,x,l,y,<,r,y,r,*f,y,*f,y")
+ "=r,r,r,r,t,r,r,r,r,m,<,<,x,l,x,l,y,<,r,y,r,*f,y,*f,y")
(match_operand:SI 1 "general_movsrc_operand"
- "Q,rI08,I20,r,mr,x,l,t,r,x,l,r,r,>,>,>,y,i,r,y,y,*f,*f,y"))]
+ "Q,r,I08,I20,r,mr,x,l,t,r,x,l,r,r,>,>,>,y,i,r,y,y,*f,*f,y"))]
"(TARGET_SH2E || TARGET_SH2A)
&& (register_operand (operands[0], SImode)
|| register_operand (operands[1], SImode))"
"@
mov.l %1,%0
mov %1,%0
+ mov %1,%0
movi20 %1,%0
cmp/pl %1
mov.l %1,%0
@@ -4884,26 +4966,27 @@ label:
flds %1,fpul
fmov %1,%0
! move optimized away"
- [(set_attr "type" "pcload_si,move,move,*,load_si,mac_gp,prget,move,store,store,pstore,move,prset,load,pload,load,store,pcload_si,gp_fpul,fpul_gp,fmove,fmove,fmove,nil")
- (set_attr "late_fp_use" "*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,yes,*,*,yes,*,*,*,*")
- (set_attr "length" "*,*,4,*,4,*,*,*,4,*,*,*,*,*,*,*,*,*,*,*,*,*,*,0")])
+ [(set_attr "type" "pcload_si,move,movi8,move,*,load_si,mac_gp,prget,arith,store,mac_mem,pstore,gp_mac,prset,mem_mac,pload,load,fstore,pcload_si,gp_fpul,fpul_gp,fmove,fmove,fmove,nil")
+ (set_attr "late_fp_use" "*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,yes,*,*,yes,*,*,*,*")
+ (set_attr "length" "*,*,*,4,*,4,*,*,*,4,*,*,*,*,*,*,*,*,*,*,*,*,*,*,0")])
(define_insn "movsi_i_lowpart"
- [(set (strict_low_part (match_operand:SI 0 "general_movdst_operand" "+r,r,r,r,r,r,m,r"))
- (match_operand:SI 1 "general_movsrc_operand" "Q,rI08,mr,x,l,t,r,i"))]
+ [(set (strict_low_part (match_operand:SI 0 "general_movdst_operand" "+r,r,r,r,r,r,r,m,r"))
+ (match_operand:SI 1 "general_movsrc_operand" "Q,r,I08,mr,x,l,t,r,i"))]
"TARGET_SH1
&& (register_operand (operands[0], SImode)
|| register_operand (operands[1], SImode))"
"@
mov.l %1,%0
mov %1,%0
+ mov %1,%0
mov.l %1,%0
sts %1,%0
sts %1,%0
movt %0
mov.l %1,%0
fake %1,%0"
- [(set_attr "type" "pcload,move,load,move,prget,move,store,pcload")])
+ [(set_attr "type" "pcload,move,arith,load,mac_gp,prget,arith,store,pcload")])
(define_insn_and_split "load_ra"
[(set (match_operand:SI 0 "general_movdst_operand" "")
@@ -5155,19 +5238,20 @@ label:
(set_attr "needs_delay_slot" "yes")])
(define_insn "movqi_i"
- [(set (match_operand:QI 0 "general_movdst_operand" "=r,r,m,r,r,l")
- (match_operand:QI 1 "general_movsrc_operand" "ri,m,r,t,l,r"))]
+ [(set (match_operand:QI 0 "general_movdst_operand" "=r,r,r,m,r,r,l")
+ (match_operand:QI 1 "general_movsrc_operand" "r,i,m,r,t,l,r"))]
"TARGET_SH1
&& (arith_reg_operand (operands[0], QImode)
|| arith_reg_operand (operands[1], QImode))"
"@
mov %1,%0
+ mov %1,%0
mov.b %1,%0
mov.b %1,%0
movt %0
sts %1,%0
lds %1,%0"
- [(set_attr "type" "move,load,store,move,move,move")])
+ [(set_attr "type" "move,movi8,load,store,arith,prget,prset")])
(define_insn "*movqi_media"
[(set (match_operand:QI 0 "general_movdst_operand" "=r,r,r,m")
@@ -5769,7 +5853,7 @@ label:
(if_then_else
(ne (symbol_ref "TARGET_SHCOMPACT") (const_int 0))
(const_int 10) (const_int 8))])
- (set_attr "type" "fmove,move,pcfload,fload,store,pcload,load,store,load,fload")
+ (set_attr "type" "fmove,move,pcfload,fload,fstore,pcload,load,store,load,fload")
(set_attr "late_fp_use" "*,*,*,*,yes,*,*,*,*,*")
(set (attr "fp_mode") (if_then_else (eq_attr "fmovd" "yes")
(const_string "double")
@@ -6486,7 +6570,7 @@ label:
sts.l %1,%0
lds.l %1,%0
! move optimized away"
- [(set_attr "type" "fmove,move,fmove,fmove,pcfload,fload,store,pcload,load,store,fmove,fmove,load,*,fpul_gp,gp_fpul,store,load,nil")
+ [(set_attr "type" "fmove,move,fmove,fmove,pcfload,fload,fstore,pcload,load,store,fmove,fmove,load,*,fpul_gp,gp_fpul,fstore,load,nil")
(set_attr "late_fp_use" "*,*,*,*,*,*,yes,*,*,*,*,*,*,*,yes,*,yes,*,*")
(set_attr "length" "*,*,*,*,4,4,4,*,*,*,2,2,2,4,2,2,2,2,0")
(set (attr "fp_mode") (if_then_else (eq_attr "fmovd" "yes")
@@ -9929,7 +10013,7 @@ mov.l\\t1f,r0\\n\\
sts fpscr,%0
sts.l fpscr,%0"
[(set_attr "length" "0,2,2,4,2,2,2,2,2")
- (set_attr "type" "nil,mem_fpscr,load,mem_fpscr,gp_fpscr,move,store,mac_gp,store")])
+ (set_attr "type" "nil,mem_fpscr,load,mem_fpscr,gp_fpscr,move,store,mac_gp,fstore")])
(define_peephole2
[(set (reg:PSI FPSCR_REG)
@@ -9980,7 +10064,7 @@ mov.l\\t1f,r0\\n\\
(xor:PSI (reg:PSI FPSCR_REG) (const_int 1048576)))]
"(TARGET_SH4 || TARGET_SH2A_DOUBLE)"
"fschg"
- [(set_attr "type" "fp") (set_attr "fp_set" "unknown")])
+ [(set_attr "type" "fpscr_toggle") (set_attr "fp_set" "unknown")])
;; There's no way we can use it today, since optimize mode switching
;; doesn't enable us to know from which mode we're switching to the
@@ -9992,7 +10076,7 @@ mov.l\\t1f,r0\\n\\
(xor:PSI (reg:PSI FPSCR_REG) (const_int 524288)))]
"TARGET_SH4A_FP && ! TARGET_FPU_SINGLE"
"fpchg"
- [(set_attr "type" "fp")])
+ [(set_attr "type" "fpscr_toggle")])
(define_expand "addsf3"
[(set (match_operand:SF 0 "arith_reg_operand" "")
@@ -10124,25 +10208,12 @@ mov.l\\t1f,r0\\n\\
[(set_attr "type" "fp")
(set_attr "fp_mode" "single")])
-;; Unfortunately, the combiner is unable to cope with the USE of the FPSCR
-;; register in feeding fp instructions. Thus, we cannot generate fmac for
-;; mixed-precision SH4 targets. To allow it to be still generated for the
-;; SH3E, we use a separate insn for SH3E mulsf3.
-
(define_expand "mulsf3"
[(set (match_operand:SF 0 "fp_arith_reg_operand" "")
(mult:SF (match_operand:SF 1 "fp_arith_reg_operand" "")
(match_operand:SF 2 "fp_arith_reg_operand" "")))]
"TARGET_SH2E || TARGET_SHMEDIA_FPU"
- "
-{
- if (TARGET_SH4 || TARGET_SH2A_SINGLE)
- expand_sf_binop (&gen_mulsf3_i4, operands);
- else if (TARGET_SH2E)
- emit_insn (gen_mulsf3_ie (operands[0], operands[1], operands[2]));
- if (! TARGET_SHMEDIA)
- DONE;
-}")
+ "")
(define_insn "*mulsf3_media"
[(set (match_operand:SF 0 "fp_arith_reg_operand" "=f")
@@ -10152,6 +10223,27 @@ mov.l\\t1f,r0\\n\\
"fmul.s %1, %2, %0"
[(set_attr "type" "fparith_media")])
+;; Unfortunately, the combiner is unable to cope with the USE of the FPSCR
+;; register in feeding fp instructions. Thus, in order to generate fmac,
+;; we start out with a mulsf pattern that does not depend on fpscr.
+;; This is split after combine to introduce the dependency, in order to
+;; get mode switching and scheduling right.
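+;;
+;; As an illustration (hypothetical example, not from this patch), in
+;;   float madd (float a, float b, float c) { return a * b + c; }
+;; combine can only merge the multiply and the add into a single fmac
+;; while the mulsf insn does not yet use FPSCR.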
+(define_insn_and_split "mulsf3_ie"
+ [(set (match_operand:SF 0 "fp_arith_reg_operand" "=f")
+ (mult:SF (match_operand:SF 1 "fp_arith_reg_operand" "%0")
+ (match_operand:SF 2 "fp_arith_reg_operand" "f")))]
+ "TARGET_SH2E"
+ "fmul %2,%0"
+ "TARGET_SH4 || TARGET_SH2A_SINGLE"
+ [(const_int 0)]
+ "
+{
+ emit_insn (gen_mulsf3_i4 (operands[0], operands[1], operands[2],
+ get_fpscr_rtx ()));
+ DONE;
+}"
+ [(set_attr "type" "fp")])
+
(define_insn "mulsf3_i4"
[(set (match_operand:SF 0 "fp_arith_reg_operand" "=f")
(mult:SF (match_operand:SF 1 "fp_arith_reg_operand" "%0")
@@ -10162,20 +10254,12 @@ mov.l\\t1f,r0\\n\\
[(set_attr "type" "fp")
(set_attr "fp_mode" "single")])
-(define_insn "mulsf3_ie"
- [(set (match_operand:SF 0 "fp_arith_reg_operand" "=f")
- (mult:SF (match_operand:SF 1 "fp_arith_reg_operand" "%0")
- (match_operand:SF 2 "fp_arith_reg_operand" "f")))]
- "TARGET_SH2E && ! (TARGET_SH4 || TARGET_SH2A_SINGLE)"
- "fmul %2,%0"
- [(set_attr "type" "fp")])
-
(define_insn "mac_media"
[(set (match_operand:SF 0 "fp_arith_reg_operand" "=f")
(plus:SF (mult:SF (match_operand:SF 1 "fp_arith_reg_operand" "%f")
(match_operand:SF 2 "fp_arith_reg_operand" "f"))
(match_operand:SF 3 "fp_arith_reg_operand" "0")))]
- "TARGET_SHMEDIA_FPU"
+ "TARGET_SHMEDIA_FPU && TARGET_FMAC"
"fmac.s %1, %2, %0"
[(set_attr "type" "fparith_media")])
@@ -10185,7 +10269,7 @@ mov.l\\t1f,r0\\n\\
(match_operand:SF 2 "fp_arith_reg_operand" "f"))
(match_operand:SF 3 "arith_reg_operand" "0")))
(use (match_operand:PSI 4 "fpscr_operand" "c"))]
- "TARGET_SH2E && ! TARGET_SH4"
+ "TARGET_SH2E && TARGET_FMAC"
"fmac fr0,%2,%0"
[(set_attr "type" "fp")
(set_attr "fp_mode" "single")])
@@ -10336,7 +10420,7 @@ mov.l\\t1f,r0\\n\\
(match_operand:SF 1 "fp_arith_reg_operand" "f")))]
"TARGET_SH2E && ! (TARGET_SH4 || TARGET_SH2A_SINGLE)"
"fcmp/gt %1,%0"
- [(set_attr "type" "fp")
+ [(set_attr "type" "fp_cmp")
(set_attr "fp_mode" "single")])
(define_insn "cmpeqsf_t"
@@ -10345,7 +10429,7 @@ mov.l\\t1f,r0\\n\\
(match_operand:SF 1 "fp_arith_reg_operand" "f")))]
"TARGET_SH2E && ! (TARGET_SH4 || TARGET_SH2A_SINGLE)"
"fcmp/eq %1,%0"
- [(set_attr "type" "fp")
+ [(set_attr "type" "fp_cmp")
(set_attr "fp_mode" "single")])
(define_insn "ieee_ccmpeqsf_t"
@@ -10365,7 +10449,7 @@ mov.l\\t1f,r0\\n\\
(use (match_operand:PSI 2 "fpscr_operand" "c"))]
"(TARGET_SH4 || TARGET_SH2A_SINGLE)"
"fcmp/gt %1,%0"
- [(set_attr "type" "fp")
+ [(set_attr "type" "fp_cmp")
(set_attr "fp_mode" "single")])
(define_insn "cmpeqsf_t_i4"
@@ -10375,7 +10459,7 @@ mov.l\\t1f,r0\\n\\
(use (match_operand:PSI 2 "fpscr_operand" "c"))]
"(TARGET_SH4 || TARGET_SH2A_SINGLE)"
"fcmp/eq %1,%0"
- [(set_attr "type" "fp")
+ [(set_attr "type" "fp_cmp")
(set_attr "fp_mode" "single")])
(define_insn "*ieee_ccmpeqsf_t_4"
@@ -10724,7 +10808,7 @@ mov.l\\t1f,r0\\n\\
(use (match_operand:PSI 3 "fpscr_operand" "c"))]
"(TARGET_SH4 || TARGET_SH2A_DOUBLE)"
"fmul %2,%0"
- [(set_attr "type" "dfp_arith")
+ [(set_attr "type" "dfp_mul")
(set_attr "fp_mode" "double")])
(define_expand "divdf3"
diff --git a/gcc/config/sh/sh.opt b/gcc/config/sh/sh.opt
index 7f9a87e95d9..161fdd8dcaf 100644
--- a/gcc/config/sh/sh.opt
+++ b/gcc/config/sh/sh.opt
@@ -57,11 +57,11 @@ Target RejectNegative Condition(SUPPORT_SH2A_NOFPU)
Generate SH2a FPU-less code
m2a-single
-Target RejectNegative Condition (SUPPORT_SH2A_SINGLE)
+Target RejectNegative Condition(SUPPORT_SH2A_SINGLE)
Generate default single-precision SH2a code
m2a-single-only
-Target RejectNegative Condition (SUPPORT_SH2A_SINGLE_ONLY)
+Target RejectNegative Condition(SUPPORT_SH2A_SINGLE_ONLY)
Generate only single-precision SH2a code
m2e
@@ -88,10 +88,33 @@ m4-200
Target RejectNegative Condition(SUPPORT_SH4)
Generate SH4-200 code
+;; TARGET_SH4_300 indicates whether we have the ST40-300 instruction set and
+;; pipeline, irrespective of the ABI.
+m4-300
+Target RejectNegative Condition(SUPPORT_SH4) Var(TARGET_SH4_300)
+Generate SH4-300 code
+
m4-nofpu
Target RejectNegative Condition(SUPPORT_SH4_NOFPU)
Generate SH4 FPU-less code
+m4-100-nofpu
+Target RejectNegative Condition(SUPPORT_SH4_NOFPU)
+Generate SH4-100 FPU-less code
+
+m4-200-nofpu
+Target RejectNegative Condition(SUPPORT_SH4_NOFPU)
+Generate SH4-200 FPU-less code
+
+m4-300-nofpu
+Target RejectNegative Condition(SUPPORT_SH4_NOFPU) Var(TARGET_SH4_300) VarExists
+Generate SH4-300 FPU-less code
+
+m4-340
+Target RejectNegative Condition(SUPPORT_SH4_NOFPU) Var(TARGET_SH4_300) VarExists
+Generate code for SH4 340 series (MMU/FPU-less)
+;; passes -isa=sh4-nommu-nofpu to the assembler.
+
m4-400
Target RejectNegative Condition(SUPPORT_SH4_NOFPU)
Generate code for SH4 400 series (MMU/FPU-less)
@@ -114,6 +137,10 @@ m4-200-single
Target RejectNegative Condition(SUPPORT_SH4_SINGLE)
Generate default single-precision SH4-200 code
+m4-300-single
+Target RejectNegative Condition(SUPPORT_SH4_SINGLE) Var(TARGET_SH4_300) VarExists
+Generate default single-precision SH4-300 code
+
m4-single-only
Target RejectNegative Condition(SUPPORT_SH4_SINGLE_ONLY)
Generate only single-precision SH4 code
@@ -126,6 +153,10 @@ m4-200-single-only
Target RejectNegative Condition(SUPPORT_SH4_SINGLE_ONLY)
Generate only single-precision SH4-200 code
+m4-300-single-only
+Target RejectNegative Condition(SUPPORT_SH4_SINGLE_ONLY) Var(TARGET_SH4_300) VarExists
+Generate only single-precision SH4-300 code
+
m4a
Target RejectNegative Mask(SH4A) Condition(SUPPORT_SH4A)
Generate SH4a code
@@ -182,6 +213,22 @@ mbigtable
Target Report RejectNegative Mask(BIGTABLE)
Generate 32-bit offsets in switch tables
+mbranch-cost=
+Target RejectNegative Joined UInteger Var(sh_branch_cost) Init(-1)
+Cost to assume for a branch insn
+
+mcbranchdi
+Target Var(TARGET_CBRANCHDI4)
+Enable cbranchdi4 pattern
+
+mexpand-cbranchdi
+Target Var(TARGET_EXPAND_CBRANCHDI4)
+Expand cbranchdi4 pattern early into separate comparisons and branches
+
+mcmpeqdi
+Target Var(TARGET_CMPEQDI_T)
+Emit cmpeqdi_t pattern even when -mcbranchdi and -mexpand-cbranchdi are in effect
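+;; Illustrative usage (hypothetical command line):
+;;   sh-elf-gcc -mcbranchdi -mexpand-cbranchdi -mbranch-cost=2 foo.c
+;; enables the cbranchdi4 pattern, expands it early, and assumes a
+;; branch cost of 2.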
+
mcut2-workaround
Target RejectNegative Var(TARGET_SH5_CUT2_WORKAROUND)
Enable SH5 cut2 workaround
@@ -192,7 +239,7 @@ Align doubles at 64-bit boundaries
mdiv=
Target RejectNegative Joined Var(sh_div_str) Init("")
-Division strategy, one of: call, call2, fp, inv, inv:minlat, inv20u, inv20l, inv:call, inv:call2, inv:fp call-div1 call-fp call-table
+Division strategy, one of: call, call2, fp, inv, inv:minlat, inv20u, inv20l, inv:call, inv:call2, inv:fp, call-div1, call-fp, call-table
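+;; Usage sketch (illustrative): the value is joined to the option, as in
+;;   -mdiv=inv:minlat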
mdivsi3_libfunc=
Target RejectNegative Joined Var(sh_divsi3_libfunc) Init("")
@@ -201,6 +248,10 @@ Specify name for 32 bit signed division function
mfmovd
Target RejectNegative Mask(FMOVD) Undocumented
+mfused-madd
+Target Var(TARGET_FMAC)
+Enable the use of the fused floating point multiply-accumulate operation
+
mgettrcost=
Target RejectNegative Joined UInteger Var(sh_gettrcost) Init(-1)
Cost to assume for gettr insn
diff --git a/gcc/config/sh/sh1.md b/gcc/config/sh/sh1.md
index 9dfdd86508f..1198fe737b9 100644
--- a/gcc/config/sh/sh1.md
+++ b/gcc/config/sh/sh1.md
@@ -1,5 +1,5 @@
;; DFA scheduling description for Renesas / SuperH SH.
-;; Copyright (C) 2004 Free Software Foundation, Inc.
+;; Copyright (C) 2004, 2006 Free Software Foundation, Inc.
;; This file is part of GCC.
@@ -45,7 +45,7 @@
(define_insn_reservation "sh1_load_store" 2
(and (eq_attr "pipe_model" "sh1")
- (eq_attr "type" "load,pcload,pload,store,pstore"))
+ (eq_attr "type" "load,pcload,pload,mem_mac,store,fstore,pstore,mac_mem"))
"sh1memory*2")
(define_insn_reservation "sh1_arith3" 3
@@ -76,7 +76,7 @@
(define_insn_reservation "sh1_fp" 2
(and (eq_attr "pipe_model" "sh1")
- (eq_attr "type" "fp,fmove"))
+ (eq_attr "type" "fp,fpscr_toggle,fp_cmp,fmove"))
"sh1fp")
(define_insn_reservation "sh1_fdiv" 13
diff --git a/gcc/config/sh/sh4-300.md b/gcc/config/sh/sh4-300.md
new file mode 100644
index 00000000000..228782a67fc
--- /dev/null
+++ b/gcc/config/sh/sh4-300.md
@@ -0,0 +1,288 @@
+;; DFA scheduling description for ST40-300.
+;; Copyright (C) 2004, 2006 Free Software Foundation, Inc.
+
+;; This file is part of GCC.
+
+;; GCC is free software; you can redistribute it and/or modify
+;; it under the terms of the GNU General Public License as published by
+;; the Free Software Foundation; either version 2, or (at your option)
+;; any later version.
+
+;; GCC is distributed in the hope that it will be useful,
+;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;; GNU General Public License for more details.
+
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING. If not, write to
+;; the Free Software Foundation, 51 Franklin Street, Fifth Floor,
+;; Boston, MA 02110-1301, USA.
+
+;; Load and store instructions save a cycle if they are aligned on a
+;; four byte boundary. Using a function unit for stores encourages
+;; gcc to separate load and store instructions by one instruction,
+;; which makes it more likely that the linker will be able to word
+;; align them when relaxing.
+
+;; The following description models the ST40-300 pipeline using the
+;; DFA-based scheduler.
+
+;; Two automata are defined to reduce the number of states that a single
+;; large automaton would have (factoring).
+
+(define_automaton "sh4_300_inst_pipeline,sh4_300_fpu_pipe")
+
+;; This unit is basically the decode unit of the processor.
+;; Since the SH4 is a dual-issue machine, it is modelled as two
+;; units, so that any insn can be processed by either one of the
+;; decoding units.
+
+(define_cpu_unit "sh4_300_pipe_01,sh4_300_pipe_02" "sh4_300_inst_pipeline")
+
+;; The floating point units.
+
+(define_cpu_unit "sh4_300_fpt,sh4_300_fpu,sh4_300_fds" "sh4_300_fpu_pipe")
+
+;; The integer multiplier unit.
+
+(define_cpu_unit "sh4_300_mul" "sh4_300_inst_pipeline")
+
+;; LS unit
+
+(define_cpu_unit "sh4_300_ls" "sh4_300_inst_pipeline")
+
+;; The address calculator used for branch instructions.
+;; It is reserved at "issue" of a branch instruction, which makes
+;; sure that no two branch instructions can be issued in parallel.
+
+(define_cpu_unit "sh4_300_br" "sh4_300_inst_pipeline")
+
+;; ----------------------------------------------------
+;; This reservation is to simplify the dual issue description.
+
+(define_reservation "sh4_300_issue" "sh4_300_pipe_01|sh4_300_pipe_02")
+
+(define_reservation "all" "sh4_300_pipe_01+sh4_300_pipe_02")
+
+;;(define_insn_reservation "nil" 0 (eq_attr "type" "nil") "nothing")
+
+;; MOV RM,RN / MOV #imm8,RN / STS PR,RN
+(define_insn_reservation "sh4_300_mov" 0
+ (and (eq_attr "pipe_model" "sh4_300")
+ (eq_attr "type" "move,movi8,prget"))
+ "sh4_300_issue")
+
+;; Fixed STS from MACL / MACH
+(define_insn_reservation "sh4_300_mac_gp" 0
+ (and (eq_attr "pipe_model" "sh4_300")
+ (eq_attr "type" "mac_gp"))
+ "sh4_300_issue+sh4_300_mul")
+
+;; Fixed LDS to MACL / MACH
+(define_insn_reservation "sh4_300_gp_mac" 1
+ (and (eq_attr "pipe_model" "sh4_300")
+ (eq_attr "type" "gp_mac"))
+ "sh4_300_issue+sh4_300_mul")
+
+;; Instructions with latency 1 that have no specific resource requirements.
+
+(define_insn_reservation "sh4_300_simple_arith" 1
+ (and (eq_attr "pipe_model" "sh4_300")
+ (eq_attr "type" "mt_group,arith,dyn_shift,prset"))
+ "sh4_300_issue")
+
+;; Load and store instructions have no alignment peculiarities for the ST40-300,
+;; but they use the load-store unit, which they share with the fmove type
+;; insns (fldi[01]; fmov frn,frm; flds; fsts; fabs; fneg).
+;; Loads have a latency of three.
+
+;; Load Store instructions.
+(define_insn_reservation "sh4_300_load" 3
+ (and (eq_attr "pipe_model" "sh4_300")
+ (eq_attr "type" "load,pcload,load_si,pcload_si,pload"))
+ "sh4_300_issue+sh4_300_ls")
+
+(define_insn_reservation "sh4_300_mac_load" 3
+ (and (eq_attr "pipe_model" "sh4_300")
+ (eq_attr "type" "mem_mac"))
+ "sh4_300_issue+sh4_300_ls+sh4_300_mul")
+
+(define_insn_reservation "sh4_300_fload" 4
+ (and (eq_attr "pipe_model" "sh4_300")
+ (eq_attr "type" "fload,pcfload"))
+ "sh4_300_issue+sh4_300_ls+sh4_300_fpt")
+
+;; sh_adjust_cost describes the reduced latency of insns that feed a store.
+;; The latency of an auto-increment register is 1; the latency of the memory
+;; output is not actually considered here anyway.
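+;; (Illustrative note: for a pre-decrement store such as "mov.l r1,@-r15",
+;; this means the updated r15 is available to the next insn after one
+;; cycle, while the memory write itself is not modelled as a result.)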
+(define_insn_reservation "sh4_300_store" 1
+ (and (eq_attr "pipe_model" "sh4_300")
+ (eq_attr "type" "store,pstore"))
+ "sh4_300_issue+sh4_300_ls")
+
+(define_insn_reservation "sh4_300_fstore" 1
+ (and (eq_attr "pipe_model" "sh4_300")
+ (eq_attr "type" "fstore"))
+ "sh4_300_issue+sh4_300_ls+sh4_300_fpt")
+
+;; Fixed STS.L from MACL / MACH
+(define_insn_reservation "sh4_300_mac_store" 1
+ (and (eq_attr "pipe_model" "sh4_300")
+ (eq_attr "type" "mac_mem"))
+ "sh4_300_issue+sh4_300_mul+sh4_300_ls")
+
+(define_insn_reservation "sh4_300_gp_fpul" 2
+ (and (eq_attr "pipe_model" "sh4_300")
+ (eq_attr "type" "gp_fpul"))
+ "sh4_300_issue+sh4_300_fpt")
+
+(define_insn_reservation "sh4_300_fpul_gp" 1
+ (and (eq_attr "pipe_model" "sh4_300")
+ (eq_attr "type" "fpul_gp"))
+ "sh4_300_issue+sh4_300_fpt")
+
+;; Branch (BF,BF/S,BT,BT/S,BRA)
+;; Branch Far (JMP,RTS,BRAF)
+;; Group: BR
+;; When the displacement is 0 for BF / BT, we effectively get conditional
+;; execution of one instruction, without pipeline disruption.
+;; Otherwise, the latency depends on prediction success.
+;; We can't really do much with the latency, even if we could express it,
+;; but the pairing restrictions are useful to take into account.
+;; ??? If the branch is likely, and not paired with a preceding insn,
+;; or likely but likely not predicted, we might want to fill the delay slot.
+;; However, there appears to be no machinery to make the compiler
+;; recognize these scenarios.
+
+(define_insn_reservation "sh4_300_branch" 1
+ (and (eq_attr "pipe_model" "sh4_300")
+ (eq_attr "type" "cbranch,jump,return,jump_ind"))
+ "sh4_300_issue+sh4_300_br")
+
+;; RTE
+(define_insn_reservation "sh4_300_return_from_exp" 9
+ (and (eq_attr "pipe_model" "sh4_300")
+ (eq_attr "type" "rte"))
+ "sh4_300_pipe_01+sh4_300_pipe_02*9")
+
+;; OCBP, OCBWB
+;; Group: CO
+;; Latency: 1-5
+;; Issue Rate: 1
+
+;; cwb is used for the sequence ocbwb @%0; extu.w %0,%2; or %1,%2; mov.l %0,@%2
+;; This description is likely inexact, but this pattern should not actually
+;; appear when compiling for sh4-300; we should use isbi instead.
+;; If a -mtune option is added later, we should use the icache array
+;; dispatch method instead.
+(define_insn_reservation "sh4_300_ocbwb" 3
+ (and (eq_attr "pipe_model" "sh4_300")
+ (eq_attr "type" "cwb"))
+ "all*3")
+
+;; JSR,BSR,BSRF
+;; Calls have a mandatory delay slot, which we'd like to fill with an insn
+;; that can be paired with the call itself.
+;; Scheduling runs before reorg, so we approximate this by saying that we
+;; want the call to be paired with a preceding insn.
+;; In most cases, the insn that loads the address of the call should have
+;; a non-zero latency (mov rn,rm doesn't make sense since we could use rn
+;; for the address then). Thus, a preceding insn that can be paired with
+;; a call should be eligible for the delay slot.
+;;
+;; Calls introduce a longish delay that is likely to flush the pipelines
+;; of the caller's instructions.
+;; load to restore a register (in the delay slot of rts), while sfuncs
+;; tend to end with an EX or MT insn. But that is not actually relevant,
+;; since there are no instructions that contend for memory access early.
+;; We could, of course, provide exact scheduling information for specific
+;; sfuncs, if that should prove useful.
+
+(define_insn_reservation "sh4_300_call" 16
+ (and (eq_attr "pipe_model" "sh4_300")
+ (eq_attr "type" "call,sfunc"))
+ "sh4_300_issue+sh4_300_br,all*15")
+
+;; FMOV.S / FMOV.D
+(define_insn_reservation "sh4_300_fmov" 1
+ (and (eq_attr "pipe_model" "sh4_300")
+ (eq_attr "type" "fmove"))
+ "sh4_300_issue+sh4_300_fpt")
+
+;; LDS to FPSCR
+(define_insn_reservation "sh4_300_fpscr_load" 8
+ (and (eq_attr "pipe_model" "sh4_300")
+ (eq_attr "type" "gp_fpscr"))
+ "sh4_300_issue+sh4_300_fpu+sh4_300_fpt")
+
+;; LDS.L to FPSCR
+(define_insn_reservation "sh4_300_fpscr_load_mem" 8
+ (and (eq_attr "pipe_model" "sh4_300")
+ (eq_attr "type" "mem_fpscr"))
+ "sh4_300_issue+sh4_300_fpu+sh4_300_fpt+sh4_300_ls")
+
+
+;; Fixed point multiplication (DMULS.L, DMULU.L, MUL.L, MULS.W, MULU.W)
+(define_insn_reservation "multi" 2
+ (and (eq_attr "pipe_model" "sh4_300")
+ (eq_attr "type" "smpy,dmpy"))
+ "sh4_300_issue+sh4_300_mul")
+
+;; FPCHG, FRCHG, FSCHG
+(define_insn_reservation "fpscr_toggle" 1
+ (and (eq_attr "pipe_model" "sh4_300")
+ (eq_attr "type" "fpscr_toggle"))
+ "sh4_300_issue+sh4_300_fpu+sh4_300_fpt")
+
+;; FCMP/EQ, FCMP/GT
+(define_insn_reservation "fp_cmp" 3
+ (and (eq_attr "pipe_model" "sh4_300")
+ (eq_attr "type" "fp_cmp,dfp_cmp"))
+ "sh4_300_issue+sh4_300_fpu")
+
+;; Single precision floating point (FADD,FLOAT,FMAC,FMUL,FSUB,FTRC)
+;; Double-precision floating-point (FADD,FCNVDS,FCNVSD,FLOAT,FSUB,FTRC)
+(define_insn_reservation "fp_arith" 6
+ (and (eq_attr "pipe_model" "sh4_300")
+ (eq_attr "type" "fp,ftrc_s,dfp_arith,dfp_conv"))
+ "sh4_300_issue+sh4_300_fpu")
+
+;; Single Precision FDIV/SQRT
+(define_insn_reservation "fp_div" 19
+ (and (eq_attr "pipe_model" "sh4_300")
+ (eq_attr "type" "fdiv"))
+ "sh4_300_issue+sh4_300_fpu+sh4_300_fds,sh4_300_fds*15")
+
+;; Double-precision floating-point FMUL
+(define_insn_reservation "dfp_mul" 9
+ (and (eq_attr "pipe_model" "sh4_300")
+ (eq_attr "type" "dfp_mul"))
+ "sh4_300_issue+sh4_300_fpu,sh4_300_fpu*3")
+
+;; Double precision FDIV/SQRT
+(define_insn_reservation "dp_div" 35
+ (and (eq_attr "pipe_model" "sh4_300")
+ (eq_attr "type" "dfdiv"))
+ "sh4_300_issue+sh4_300_fpu+sh4_300_fds,sh4_300_fds*31")
+
+
+;; ??? We don't really want these for sh4-300.
+;; This pattern itself is likely to finish in 3 cycles, but it is also
+;; likely to disrupt branch prediction for taken branches for the
+;; following condbranch.
+(define_insn_reservation "sh4_300_arith3" 5
+ (and (eq_attr "pipe_model" "sh4_300")
+ (eq_attr "type" "arith3"))
+ "sh4_300_issue,all*4")
+
+;; arith3b insns without branch redirection make use of the 0-offset 0-latency
+;; branch feature, and thus schedule the same whether or not the branch is
+;; taken.  If the branch is redirected, the taken branch might take longer,
+;; but then, we don't have to take the next branch.
+;; ??? Should we suppress branch redirection for sh4-300 to improve branch
+;; target hit rates?
+(define_insn_reservation "arith3b" 2
+ (and (eq_attr "pipe_model" "sh4")
+ (eq_attr "type" "arith3"))
+ "issue,all")
diff --git a/gcc/config/sh/sh4.md b/gcc/config/sh/sh4.md
index 0937db8e6a3..b390ab99d05 100644
--- a/gcc/config/sh/sh4.md
+++ b/gcc/config/sh/sh4.md
@@ -1,5 +1,5 @@
;; DFA scheduling description for SH4.
-;; Copyright (C) 2004 Free Software Foundation, Inc.
+;; Copyright (C) 2004, 2006 Free Software Foundation, Inc.
;; This file is part of GCC.
@@ -209,9 +209,14 @@
(define_insn_reservation "sh4_store" 1
(and (eq_attr "pipe_model" "sh4")
- (eq_attr "type" "store"))
+ (eq_attr "type" "store,fstore"))
"issue+load_store,nothing,memory")
+(define_insn_reservation "mac_mem" 1
+ (and (eq_attr "pipe_model" "sh4")
+ (eq_attr "type" "mac_mem"))
+ "d_lock,nothing,memory")
+
;; Load Store instructions.
;; Group: LS
;; Latency: 1
@@ -372,35 +377,42 @@
;; Fixed point multiplication (DMULS.L DMULU.L MUL.L MULS.W,MULU.W)
;; Group: CO
;; Latency: 4 / 4
-;; Issue Rate: 1
+;; Issue Rate: 2
(define_insn_reservation "multi" 4
(and (eq_attr "pipe_model" "sh4")
(eq_attr "type" "smpy,dmpy"))
"d_lock,(d_lock+f1_1),(f1_1|f1_2)*3,F2")
-;; Fixed STS from MACL / MACH
+;; Fixed STS from, and LDS to MACL / MACH
;; Group: CO
;; Latency: 3
;; Issue Rate: 1
(define_insn_reservation "sh4_mac_gp" 3
(and (eq_attr "pipe_model" "sh4")
- (eq_attr "type" "mac_gp"))
+ (eq_attr "type" "mac_gp,gp_mac,mem_mac"))
"d_lock")
;; Single precision floating point computation FCMP/EQ,
-;; FCMP/GT, FADD, FLOAT, FMAC, FMUL, FSUB, FTRC, FRVHG, FSCHG
+;; FCMP/GT, FADD, FLOAT, FMAC, FMUL, FSUB, FTRC, FRCHG, FSCHG
;; Group: FE
;; Latency: 3/4
;; Issue Rate: 1
(define_insn_reservation "fp_arith" 3
(and (eq_attr "pipe_model" "sh4")
- (eq_attr "type" "fp"))
+ (eq_attr "type" "fp,fp_cmp"))
"issue,F01,F2")
+;; We don't model the resource usage of this exactly because that would
+;; introduce a bogus latency.
+(define_insn_reservation "sh4_fpscr_toggle" 1
+ (and (eq_attr "pipe_model" "sh4")
+ (eq_attr "type" "fpscr_toggle"))
+ "issue")
+
(define_insn_reservation "fp_arith_ftrc" 3
(and (eq_attr "pipe_model" "sh4")
(eq_attr "type" "ftrc_s"))
@@ -437,7 +449,7 @@
(define_insn_reservation "fp_double_arith" 8
(and (eq_attr "pipe_model" "sh4")
- (eq_attr "type" "dfp_arith"))
+ (eq_attr "type" "dfp_arith,dfp_mul"))
"issue,F01,F1+F2,fpu*4,F2")
;; Double-precision FCMP (FCMP/EQ,FCMP/GT)
diff --git a/gcc/config/sh/sh4a.md b/gcc/config/sh/sh4a.md
index 163a4e10d85..602c6545ae9 100644
--- a/gcc/config/sh/sh4a.md
+++ b/gcc/config/sh/sh4a.md
@@ -1,5 +1,5 @@
;; Scheduling description for Renesas SH4a
-;; Copyright (C) 2003, 2004 Free Software Foundation, Inc.
+;; Copyright (C) 2003, 2004, 2006 Free Software Foundation, Inc.
;;
;; This file is part of GCC.
;;
@@ -98,9 +98,11 @@
;; MOV
;; Group: MT
;; Latency: 0
+;; ??? Not sure whether movi8 belongs here, but that is where it was
+;; effectively before.
(define_insn_reservation "sh4a_mov" 0
(and (eq_attr "cpu" "sh4a")
- (eq_attr "type" "move"))
+ (eq_attr "type" "move,movi8,gp_mac"))
"ID_or")
;; Load
@@ -108,7 +110,7 @@
;; Latency: 3
(define_insn_reservation "sh4a_load" 3
(and (eq_attr "cpu" "sh4a")
- (eq_attr "type" "load,pcload"))
+ (eq_attr "type" "load,pcload,mem_mac"))
"sh4a_ls+sh4a_memory")
(define_insn_reservation "sh4a_load_si" 3
@@ -121,7 +123,7 @@
;; Latency: 0
(define_insn_reservation "sh4a_store" 0
(and (eq_attr "cpu" "sh4a")
- (eq_attr "type" "store"))
+ (eq_attr "type" "store,fstore,mac_mem"))
"sh4a_ls+sh4a_memory")
;; CWB TYPE
@@ -177,7 +179,7 @@
;; Latency: 3
(define_insn_reservation "sh4a_fp_arith" 3
(and (eq_attr "cpu" "sh4a")
- (eq_attr "type" "fp"))
+ (eq_attr "type" "fp,fp_cmp,fpscr_toggle"))
"ID_or,sh4a_fex")
(define_insn_reservation "sh4a_fp_arith_ftrc" 3
@@ -207,7 +209,7 @@
;; Latency: 5
(define_insn_reservation "sh4a_fp_double_arith" 5
(and (eq_attr "cpu" "sh4a")
- (eq_attr "type" "dfp_arith"))
+ (eq_attr "type" "dfp_arith,dfp_mul"))
"ID_or,sh4a_fex*3")
;; Double precision FDIV/SQRT
diff --git a/gcc/config/sh/superh.h b/gcc/config/sh/superh.h
index 49bb6206d43..65154926e33 100644
--- a/gcc/config/sh/superh.h
+++ b/gcc/config/sh/superh.h
@@ -75,17 +75,17 @@ Boston, MA 02110-1301, USA. */
on newlib and provide the runtime support */
#undef SUBTARGET_CPP_SPEC
#define SUBTARGET_CPP_SPEC \
-"-D__EMBEDDED_CROSS__ %{m4-100*:-D__SH4_100__} %{m4-200*:-D__SH4_200__} %{m4-400:-D__SH4_400__} %{m4-500:-D__SH4_500__} \
+"-D__EMBEDDED_CROSS__ %{m4-100*:-D__SH4_100__} %{m4-200*:-D__SH4_200__} %{m4-300*:-D__SH4_300__} %{m4-340:-D__SH4_340__} %{m4-400:-D__SH4_400__} %{m4-500:-D__SH4_500__} \
%(cppruntime)"
/* Override the SUBTARGET_ASM_SPEC to add the runtime support */
#undef SUBTARGET_ASM_SPEC
-#define SUBTARGET_ASM_SPEC "%{m4-100*|m4-200*:-isa=sh4} %{m4-400:-isa=sh4-nommu-nofpu} %{m4-500:-isa=sh4-nofpu} %(asruntime)"
+#define SUBTARGET_ASM_SPEC "%{m4-100*|m4-200*:-isa=sh4} %{m4-400|m4-340:-isa=sh4-nommu-nofpu} %{m4-500:-isa=sh4-nofpu} %(asruntime)"
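+/* For example (illustrative): with the spec above, -m4-340 makes the
+   driver pass -isa=sh4-nommu-nofpu to the assembler, just like -m4-400.  */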
/* Override the SUBTARGET_ASM_RELAX_SPEC so it doesn't interfere with the
runtime support by adding -isa=sh4 in the wrong place. */
#undef SUBTARGET_ASM_RELAX_SPEC
-#define SUBTARGET_ASM_RELAX_SPEC "%{!m4-100*:%{!m4-200*:%{!m4-400:%{!m4-500:-isa=sh4}}}}"
+#define SUBTARGET_ASM_RELAX_SPEC "%{!m4-100*:%{!m4-200*:%{!m4-300*:%{!m4-340:%{!m4-400:%{!m4-500:-isa=sh4}}}}}}"
/* Create the CC1_SPEC to add the runtime support */
#undef CC1_SPEC
@@ -102,7 +102,7 @@ Boston, MA 02110-1301, USA. */
/* Override STARTFILE_SPEC to add profiling and MMU support. */
#undef STARTFILE_SPEC
#define STARTFILE_SPEC \
- "%{!shared: %{!m4-400*: %{pg:gcrt1-mmu.o%s}%{!pg:crt1-mmu.o%s}}} \
- %{!shared: %{m4-400*: %{pg:gcrt1.o%s}%{!pg:crt1.o%s}}} \
+ "%{!shared: %{!m4-400*:%{!m4-340*: %{pg:gcrt1-mmu.o%s}%{!pg:crt1-mmu.o%s}}}} \
+ %{!shared: %{m4-340*|m4-400*: %{pg:gcrt1.o%s}%{!pg:crt1.o%s}}} \
crti.o%s \
%{!shared:crtbegin.o%s} %{shared:crtbeginS.o%s}"
diff --git a/gcc/config/sh/t-sh b/gcc/config/sh/t-sh
index 3ebc09d6e3c..56b6ba1c55a 100644
--- a/gcc/config/sh/t-sh
+++ b/gcc/config/sh/t-sh
@@ -38,11 +38,12 @@ MULTILIB_DIRNAMES=
# is why sh2a and sh2a-single need their own multilibs.
MULTILIB_MATCHES = $(shell \
multilibs="$(MULTILIB_OPTIONS)" ; \
- for abi in m1,m2,m3,m4-nofpu,m4-400,m4-500,m4al,m4a-nofpu m1,m2,m2a-nofpu \
- m2e,m3e,m4-single-only,m4-100-single-only,m4-200-single-only,m4a-single-only \
+ for abi in m1,m2,m3,m4-nofpu,m4-100-nofpu,m4-200-nofpu,m4-400,m4-500,m4-340,m4-300-nofpu,m4al,m4a-nofpu \
+ m1,m2,m2a-nofpu \
+ m2e,m3e,m4-single-only,m4-100-single-only,m4-200-single-only,m4-300-single-only,m4a-single-only \
m2e,m2a-single-only \
- m4-single,m4-100-single,m4-200-single,m4a-single \
- m4,m4-100,m4-200,m4a \
+ m4-single,m4-100-single,m4-200-single,m4-300-single,m4a-single \
+ m4,m4-100,m4-200,m4-300,m4a \
m5-32media,m5-compact,m5-32media \
m5-32media-nofpu,m5-compact-nofpu,m5-32media-nofpu; do \
subst= ; \
@@ -76,7 +77,7 @@ gt-sh.h : s-gtype ; @true
IC_EXTRA_PARTS= libic_invalidate_array_4-100.a libic_invalidate_array_4-200.a \
libic_invalidate_array_4a.a
-OPT_EXTRA_PARTS= libgcc-Os-4-200.a
+OPT_EXTRA_PARTS= libgcc-Os-4-200.a libgcc-4-300.a
EXTRA_MULTILIB_PARTS= $(IC_EXTRA_PARTS) $(OPT_EXTRA_PARTS)
$(T)ic_invalidate_array_4-100.o: $(srcdir)/config/sh/lib1funcs.asm $(GCC_PASSES)
@@ -104,6 +105,12 @@ OBJS_Os_4_200=$(T)sdivsi3_i4i-Os-4-200.o $(T)udivsi3_i4i-Os-4-200.o $(T)unwind-d
$(T)libgcc-Os-4-200.a: $(OBJS_Os_4_200) $(GCC_PASSES)
$(AR_CREATE_FOR_TARGET) $@ $(OBJS_Os_4_200)
+$(T)div_table-4-300.o: $(srcdir)/config/sh/lib1funcs-4-300.asm $(GCC_PASSES)
+ $(GCC_FOR_TARGET) $(MULTILIB_CFLAGS) -c -o $@ -DL_div_table -x assembler-with-cpp $<
+
+$(T)libgcc-4-300.a: $(T)div_table-4-300.o $(GCC_PASSES)
+ $(AR_CREATE_FOR_TARGET) $@ $(T)div_table-4-300.o
+
# Local Variables:
# mode: Makefile
# End: