Diffstat (limited to 'gcc/config/ia64/ia64.c')
-rw-r--r--  gcc/config/ia64/ia64.c | 2665
1 file changed, 1378 insertions, 1287 deletions
diff --git a/gcc/config/ia64/ia64.c b/gcc/config/ia64/ia64.c
index 3009d1c1200..039d11aaac7 100644
--- a/gcc/config/ia64/ia64.c
+++ b/gcc/config/ia64/ia64.c
@@ -47,6 +47,7 @@ Boston, MA 02111-1307, USA. */
#include "target.h"
#include "target-def.h"
#include "tm_p.h"
+#include "hashtab.h"
/* This is used for communication between ASM_OUTPUT_LABEL and
ASM_OUTPUT_LABELREF. */
@@ -103,6 +104,12 @@ int ia64_tls_size = 22;
/* String used with the -mtls-size= option. */
const char *ia64_tls_size_string;
+/* Which cpu we are scheduling for. */
+enum processor_type ia64_tune;
+
+/* String used with the -tune= option. */
+const char *ia64_tune_string;
+
/* Determines whether we run our final scheduling pass or not. We always
avoid the normal second scheduling pass. */
static int ia64_flag_schedule_insns2;
@@ -111,7 +118,19 @@ static int ia64_flag_schedule_insns2;
sections. */
unsigned int ia64_section_threshold;
+
+/* The following variable is used by the DFA insn scheduler. The value is
+ TRUE if we do insn bundling instead of insn scheduling. */
+int bundling_p = 0;
+
+static int ia64_use_dfa_pipeline_interface PARAMS ((void));
+static int ia64_first_cycle_multipass_dfa_lookahead PARAMS ((void));
+static void ia64_dependencies_evaluation_hook PARAMS ((rtx, rtx));
+static void ia64_init_dfa_pre_cycle_insn PARAMS ((void));
+static rtx ia64_dfa_pre_cycle_insn PARAMS ((void));
+static int ia64_first_cycle_multipass_dfa_lookahead_guard PARAMS ((rtx));
+static int ia64_dfa_new_cycle PARAMS ((FILE *, int, rtx, int, int, int *));
static rtx gen_tls_get_addr PARAMS ((void));
static rtx gen_thread_pointer PARAMS ((void));
static int find_gr_spill PARAMS ((int));
@@ -132,6 +151,7 @@ static void fix_range PARAMS ((const char *));
static struct machine_function * ia64_init_machine_status PARAMS ((void));
static void emit_insn_group_barriers PARAMS ((FILE *, rtx));
static void emit_all_insn_group_barriers PARAMS ((FILE *, rtx));
+static void final_emit_insn_group_barriers PARAMS ((FILE *));
static void emit_predicate_relation_info PARAMS ((void));
static bool ia64_in_small_data_p PARAMS ((tree));
static void ia64_encode_section_info PARAMS ((tree, int));
@@ -157,12 +177,31 @@ static int ia64_issue_rate PARAMS ((void));
static int ia64_adjust_cost PARAMS ((rtx, rtx, rtx, int));
static void ia64_sched_init PARAMS ((FILE *, int, int));
static void ia64_sched_finish PARAMS ((FILE *, int));
-static int ia64_internal_sched_reorder PARAMS ((FILE *, int, rtx *,
- int *, int, int));
+static int ia64_dfa_sched_reorder PARAMS ((FILE *, int, rtx *, int *,
+ int, int));
static int ia64_sched_reorder PARAMS ((FILE *, int, rtx *, int *, int));
static int ia64_sched_reorder2 PARAMS ((FILE *, int, rtx *, int *, int));
static int ia64_variable_issue PARAMS ((FILE *, int, rtx, int));
+static struct bundle_state *get_free_bundle_state PARAMS ((void));
+static void free_bundle_state PARAMS ((struct bundle_state *));
+static void initiate_bundle_states PARAMS ((void));
+static void finish_bundle_states PARAMS ((void));
+static unsigned bundle_state_hash PARAMS ((const void *));
+static int bundle_state_eq_p PARAMS ((const void *, const void *));
+static int insert_bundle_state PARAMS ((struct bundle_state *));
+static void initiate_bundle_state_table PARAMS ((void));
+static void finish_bundle_state_table PARAMS ((void));
+static int try_issue_nops PARAMS ((struct bundle_state *, int));
+static int try_issue_insn PARAMS ((struct bundle_state *, rtx));
+static void issue_nops_and_insn PARAMS ((struct bundle_state *, int,
+ rtx, int));
+static int get_max_pos PARAMS ((state_t));
+static int get_template PARAMS ((state_t, int));
+
+static rtx get_next_important_insn PARAMS ((rtx, rtx));
+static void bundling PARAMS ((FILE *, int, rtx, rtx));
+
static void ia64_output_mi_thunk PARAMS ((FILE *, tree, HOST_WIDE_INT,
HOST_WIDE_INT, tree));
@@ -244,6 +283,27 @@ static const struct attribute_spec ia64_attribute_table[] =
#undef TARGET_SCHED_REORDER2
#define TARGET_SCHED_REORDER2 ia64_sched_reorder2
+#undef TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK
+#define TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK ia64_dependencies_evaluation_hook
+
+#undef TARGET_SCHED_USE_DFA_PIPELINE_INTERFACE
+#define TARGET_SCHED_USE_DFA_PIPELINE_INTERFACE ia64_use_dfa_pipeline_interface
+
+#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
+#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD ia64_first_cycle_multipass_dfa_lookahead
+
+#undef TARGET_SCHED_INIT_DFA_PRE_CYCLE_INSN
+#define TARGET_SCHED_INIT_DFA_PRE_CYCLE_INSN ia64_init_dfa_pre_cycle_insn
+#undef TARGET_SCHED_DFA_PRE_CYCLE_INSN
+#define TARGET_SCHED_DFA_PRE_CYCLE_INSN ia64_dfa_pre_cycle_insn
+
+#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
+#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD\
+ ia64_first_cycle_multipass_dfa_lookahead_guard
+
+#undef TARGET_SCHED_DFA_NEW_CYCLE
+#define TARGET_SCHED_DFA_NEW_CYCLE ia64_dfa_new_cycle
+
#ifdef HAVE_AS_TLS
#undef TARGET_HAVE_TLS
#define TARGET_HAVE_TLS true
@@ -4209,6 +4269,23 @@ ia64_init_machine_status ()
void
ia64_override_options ()
{
+ static struct pta
+ {
+ const char *const name; /* processor name or nickname. */
+ const enum processor_type processor;
+ }
+ const processor_alias_table[] =
+ {
+ {"itanium", PROCESSOR_ITANIUM},
+ {"itanium1", PROCESSOR_ITANIUM},
+ {"merced", PROCESSOR_ITANIUM},
+ {"itanium2", PROCESSOR_ITANIUM2},
+ {"mckinley", PROCESSOR_ITANIUM2},
+ };
+
+ int const pta_size = ARRAY_SIZE (processor_alias_table);
+ int i;
+
if (TARGET_AUTO_PIC)
target_flags |= MASK_CONST_GP;
@@ -4237,6 +4314,19 @@ ia64_override_options ()
ia64_tls_size = tmp;
}
+ if (!ia64_tune_string)
+ ia64_tune_string = "itanium2";
+
+ for (i = 0; i < pta_size; i++)
+ if (! strcmp (ia64_tune_string, processor_alias_table[i].name))
+ {
+ ia64_tune = processor_alias_table[i].processor;
+ break;
+ }
+
+ if (i == pta_size)
+ error ("bad value (%s) for -tune= switch", ia64_tune_string);
+
ia64_flag_schedule_insns2 = flag_schedule_insns_after_reload;
flag_schedule_insns_after_reload = 0;
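The -tune= handling added above is a plain linear scan of the alias table. A minimal standalone sketch of the same lookup, showing how e.g. "mckinley" resolves to PROCESSOR_ITANIUM2; the enum members and table entries mirror the hunk, while the driver code around them is illustrative only and not part of the patch:

#include <stdio.h>
#include <string.h>

enum processor_type { PROCESSOR_ITANIUM, PROCESSOR_ITANIUM2 };

static const struct { const char *name; enum processor_type processor; }
  alias_table[] =
{
  {"itanium",  PROCESSOR_ITANIUM},
  {"itanium1", PROCESSOR_ITANIUM},
  {"merced",   PROCESSOR_ITANIUM},
  {"itanium2", PROCESSOR_ITANIUM2},
  {"mckinley", PROCESSOR_ITANIUM2},
};

int
main (void)
{
  const char *tune = "mckinley";   /* value a -tune= switch would supply */
  size_t i, n = sizeof alias_table / sizeof alias_table[0];

  /* Linear scan, exactly like the loop added in the hunk above.  */
  for (i = 0; i < n; i++)
    if (! strcmp (tune, alias_table[i].name))
      break;
  if (i == n)
    printf ("bad value (%s) for -tune= switch\n", tune);
  else
    printf ("tuning for processor %d\n", (int) alias_table[i].processor);
  return 0;
}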
@@ -4249,20 +4339,9 @@ ia64_override_options ()
real_format_for_mode[TFmode - QFmode] = &ieee_extended_intel_128_format;
}
-static enum attr_itanium_requires_unit0 ia64_safe_itanium_requires_unit0 PARAMS((rtx));
static enum attr_itanium_class ia64_safe_itanium_class PARAMS((rtx));
static enum attr_type ia64_safe_type PARAMS((rtx));
-static enum attr_itanium_requires_unit0
-ia64_safe_itanium_requires_unit0 (insn)
- rtx insn;
-{
- if (recog_memoized (insn) >= 0)
- return get_attr_itanium_requires_unit0 (insn);
- else
- return ITANIUM_REQUIRES_UNIT0_NO;
-}
-
static enum attr_itanium_class
ia64_safe_itanium_class (insn)
rtx insn;
@@ -5096,7 +5175,10 @@ group_barrier_needed_p (insn)
abort ();
}
- if (first_instruction)
+ if (first_instruction && INSN_P (insn)
+ && ia64_safe_itanium_class (insn) != ITANIUM_CLASS_IGNORE
+ && GET_CODE (PATTERN (insn)) != USE
+ && GET_CODE (PATTERN (insn)) != CLOBBER)
{
need_barrier = 0;
first_instruction = 0;
@@ -5229,6 +5311,7 @@ emit_all_insn_group_barriers (dump, insns)
}
}
}
+
static int errata_find_address_regs PARAMS ((rtx *, void *));
static void errata_emit_nops PARAMS ((rtx));
@@ -5374,92 +5457,92 @@ fixup_errata ()
}
}
-/* Instruction scheduling support. */
-/* Describe one bundle. */
-struct bundle
-{
- /* Zero if there's no possibility of a stop in this bundle other than
- at the end, otherwise the position of the optional stop bit. */
- int possible_stop;
- /* The types of the three slots. */
- enum attr_type t[3];
- /* The pseudo op to be emitted into the assembler output. */
- const char *name;
-};
+/* Instruction scheduling support. */
#define NR_BUNDLES 10
-/* A list of all available bundles. */
+/* A list of names of all available bundles. */
-static const struct bundle bundle[NR_BUNDLES] =
+static const char *bundle_name [NR_BUNDLES] =
{
- { 2, { TYPE_M, TYPE_I, TYPE_I }, ".mii" },
- { 1, { TYPE_M, TYPE_M, TYPE_I }, ".mmi" },
- { 0, { TYPE_M, TYPE_F, TYPE_I }, ".mfi" },
- { 0, { TYPE_M, TYPE_M, TYPE_F }, ".mmf" },
+ ".mii",
+ ".mmi",
+ ".mfi",
+ ".mmf",
#if NR_BUNDLES == 10
- { 0, { TYPE_B, TYPE_B, TYPE_B }, ".bbb" },
- { 0, { TYPE_M, TYPE_B, TYPE_B }, ".mbb" },
+ ".bbb",
+ ".mbb",
#endif
- { 0, { TYPE_M, TYPE_I, TYPE_B }, ".mib" },
- { 0, { TYPE_M, TYPE_M, TYPE_B }, ".mmb" },
- { 0, { TYPE_M, TYPE_F, TYPE_B }, ".mfb" },
- /* .mfi needs to occur earlier than .mlx, so that we only generate it if
- it matches an L type insn. Otherwise we'll try to generate L type
- nops. */
- { 0, { TYPE_M, TYPE_L, TYPE_X }, ".mlx" }
+ ".mib",
+ ".mmb",
+ ".mfb",
+ ".mlx"
};
-/* Describe a packet of instructions. Packets consist of two bundles that
- are visible to the hardware in one scheduling window. */
+/* Nonzero if we should insert stop bits into the schedule. */
-struct ia64_packet
-{
- const struct bundle *t1, *t2;
- /* Precomputed value of the first split issue in this packet if a cycle
- starts at its beginning. */
- int first_split;
- /* For convenience, the insn types are replicated here so we don't have
- to go through T1 and T2 all the time. */
- enum attr_type t[6];
-};
+int ia64_final_schedule = 0;
-/* An array containing all possible packets. */
-#define NR_PACKETS (NR_BUNDLES * NR_BUNDLES)
-static struct ia64_packet packets[NR_PACKETS];
+/* Codes of the corresponding queried units: */
-/* Map attr_type to a string with the name. */
+static int _0mii_, _0mmi_, _0mfi_, _0mmf_;
+static int _0bbb_, _0mbb_, _0mib_, _0mmb_, _0mfb_, _0mlx_;
-static const char *const type_names[] =
-{
- "UNKNOWN", "A", "I", "M", "F", "B", "L", "X", "S"
-};
+static int _1mii_, _1mmi_, _1mfi_, _1mmf_;
+static int _1bbb_, _1mbb_, _1mib_, _1mmb_, _1mfb_, _1mlx_;
-/* Nonzero if we should insert stop bits into the schedule. */
-int ia64_final_schedule = 0;
+static int pos_1, pos_2, pos_3, pos_4, pos_5, pos_6;
+
+/* The following variable value is an insn group barrier. */
+
+static rtx dfa_stop_insn;
+
+/* The following variable value is the last issued insn. */
+
+static rtx last_scheduled_insn;
+
+/* The following variable value is the size of the DFA state. */
+
+static size_t dfa_state_size;
+
+/* The following variable value is a pointer to a DFA state used as
+ a temporary variable. */
+
+static state_t temp_dfa_state = NULL;
+
+/* The following variable value is DFA state after issuing the last
+ insn. */
+
+static state_t prev_cycle_state = NULL;
+
+/* The following array element values are TRUE if the corresponding
+ insn requires adding stop bits before it. */
+
+static char *stops_p;
+
+/* The following variable is used to set up the array mentioned above. */
+
+static int stop_before_p = 0;
+
+/* The following variable value is the length of the arrays `clocks' and
+ `add_cycles'. */
+
+static int clocks_length;
+
+/* The following array element values are cycles on which the
+ corresponding insn will be issued. The array is used only for
+ Itanium1. */
+
+static int *clocks;
+
+/* The following array element values are the numbers of cycles that should
+ be added to improve insn scheduling of MM_insns for Itanium1. */
+
+static int *add_cycles;
-static int itanium_split_issue PARAMS ((const struct ia64_packet *, int));
static rtx ia64_single_set PARAMS ((rtx));
-static int insn_matches_slot PARAMS ((const struct ia64_packet *, enum attr_type, int, rtx));
static void ia64_emit_insn_before PARAMS ((rtx, rtx));
-static void maybe_rotate PARAMS ((FILE *));
-static void finish_last_head PARAMS ((FILE *, int));
-static void rotate_one_bundle PARAMS ((FILE *));
-static void rotate_two_bundles PARAMS ((FILE *));
-static void nop_cycles_until PARAMS ((int, FILE *));
-static void cycle_end_fill_slots PARAMS ((FILE *));
-static int packet_matches_p PARAMS ((const struct ia64_packet *, int, int *));
-static int get_split PARAMS ((const struct ia64_packet *, int));
-static int find_best_insn PARAMS ((rtx *, enum attr_type *, int,
- const struct ia64_packet *, int));
-static void find_best_packet PARAMS ((int *, const struct ia64_packet **,
- rtx *, enum attr_type *, int));
-static int itanium_reorder PARAMS ((FILE *, rtx *, rtx *, int));
-static void dump_current_packet PARAMS ((FILE *));
-static void schedule_stop PARAMS ((FILE *));
-static rtx gen_nop_type PARAMS ((enum attr_type));
-static void ia64_emit_nops PARAMS ((void));
/* Map a bundle number to its pseudo-op. */
@@ -5467,55 +5550,9 @@ const char *
get_bundle_name (b)
int b;
{
- return bundle[b].name;
+ return bundle_name[b];
}
-/* Compute the slot which will cause a split issue in packet P if the
- current cycle begins at slot BEGIN. */
-
-static int
-itanium_split_issue (p, begin)
- const struct ia64_packet *p;
- int begin;
-{
- int type_count[TYPE_S];
- int i;
- int split = 6;
-
- if (begin < 3)
- {
- /* Always split before and after MMF. */
- if (p->t[0] == TYPE_M && p->t[1] == TYPE_M && p->t[2] == TYPE_F)
- return 3;
- if (p->t[3] == TYPE_M && p->t[4] == TYPE_M && p->t[5] == TYPE_F)
- return 3;
- /* Always split after MBB and BBB. */
- if (p->t[1] == TYPE_B)
- return 3;
- /* Split after first bundle in MIB BBB combination. */
- if (p->t[2] == TYPE_B && p->t[3] == TYPE_B)
- return 3;
- }
-
- memset (type_count, 0, sizeof type_count);
- for (i = begin; i < split; i++)
- {
- enum attr_type t0 = p->t[i];
- /* An MLX bundle reserves the same units as an MFI bundle. */
- enum attr_type t = (t0 == TYPE_L ? TYPE_F
- : t0 == TYPE_X ? TYPE_I
- : t0);
-
- /* Itanium can execute up to 3 branches, 2 floating point, 2 memory, and
- 2 integer per cycle. */
- int max = (t == TYPE_B ? 3 : 2);
- if (type_count[t] == max)
- return i;
-
- type_count[t]++;
- }
- return split;
-}
/* Return the maximum number of instructions a cpu can issue. */
@@ -5563,208 +5600,21 @@ ia64_adjust_cost (insn, link, dep_insn, cost)
rtx insn, link, dep_insn;
int cost;
{
- enum attr_type dep_type;
enum attr_itanium_class dep_class;
enum attr_itanium_class insn_class;
- rtx dep_set, set, src, addr;
-
- if (GET_CODE (PATTERN (insn)) == CLOBBER
- || GET_CODE (PATTERN (insn)) == USE
- || GET_CODE (PATTERN (dep_insn)) == CLOBBER
- || GET_CODE (PATTERN (dep_insn)) == USE
- /* @@@ Not accurate for indirect calls. */
- || GET_CODE (insn) == CALL_INSN
- || ia64_safe_type (insn) == TYPE_S)
- return 0;
- if (REG_NOTE_KIND (link) == REG_DEP_OUTPUT
- || REG_NOTE_KIND (link) == REG_DEP_ANTI)
- return 0;
+ if (REG_NOTE_KIND (link) != REG_DEP_OUTPUT)
+ return cost;
- dep_type = ia64_safe_type (dep_insn);
- dep_class = ia64_safe_itanium_class (dep_insn);
insn_class = ia64_safe_itanium_class (insn);
-
- /* Compares that feed a conditional branch can execute in the same
- cycle. */
- dep_set = ia64_single_set (dep_insn);
- set = ia64_single_set (insn);
-
- if (dep_type != TYPE_F
- && dep_set
- && GET_CODE (SET_DEST (dep_set)) == REG
- && PR_REG (REGNO (SET_DEST (dep_set)))
- && GET_CODE (insn) == JUMP_INSN)
+ dep_class = ia64_safe_itanium_class (dep_insn);
+ if (dep_class == ITANIUM_CLASS_ST || dep_class == ITANIUM_CLASS_STF
+ || insn_class == ITANIUM_CLASS_ST || insn_class == ITANIUM_CLASS_STF)
return 0;
- if (dep_set && GET_CODE (SET_DEST (dep_set)) == MEM)
- {
- /* ??? Can't find any information in the documenation about whether
- a sequence
- st [rx] = ra
- ld rb = [ry]
- splits issue. Assume it doesn't. */
- return 0;
- }
-
- src = set ? SET_SRC (set) : 0;
- addr = 0;
- if (set)
- {
- if (GET_CODE (SET_DEST (set)) == MEM)
- addr = XEXP (SET_DEST (set), 0);
- else if (GET_CODE (SET_DEST (set)) == SUBREG
- && GET_CODE (SUBREG_REG (SET_DEST (set))) == MEM)
- addr = XEXP (SUBREG_REG (SET_DEST (set)), 0);
- else
- {
- addr = src;
- if (GET_CODE (addr) == UNSPEC && XVECLEN (addr, 0) > 0)
- addr = XVECEXP (addr, 0, 0);
- while (GET_CODE (addr) == SUBREG || GET_CODE (addr) == ZERO_EXTEND)
- addr = XEXP (addr, 0);
- if (GET_CODE (addr) == MEM)
- addr = XEXP (addr, 0);
- else
- addr = 0;
- }
- }
-
- if (addr && GET_CODE (addr) == POST_MODIFY)
- addr = XEXP (addr, 0);
-
- set = ia64_single_set (dep_insn);
-
- if ((dep_class == ITANIUM_CLASS_IALU
- || dep_class == ITANIUM_CLASS_ILOG
- || dep_class == ITANIUM_CLASS_LD)
- && (insn_class == ITANIUM_CLASS_LD
- || insn_class == ITANIUM_CLASS_ST))
- {
- if (! addr || ! set)
- abort ();
- /* This isn't completely correct - an IALU that feeds an address has
- a latency of 1 cycle if it's issued in an M slot, but 2 cycles
- otherwise. Unfortunately there's no good way to describe this. */
- if (reg_overlap_mentioned_p (SET_DEST (set), addr))
- return cost + 1;
- }
-
- if ((dep_class == ITANIUM_CLASS_IALU
- || dep_class == ITANIUM_CLASS_ILOG
- || dep_class == ITANIUM_CLASS_LD)
- && (insn_class == ITANIUM_CLASS_MMMUL
- || insn_class == ITANIUM_CLASS_MMSHF
- || insn_class == ITANIUM_CLASS_MMSHFI))
- return 3;
-
- if (dep_class == ITANIUM_CLASS_FMAC
- && (insn_class == ITANIUM_CLASS_FMISC
- || insn_class == ITANIUM_CLASS_FCVTFX
- || insn_class == ITANIUM_CLASS_XMPY))
- return 7;
-
- if ((dep_class == ITANIUM_CLASS_FMAC
- || dep_class == ITANIUM_CLASS_FMISC
- || dep_class == ITANIUM_CLASS_FCVTFX
- || dep_class == ITANIUM_CLASS_XMPY)
- && insn_class == ITANIUM_CLASS_STF)
- return 8;
-
- /* Intel docs say only LD, ST, IALU, ILOG, ISHF consumers have latency 4,
- but HP engineers say any non-MM operation. */
- if ((dep_class == ITANIUM_CLASS_MMMUL
- || dep_class == ITANIUM_CLASS_MMSHF
- || dep_class == ITANIUM_CLASS_MMSHFI)
- && insn_class != ITANIUM_CLASS_MMMUL
- && insn_class != ITANIUM_CLASS_MMSHF
- && insn_class != ITANIUM_CLASS_MMSHFI)
- return 4;
-
return cost;
}
-/* Describe the current state of the Itanium pipeline. */
-static struct
-{
- /* The first slot that is used in the current cycle. */
- int first_slot;
- /* The next slot to fill. */
- int cur;
- /* The packet we have selected for the current issue window. */
- const struct ia64_packet *packet;
- /* The position of the split issue that occurs due to issue width
- limitations (6 if there's no split issue). */
- int split;
- /* Record data about the insns scheduled so far in the same issue
- window. The elements up to but not including FIRST_SLOT belong
- to the previous cycle, the ones starting with FIRST_SLOT belong
- to the current cycle. */
- enum attr_type types[6];
- rtx insns[6];
- int stopbit[6];
- /* Nonzero if we decided to schedule a stop bit. */
- int last_was_stop;
-} sched_data;
-
-/* Temporary arrays; they have enough elements to hold all insns that
- can be ready at the same time while scheduling of the current block.
- SCHED_READY can hold ready insns, SCHED_TYPES their types. */
-static rtx *sched_ready;
-static enum attr_type *sched_types;
-
-/* Determine whether an insn INSN of type ITYPE can fit into slot SLOT
- of packet P. */
-
-static int
-insn_matches_slot (p, itype, slot, insn)
- const struct ia64_packet *p;
- enum attr_type itype;
- int slot;
- rtx insn;
-{
- enum attr_itanium_requires_unit0 u0;
- enum attr_type stype = p->t[slot];
-
- if (insn)
- {
- u0 = ia64_safe_itanium_requires_unit0 (insn);
- if (u0 == ITANIUM_REQUIRES_UNIT0_YES)
- {
- int i;
- for (i = sched_data.first_slot; i < slot; i++)
- if (p->t[i] == stype
- || (stype == TYPE_F && p->t[i] == TYPE_L)
- || (stype == TYPE_I && p->t[i] == TYPE_X))
- return 0;
- }
- if (GET_CODE (insn) == CALL_INSN)
- {
- /* Reject calls in multiway branch packets. We want to limit
- the number of multiway branches we generate (since the branch
- predictor is limited), and this seems to work fairly well.
- (If we didn't do this, we'd have to add another test here to
- force calls into the third slot of the bundle.) */
- if (slot < 3)
- {
- if (p->t[1] == TYPE_B)
- return 0;
- }
- else
- {
- if (p->t[4] == TYPE_B)
- return 0;
- }
- }
- }
-
- if (itype == stype)
- return 1;
- if (itype == TYPE_A)
- return stype == TYPE_M || stype == TYPE_I;
- return 0;
-}
-
/* Like emit_insn_before, but skip cycle_display notes.
??? When cycle display notes are implemented, update this. */
@@ -5775,1055 +5625,1324 @@ ia64_emit_insn_before (insn, before)
emit_insn_before (insn, before);
}
-/* When rotating a bundle out of the issue window, insert a bundle selector
- insn in front of it. DUMP is the scheduling dump file or NULL. START
- is either 0 or 3, depending on whether we want to emit a bundle selector
- for the first bundle or the second bundle in the current issue window.
-
- The selector insns are emitted this late because the selected packet can
- be changed until parts of it get rotated out. */
+/* The following function marks insns that produce addresses for load
+ and store insns. Such insns will be placed into M slots because that
+ decreases latency for Itanium1 (see function
+ `ia64_produce_address_p' and the DFA descriptions). */
static void
-finish_last_head (dump, start)
- FILE *dump;
- int start;
+ia64_dependencies_evaluation_hook (head, tail)
+ rtx head, tail;
{
- const struct ia64_packet *p = sched_data.packet;
- const struct bundle *b = start == 0 ? p->t1 : p->t2;
- int bundle_type = b - bundle;
- rtx insn;
- int i;
-
- if (! ia64_final_schedule)
- return;
-
- for (i = start; sched_data.insns[i] == 0; i++)
- if (i == start + 3)
- abort ();
- insn = sched_data.insns[i];
+ rtx insn, link, next, next_tail;
+
+ next_tail = NEXT_INSN (tail);
+ for (insn = head; insn != next_tail; insn = NEXT_INSN (insn))
+ if (INSN_P (insn))
+ insn->call = 0;
+ for (insn = head; insn != next_tail; insn = NEXT_INSN (insn))
+ if (INSN_P (insn)
+ && ia64_safe_itanium_class (insn) == ITANIUM_CLASS_IALU)
+ {
+ for (link = INSN_DEPEND (insn); link != 0; link = XEXP (link, 1))
+ {
+ next = XEXP (link, 0);
+ if ((ia64_safe_itanium_class (next) == ITANIUM_CLASS_ST
+ || ia64_safe_itanium_class (next) == ITANIUM_CLASS_STF)
+ && ia64_st_address_bypass_p (insn, next))
+ break;
+ else if ((ia64_safe_itanium_class (next) == ITANIUM_CLASS_LD
+ || ia64_safe_itanium_class (next)
+ == ITANIUM_CLASS_FLD)
+ && ia64_ld_address_bypass_p (insn, next))
+ break;
+ }
+ insn->call = link != 0;
+ }
+}
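The hook above records, for each IALU insn, whether some consumer is a store or load that can take the corresponding address bypass, reusing the insn's `call' bit as the flag. A sketch of how the predicate named in the comment could read that flag back; `ia64_produce_address_p' itself is not part of this hunk, so this body is only an assumption for illustration:

static int
ia64_produce_address_p (insn)
     rtx insn;
{
  /* The dependencies evaluation hook above stored the "feeds an
     address used by a load/store" property in the otherwise unused
     `call' bit of the insn.  */
  return insn->call;
}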
- if (dump)
- fprintf (dump, "// Emitting template before %d: %s\n",
- INSN_UID (insn), b->name);
+/* We're beginning a new block. Initialize data structures as necessary. */
- ia64_emit_insn_before (gen_bundle_selector (GEN_INT (bundle_type)), insn);
+static void
+ia64_sched_init (dump, sched_verbose, max_ready)
+ FILE *dump ATTRIBUTE_UNUSED;
+ int sched_verbose ATTRIBUTE_UNUSED;
+ int max_ready ATTRIBUTE_UNUSED;
+{
+#ifdef ENABLE_CHECKING
+ rtx insn;
+
+ if (reload_completed)
+ for (insn = NEXT_INSN (current_sched_info->prev_head);
+ insn != current_sched_info->next_tail;
+ insn = NEXT_INSN (insn))
+ if (SCHED_GROUP_P (insn))
+ abort ();
+#endif
+ last_scheduled_insn = NULL_RTX;
+ init_insn_group_barriers ();
}
-/* We can't schedule more insns this cycle. Fix up the scheduling state
- and advance FIRST_SLOT and CUR.
- We have to distribute the insns that are currently found between
- FIRST_SLOT and CUR into the slots of the packet we have selected. So
- far, they are stored successively in the fields starting at FIRST_SLOT;
- now they must be moved to the correct slots.
- DUMP is the current scheduling dump file, or NULL. */
+/* We are about to begin issuing insns for this clock cycle.
+ Override the default sort algorithm to better slot instructions. */
-static void
-cycle_end_fill_slots (dump)
+static int
+ia64_dfa_sched_reorder (dump, sched_verbose, ready, pn_ready,
+ clock_var, reorder_type)
FILE *dump;
+ int sched_verbose;
+ rtx *ready;
+ int *pn_ready;
+ int clock_var ATTRIBUTE_UNUSED;
+ int reorder_type;
{
- const struct ia64_packet *packet = sched_data.packet;
- int slot, i;
- enum attr_type tmp_types[6];
- rtx tmp_insns[6];
+ int n_asms;
+ int n_ready = *pn_ready;
+ rtx *e_ready = ready + n_ready;
+ rtx *insnp;
- memcpy (tmp_types, sched_data.types, 6 * sizeof (enum attr_type));
- memcpy (tmp_insns, sched_data.insns, 6 * sizeof (rtx));
+ if (sched_verbose)
+ fprintf (dump, "// ia64_dfa_sched_reorder (type %d):\n", reorder_type);
- for (i = slot = sched_data.first_slot; i < sched_data.cur; i++)
+ if (reorder_type == 0)
{
- enum attr_type t = tmp_types[i];
- if (t != ia64_safe_type (tmp_insns[i]))
- abort ();
- while (! insn_matches_slot (packet, t, slot, tmp_insns[i]))
- {
- if (slot > sched_data.split)
- abort ();
- if (dump)
- fprintf (dump, "// Packet needs %s, have %s\n",
- type_names[packet->t[slot]], type_names[t]);
- sched_data.types[slot] = packet->t[slot];
- sched_data.insns[slot] = 0;
- sched_data.stopbit[slot] = 0;
-
- /* ??? TYPE_L instructions always fill up two slots, but we don't
- support TYPE_L nops. */
- if (packet->t[slot] == TYPE_L)
- abort ();
-
- slot++;
- }
-
- /* Do _not_ use T here. If T == TYPE_A, then we'd risk changing the
- actual slot type later. */
- sched_data.types[slot] = packet->t[slot];
- sched_data.insns[slot] = tmp_insns[i];
- sched_data.stopbit[slot] = 0;
- slot++;
+ /* First, move all USEs, CLOBBERs and other crud out of the way. */
+ n_asms = 0;
+ for (insnp = ready; insnp < e_ready; insnp++)
+ if (insnp < e_ready)
+ {
+ rtx insn = *insnp;
+ enum attr_type t = ia64_safe_type (insn);
+ if (t == TYPE_UNKNOWN)
+ {
+ if (GET_CODE (PATTERN (insn)) == ASM_INPUT
+ || asm_noperands (PATTERN (insn)) >= 0)
+ {
+ rtx lowest = ready[n_asms];
+ ready[n_asms] = insn;
+ *insnp = lowest;
+ n_asms++;
+ }
+ else
+ {
+ rtx highest = ready[n_ready - 1];
+ ready[n_ready - 1] = insn;
+ *insnp = highest;
+ return 1;
+ }
+ }
+ }
- /* TYPE_L instructions always fill up two slots. */
- if (t == TYPE_L)
+ if (n_asms < n_ready)
{
- sched_data.types[slot] = packet->t[slot];
- sched_data.insns[slot] = 0;
- sched_data.stopbit[slot] = 0;
- slot++;
+ /* Some normal insns to process. Skip the asms. */
+ ready += n_asms;
+ n_ready -= n_asms;
}
+ else if (n_ready > 0)
+ return 1;
}
- /* This isn't right - there's no need to pad out until the forced split;
- the CPU will automatically split if an insn isn't ready. */
-#if 0
- while (slot < sched_data.split)
+ if (ia64_final_schedule)
{
- sched_data.types[slot] = packet->t[slot];
- sched_data.insns[slot] = 0;
- sched_data.stopbit[slot] = 0;
- slot++;
+ int deleted = 0;
+ int nr_need_stop = 0;
+
+ for (insnp = ready; insnp < e_ready; insnp++)
+ if (safe_group_barrier_needed_p (*insnp))
+ nr_need_stop++;
+
+ if (reorder_type == 1 && n_ready == nr_need_stop)
+ return 0;
+ if (reorder_type == 0)
+ return 1;
+ insnp = e_ready;
+ /* Move down everything that needs a stop bit, preserving
+ relative order. */
+ while (insnp-- > ready + deleted)
+ while (insnp >= ready + deleted)
+ {
+ rtx insn = *insnp;
+ if (! safe_group_barrier_needed_p (insn))
+ break;
+ memmove (ready + 1, ready, (insnp - ready) * sizeof (rtx));
+ *ready = insn;
+ deleted++;
+ }
+ n_ready -= deleted;
+ ready += deleted;
}
-#endif
- sched_data.first_slot = sched_data.cur = slot;
+ return 1;
}
-/* Bundle rotations, as described in the Itanium optimization manual.
- We can rotate either one or both bundles out of the issue window.
- DUMP is the current scheduling dump file, or NULL. */
-
-static void
-rotate_one_bundle (dump)
- FILE *dump;
-{
- if (dump)
- fprintf (dump, "// Rotating one bundle.\n");
-
- finish_last_head (dump, 0);
- if (sched_data.cur > 3)
- {
- sched_data.cur -= 3;
- sched_data.first_slot -= 3;
- memmove (sched_data.types,
- sched_data.types + 3,
- sched_data.cur * sizeof *sched_data.types);
- memmove (sched_data.stopbit,
- sched_data.stopbit + 3,
- sched_data.cur * sizeof *sched_data.stopbit);
- memmove (sched_data.insns,
- sched_data.insns + 3,
- sched_data.cur * sizeof *sched_data.insns);
- sched_data.packet
- = &packets[(sched_data.packet->t2 - bundle) * NR_BUNDLES];
- }
- else
- {
- sched_data.cur = 0;
- sched_data.first_slot = 0;
- }
-}
+/* We are about to begin issuing insns for this clock cycle. Override
+ the default sort algorithm to better slot instructions. */
-static void
-rotate_two_bundles (dump)
+static int
+ia64_sched_reorder (dump, sched_verbose, ready, pn_ready, clock_var)
FILE *dump;
+ int sched_verbose;
+ rtx *ready;
+ int *pn_ready;
+ int clock_var;
{
- if (dump)
- fprintf (dump, "// Rotating two bundles.\n");
-
- if (sched_data.cur == 0)
- return;
-
- finish_last_head (dump, 0);
- if (sched_data.cur > 3)
- finish_last_head (dump, 3);
- sched_data.cur = 0;
- sched_data.first_slot = 0;
+ return ia64_dfa_sched_reorder (dump, sched_verbose, ready,
+ pn_ready, clock_var, 0);
}
-/* We're beginning a new block. Initialize data structures as necessary. */
+/* Like ia64_sched_reorder, but called after issuing each insn.
+ Override the default sort algorithm to better slot instructions. */
-static void
-ia64_sched_init (dump, sched_verbose, max_ready)
+static int
+ia64_sched_reorder2 (dump, sched_verbose, ready, pn_ready, clock_var)
FILE *dump ATTRIBUTE_UNUSED;
int sched_verbose ATTRIBUTE_UNUSED;
- int max_ready;
+ rtx *ready;
+ int *pn_ready;
+ int clock_var;
{
- static int initialized = 0;
-
- if (! initialized)
- {
- int b1, b2, i;
-
- initialized = 1;
-
- for (i = b1 = 0; b1 < NR_BUNDLES; b1++)
- {
- const struct bundle *t1 = bundle + b1;
- for (b2 = 0; b2 < NR_BUNDLES; b2++, i++)
- {
- const struct bundle *t2 = bundle + b2;
-
- packets[i].t1 = t1;
- packets[i].t2 = t2;
- }
- }
- for (i = 0; i < NR_PACKETS; i++)
- {
- int j;
- for (j = 0; j < 3; j++)
- packets[i].t[j] = packets[i].t1->t[j];
- for (j = 0; j < 3; j++)
- packets[i].t[j + 3] = packets[i].t2->t[j];
- packets[i].first_split = itanium_split_issue (packets + i, 0);
- }
-
- }
-
- init_insn_group_barriers ();
-
- memset (&sched_data, 0, sizeof sched_data);
- sched_types = (enum attr_type *) xmalloc (max_ready
- * sizeof (enum attr_type));
- sched_ready = (rtx *) xmalloc (max_ready * sizeof (rtx));
+ if (ia64_tune == PROCESSOR_ITANIUM && reload_completed && last_scheduled_insn)
+ clocks [INSN_UID (last_scheduled_insn)] = clock_var;
+ return ia64_dfa_sched_reorder (dump, sched_verbose, ready, pn_ready,
+ clock_var, 1);
}
-/* See if the packet P can match the insns we have already scheduled. Return
- nonzero if so. In *PSLOT, we store the first slot that is available for
- more instructions if we choose this packet.
- SPLIT holds the last slot we can use, there's a split issue after it so
- scheduling beyond it would cause us to use more than one cycle. */
+/* We are about to issue INSN. Return the number of insns left on the
+ ready queue that can be issued this cycle. */
static int
-packet_matches_p (p, split, pslot)
- const struct ia64_packet *p;
- int split;
- int *pslot;
-{
- int filled = sched_data.cur;
- int first = sched_data.first_slot;
- int i, slot;
-
- /* First, check if the first of the two bundles must be a specific one (due
- to stop bits). */
- if (first > 0 && sched_data.stopbit[0] && p->t1->possible_stop != 1)
- return 0;
- if (first > 1 && sched_data.stopbit[1] && p->t1->possible_stop != 2)
- return 0;
-
- for (i = 0; i < first; i++)
- if (! insn_matches_slot (p, sched_data.types[i], i,
- sched_data.insns[i]))
- return 0;
- for (i = slot = first; i < filled; i++)
+ia64_variable_issue (dump, sched_verbose, insn, can_issue_more)
+ FILE *dump ATTRIBUTE_UNUSED;
+ int sched_verbose ATTRIBUTE_UNUSED;
+ rtx insn ATTRIBUTE_UNUSED;
+ int can_issue_more ATTRIBUTE_UNUSED;
+{
+ last_scheduled_insn = insn;
+ memcpy (prev_cycle_state, curr_state, dfa_state_size);
+ if (reload_completed)
{
- while (slot < split)
- {
- if (insn_matches_slot (p, sched_data.types[i], slot,
- sched_data.insns[i]))
- break;
- slot++;
- }
- if (slot == split)
- return 0;
- slot++;
+ if (group_barrier_needed_p (insn))
+ abort ();
+ if (GET_CODE (insn) == CALL_INSN)
+ init_insn_group_barriers ();
+ stops_p [INSN_UID (insn)] = stop_before_p;
+ stop_before_p = 0;
}
-
- if (pslot)
- *pslot = slot;
return 1;
}
-/* A frontend for itanium_split_issue. For a packet P and a slot
- number FIRST that describes the start of the current clock cycle,
- return the slot number of the first split issue. This function
- uses the cached number found in P if possible. */
+/* We are choosing insn from the ready queue. Return nonzero if INSN
+ can be chosen. */
static int
-get_split (p, first)
- const struct ia64_packet *p;
- int first;
+ia64_first_cycle_multipass_dfa_lookahead_guard (insn)
+ rtx insn;
{
- if (first == 0)
- return p->first_split;
- return itanium_split_issue (p, first);
+ if (insn == NULL_RTX || !INSN_P (insn))
+ abort ();
+ return (!reload_completed
+ || !safe_group_barrier_needed_p (insn));
}
-/* Given N_READY insns in the array READY, whose types are found in the
- corresponding array TYPES, return the insn that is best suited to be
- scheduled in slot SLOT of packet P. */
+/* The following variable value is pseudo-insn used by the DFA insn
+ scheduler to change the DFA state when the simulated clock is
+ increased. */
+
+static rtx dfa_pre_cycle_insn;
+
+/* We are about to begin issuing INSN. Return nonzero if we cannot
+ issue it on the given cycle CLOCK and return zero if we should not sort
+ the ready queue on the next clock start. */
static int
-find_best_insn (ready, types, n_ready, p, slot)
- rtx *ready;
- enum attr_type *types;
- int n_ready;
- const struct ia64_packet *p;
- int slot;
+ia64_dfa_new_cycle (dump, verbose, insn, last_clock, clock, sort_p)
+ FILE *dump;
+ int verbose;
+ rtx insn;
+ int last_clock, clock;
+ int *sort_p;
{
- int best = -1;
- int best_pri = 0;
- while (n_ready-- > 0)
+ int setup_clocks_p = FALSE;
+
+ if (insn == NULL_RTX || !INSN_P (insn))
+ abort ();
+ if ((reload_completed && safe_group_barrier_needed_p (insn))
+ || (last_scheduled_insn
+ && (GET_CODE (last_scheduled_insn) == CALL_INSN
+ || GET_CODE (PATTERN (last_scheduled_insn)) == ASM_INPUT
+ || asm_noperands (PATTERN (last_scheduled_insn)) >= 0)))
{
- rtx insn = ready[n_ready];
- if (! insn)
- continue;
- if (best >= 0 && INSN_PRIORITY (ready[n_ready]) < best_pri)
- break;
- /* If we have equally good insns, one of which has a stricter
- slot requirement, prefer the one with the stricter requirement. */
- if (best >= 0 && types[n_ready] == TYPE_A)
- continue;
- if (insn_matches_slot (p, types[n_ready], slot, insn))
+ init_insn_group_barriers ();
+ if (verbose && dump)
+ fprintf (dump, "// Stop should be before %d%s\n", INSN_UID (insn),
+ last_clock == clock ? " + cycle advance" : "");
+ stop_before_p = 1;
+ if (last_clock == clock)
{
- best = n_ready;
- best_pri = INSN_PRIORITY (ready[best]);
-
- /* If there's no way we could get a stricter requirement, stop
- looking now. */
- if (types[n_ready] != TYPE_A
- && ia64_safe_itanium_requires_unit0 (ready[n_ready]))
- break;
- break;
+ state_transition (curr_state, dfa_stop_insn);
+ if (TARGET_EARLY_STOP_BITS)
+ *sort_p = (last_scheduled_insn == NULL_RTX
+ || GET_CODE (last_scheduled_insn) != CALL_INSN);
+ else
+ *sort_p = 0;
+ return 1;
+ }
+ else if (reload_completed)
+ setup_clocks_p = TRUE;
+ memcpy (curr_state, prev_cycle_state, dfa_state_size);
+ state_transition (curr_state, dfa_stop_insn);
+ state_transition (curr_state, dfa_pre_cycle_insn);
+ state_transition (curr_state, NULL);
+ }
+ else if (reload_completed)
+ setup_clocks_p = TRUE;
+ if (setup_clocks_p && ia64_tune == PROCESSOR_ITANIUM)
+ {
+ enum attr_itanium_class c = ia64_safe_itanium_class (insn);
+
+ if (c != ITANIUM_CLASS_MMMUL && c != ITANIUM_CLASS_MMSHF)
+ {
+ rtx link;
+ int d = -1;
+
+ for (link = LOG_LINKS (insn); link; link = XEXP (link, 1))
+ if (REG_NOTE_KIND (link) == 0)
+ {
+ enum attr_itanium_class dep_class;
+ rtx dep_insn = XEXP (link, 0);
+
+ dep_class = ia64_safe_itanium_class (dep_insn);
+ if ((dep_class == ITANIUM_CLASS_MMMUL
+ || dep_class == ITANIUM_CLASS_MMSHF)
+ && last_clock - clocks [INSN_UID (dep_insn)] < 4
+ && (d < 0
+ || last_clock - clocks [INSN_UID (dep_insn)] < d))
+ d = last_clock - clocks [INSN_UID (dep_insn)];
+ }
+ if (d >= 0)
+ add_cycles [INSN_UID (insn)] = 3 - d;
}
}
- return best;
+ return 0;
}
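A worked instance of the clock bookkeeping above: suppose an MMSHF producer was issued at cycle 10 (so clocks[INSN_UID (producer)] == 10) and a dependent non-MM consumer is being considered at last_clock == 12. Then d = 12 - 10 = 2, which is within the window the code checks for, so add_cycles[INSN_UID (insn)] is set to 3 - d = 1, i.e. one extra cycle has to be accounted for that consumer later on; this is how the Itanium1 penalty for using an MM result too early is modelled.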
-/* Select the best packet to use given the current scheduler state and the
- current ready list.
- READY is an array holding N_READY ready insns; TYPES is a corresponding
- array that holds their types. Store the best packet in *PPACKET and the
- number of insns that can be scheduled in the current cycle in *PBEST. */
+
-static void
-find_best_packet (pbest, ppacket, ready, types, n_ready)
- int *pbest;
- const struct ia64_packet **ppacket;
- rtx *ready;
- enum attr_type *types;
- int n_ready;
-{
- int first = sched_data.first_slot;
- int best = 0;
- int lowest_end = 6;
- const struct ia64_packet *best_packet = NULL;
- int i;
+/* The following page contains abstract data `bundle states' which are
+ used for bundling insns (inserting nops and template generation). */
+
+/* The following describes state of insn bundling. */
+
+struct bundle_state
+{
+ /* Unique bundle state number to identify them in the debugging
+ output */
+ int unique_num;
+ rtx insn; /* corresponding insn, NULL for the 1st and the last state */
+ /* number of nops before and after the insn */
+ short before_nops_num, after_nops_num;
+ int insn_num; /* insn number (0 - for initial state, 1 - for the 1st
+ insn) */
+ int cost; /* cost of the state in cycles */
+ int accumulated_insns_num; /* number of all previous insns including
+ nops. L is considered as 2 insns */
+ int branch_deviation; /* deviation of previous branches from 3rd slots */
+ struct bundle_state *next; /* next state with the same insn_num */
+ struct bundle_state *originator; /* originator (previous insn state) */
+ /* All bundle states are in the following chain. */
+ struct bundle_state *allocated_states_chain;
+ /* The DFA State after issuing the insn and the nops. */
+ state_t dfa_state;
+};
- for (i = 0; i < NR_PACKETS; i++)
- {
- const struct ia64_packet *p = packets + i;
- int slot;
- int split = get_split (p, first);
- int win = 0;
- int first_slot, last_slot;
- int b_nops = 0;
+/* The following maps an insn number to the corresponding bundle state. */
- if (! packet_matches_p (p, split, &first_slot))
- continue;
+static struct bundle_state **index_to_bundle_states;
- memcpy (sched_ready, ready, n_ready * sizeof (rtx));
+/* The unique number of next bundle state. */
- win = 0;
- last_slot = 6;
- for (slot = first_slot; slot < split; slot++)
- {
- int insn_nr;
+static int bundle_states_num;
- /* Disallow a degenerate case where the first bundle doesn't
- contain anything but NOPs! */
- if (first_slot == 0 && win == 0 && slot == 3)
- {
- win = -1;
- break;
- }
+/* All allocated bundle states are in the following chain. */
- insn_nr = find_best_insn (sched_ready, types, n_ready, p, slot);
- if (insn_nr >= 0)
- {
- sched_ready[insn_nr] = 0;
- last_slot = slot;
- win++;
- }
- else if (p->t[slot] == TYPE_B)
- b_nops++;
- }
- /* We must disallow MBB/BBB packets if any of their B slots would be
- filled with nops. */
- if (last_slot < 3)
- {
- if (p->t[1] == TYPE_B && (b_nops || last_slot < 2))
- win = -1;
- }
- else
- {
- if (p->t[4] == TYPE_B && (b_nops || last_slot < 5))
- win = -1;
- }
+static struct bundle_state *allocated_bundle_states_chain;
- if (win > best
- || (win == best && last_slot < lowest_end))
- {
- best = win;
- lowest_end = last_slot;
- best_packet = p;
- }
- }
- *pbest = best;
- *ppacket = best_packet;
-}
+/* All allocated but not used bundle states are in the following
+ chain. */
-/* Reorder the ready list so that the insns that can be issued in this cycle
- are found in the correct order at the end of the list.
- DUMP is the scheduling dump file, or NULL. READY points to the start,
- E_READY to the end of the ready list. MAY_FAIL determines what should be
- done if no insns can be scheduled in this cycle: if it is zero, we abort,
- otherwise we return 0.
- Return 1 if any insns can be scheduled in this cycle. */
+static struct bundle_state *free_bundle_state_chain;
-static int
-itanium_reorder (dump, ready, e_ready, may_fail)
- FILE *dump;
- rtx *ready;
- rtx *e_ready;
- int may_fail;
-{
- const struct ia64_packet *best_packet;
- int n_ready = e_ready - ready;
- int first = sched_data.first_slot;
- int i, best, best_split, filled;
- for (i = 0; i < n_ready; i++)
- sched_types[i] = ia64_safe_type (ready[i]);
+/* The following function returns a free bundle state. */
- find_best_packet (&best, &best_packet, ready, sched_types, n_ready);
+static struct bundle_state *
+get_free_bundle_state ()
+{
+ struct bundle_state *result;
- if (best == 0)
+ if (free_bundle_state_chain != NULL)
{
- if (may_fail)
- return 0;
- abort ();
+ result = free_bundle_state_chain;
+ free_bundle_state_chain = result->next;
}
-
- if (dump)
+ else
{
- fprintf (dump, "// Selected bundles: %s %s (%d insns)\n",
- best_packet->t1->name,
- best_packet->t2 ? best_packet->t2->name : NULL, best);
+ result = xmalloc (sizeof (struct bundle_state));
+ result->dfa_state = xmalloc (dfa_state_size);
+ result->allocated_states_chain = allocated_bundle_states_chain;
+ allocated_bundle_states_chain = result;
}
+ result->unique_num = bundle_states_num++;
+ return result;
+
+}
- best_split = itanium_split_issue (best_packet, first);
- packet_matches_p (best_packet, best_split, &filled);
+/* The following function frees given bundle state. */
- for (i = filled; i < best_split; i++)
- {
- int insn_nr;
+static void
+free_bundle_state (state)
+ struct bundle_state *state;
+{
+ state->next = free_bundle_state_chain;
+ free_bundle_state_chain = state;
+}
- insn_nr = find_best_insn (ready, sched_types, n_ready, best_packet, i);
- if (insn_nr >= 0)
- {
- rtx insn = ready[insn_nr];
- memmove (ready + insn_nr, ready + insn_nr + 1,
- (n_ready - insn_nr - 1) * sizeof (rtx));
- memmove (sched_types + insn_nr, sched_types + insn_nr + 1,
- (n_ready - insn_nr - 1) * sizeof (enum attr_type));
- ready[--n_ready] = insn;
- }
- }
+/* Start work with abstract data `bundle states'. */
- sched_data.packet = best_packet;
- sched_data.split = best_split;
- return 1;
+static void
+initiate_bundle_states ()
+{
+ bundle_states_num = 0;
+ free_bundle_state_chain = NULL;
+ allocated_bundle_states_chain = NULL;
}
-/* Dump information about the current scheduling state to file DUMP. */
+/* Finish work with abstract data `bundle states'. */
static void
-dump_current_packet (dump)
- FILE *dump;
+finish_bundle_states ()
{
- int i;
- fprintf (dump, "// %d slots filled:", sched_data.cur);
- for (i = 0; i < sched_data.first_slot; i++)
- {
- rtx insn = sched_data.insns[i];
- fprintf (dump, " %s", type_names[sched_data.types[i]]);
- if (insn)
- fprintf (dump, "/%s", type_names[ia64_safe_type (insn)]);
- if (sched_data.stopbit[i])
- fprintf (dump, " ;;");
- }
- fprintf (dump, " :::");
- for (i = sched_data.first_slot; i < sched_data.cur; i++)
+ struct bundle_state *curr_state, *next_state;
+
+ for (curr_state = allocated_bundle_states_chain;
+ curr_state != NULL;
+ curr_state = next_state)
{
- rtx insn = sched_data.insns[i];
- enum attr_type t = ia64_safe_type (insn);
- fprintf (dump, " (%d) %s", INSN_UID (insn), type_names[t]);
+ next_state = curr_state->allocated_states_chain;
+ free (curr_state->dfa_state);
+ free (curr_state);
}
- fprintf (dump, "\n");
}
-/* Schedule a stop bit. DUMP is the current scheduling dump file, or
- NULL. */
+/* Hash table of the bundle states. The key is dfa_state and insn_num
+ of the bundle states. */
-static void
-schedule_stop (dump)
- FILE *dump;
-{
- const struct ia64_packet *best = sched_data.packet;
- int i;
- int best_stop = 6;
+static htab_t bundle_state_table;
- if (dump)
- fprintf (dump, "// Stop bit, cur = %d.\n", sched_data.cur);
+/* The function returns hash of BUNDLE_STATE. */
- if (sched_data.cur == 0)
- {
- if (dump)
- fprintf (dump, "// At start of bundle, so nothing to do.\n");
-
- rotate_two_bundles (NULL);
- return;
- }
+static unsigned
+bundle_state_hash (bundle_state)
+ const void *bundle_state;
+{
+ const struct bundle_state *state = (struct bundle_state *) bundle_state;
+ unsigned result, i;
- for (i = -1; i < NR_PACKETS; i++)
- {
- /* This is a slight hack to give the current packet the first chance.
- This is done to avoid e.g. switching from MIB to MBB bundles. */
- const struct ia64_packet *p = (i >= 0 ? packets + i : sched_data.packet);
- int split = get_split (p, sched_data.first_slot);
- const struct bundle *compare;
- int next, stoppos;
+ for (result = i = 0; i < dfa_state_size; i++)
+ result += (((unsigned char *) state->dfa_state) [i]
+ << ((i % CHAR_BIT) * 3 + CHAR_BIT));
+ return result + state->insn_num;
+}
- if (! packet_matches_p (p, split, &next))
- continue;
+/* The function returns nonzero if the bundle state keys are equal. */
- compare = next > 3 ? p->t2 : p->t1;
+static int
+bundle_state_eq_p (bundle_state_1, bundle_state_2)
+ const void *bundle_state_1;
+ const void *bundle_state_2;
+{
+ const struct bundle_state * state1 = (struct bundle_state *) bundle_state_1;
+ const struct bundle_state * state2 = (struct bundle_state *) bundle_state_2;
- stoppos = 3;
- if (compare->possible_stop)
- stoppos = compare->possible_stop;
- if (next > 3)
- stoppos += 3;
+ return (state1->insn_num == state2->insn_num
+ && memcmp (state1->dfa_state, state2->dfa_state,
+ dfa_state_size) == 0);
+}
- if (stoppos < next || stoppos >= best_stop)
- {
- if (compare->possible_stop == 0)
- continue;
- stoppos = (next > 3 ? 6 : 3);
- }
- if (stoppos < next || stoppos >= best_stop)
- continue;
+/* The function inserts the BUNDLE_STATE into the hash table. The
+ function returns nonzero if the bundle has been inserted into the
+ table. The table contains the best bundle state with given key. */
- if (dump)
- fprintf (dump, "// switching from %s %s to %s %s (stop at %d)\n",
- best->t1->name, best->t2->name, p->t1->name, p->t2->name,
- stoppos);
+static int
+insert_bundle_state (bundle_state)
+ struct bundle_state *bundle_state;
+{
+ void **entry_ptr;
- best_stop = stoppos;
- best = p;
+ entry_ptr = htab_find_slot (bundle_state_table, bundle_state, 1);
+ if (*entry_ptr == NULL)
+ {
+ bundle_state->next = index_to_bundle_states [bundle_state->insn_num];
+ index_to_bundle_states [bundle_state->insn_num] = bundle_state;
+ *entry_ptr = (void *) bundle_state;
+ return TRUE;
}
-
- sched_data.packet = best;
- cycle_end_fill_slots (dump);
- while (sched_data.cur < best_stop)
+ else if (bundle_state->cost < ((struct bundle_state *) *entry_ptr)->cost
+ || (bundle_state->cost == ((struct bundle_state *) *entry_ptr)->cost
+ && (((struct bundle_state *)*entry_ptr)->accumulated_insns_num
+ > bundle_state->accumulated_insns_num
+ || (((struct bundle_state *)
+ *entry_ptr)->accumulated_insns_num
+ == bundle_state->accumulated_insns_num
+ && ((struct bundle_state *)
+ *entry_ptr)->branch_deviation
+ > bundle_state->branch_deviation))))
+
{
- sched_data.types[sched_data.cur] = best->t[sched_data.cur];
- sched_data.insns[sched_data.cur] = 0;
- sched_data.stopbit[sched_data.cur] = 0;
- sched_data.cur++;
+ struct bundle_state temp;
+
+ temp = *(struct bundle_state *) *entry_ptr;
+ *(struct bundle_state *) *entry_ptr = *bundle_state;
+ ((struct bundle_state *) *entry_ptr)->next = temp.next;
+ *bundle_state = temp;
}
- sched_data.stopbit[sched_data.cur - 1] = 1;
- sched_data.first_slot = best_stop;
+ return FALSE;
+}
+
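The replacement test in insert_bundle_state above is a three-level lexicographic comparison. Written out as a standalone helper purely for clarity (the patch itself keeps the comparison inline; this function does not exist in the sources):

/* Return nonzero if state A is better than state B for the same
   (dfa_state, insn_num) key: lower cost first, then fewer issued
   insns (nops included), then smaller branch deviation.  */
static int
bundle_state_better_p (a, b)
     const struct bundle_state *a, *b;
{
  if (a->cost != b->cost)
    return a->cost < b->cost;
  if (a->accumulated_insns_num != b->accumulated_insns_num)
    return a->accumulated_insns_num < b->accumulated_insns_num;
  return a->branch_deviation < b->branch_deviation;
}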
+/* Start work with the hash table. */
- if (dump)
- dump_current_packet (dump);
+static void
+initiate_bundle_state_table ()
+{
+ bundle_state_table = htab_create (50, bundle_state_hash, bundle_state_eq_p,
+ (htab_del) 0);
}
-/* If necessary, perform one or two rotations on the scheduling state.
- This should only be called if we are starting a new cycle. */
+/* Finish work with the hash table. */
static void
-maybe_rotate (dump)
- FILE *dump;
+finish_bundle_state_table ()
{
- cycle_end_fill_slots (dump);
- if (sched_data.cur == 6)
- rotate_two_bundles (dump);
- else if (sched_data.cur >= 3)
- rotate_one_bundle (dump);
- sched_data.first_slot = sched_data.cur;
+ htab_delete (bundle_state_table);
}
-/* The clock cycle when ia64_sched_reorder was last called. */
-static int prev_cycle;
+
-/* The first insn scheduled in the previous cycle. This is the saved
- value of sched_data.first_slot. */
-static int prev_first;
+/* The following variable is an insn `nop' used to check bundle states
+ with different numbers of inserted nops. */
-/* Emit NOPs to fill the delay between PREV_CYCLE and CLOCK_VAR. Used to
- pad out the delay between MM (shifts, etc.) and integer operations. */
+static rtx ia64_nop;
-static void
-nop_cycles_until (clock_var, dump)
- int clock_var;
- FILE *dump;
+/* The following function tries to issue NOPS_NUM nops for the current
+ state without advancing the processor cycle. If it fails, the
+ function returns FALSE and frees the current state. */
+
+static int
+try_issue_nops (curr_state, nops_num)
+ struct bundle_state *curr_state;
+ int nops_num;
{
- int prev_clock = prev_cycle;
- int cycles_left = clock_var - prev_clock;
- bool did_stop = false;
+ int i;
- /* Finish the previous cycle; pad it out with NOPs. */
- if (sched_data.cur == 3)
- {
- sched_emit_insn (gen_insn_group_barrier (GEN_INT (3)));
- did_stop = true;
- maybe_rotate (dump);
- }
- else if (sched_data.cur > 0)
- {
- int need_stop = 0;
- int split = itanium_split_issue (sched_data.packet, prev_first);
+ for (i = 0; i < nops_num; i++)
+ if (state_transition (curr_state->dfa_state, ia64_nop) >= 0)
+ {
+ free_bundle_state (curr_state);
+ return FALSE;
+ }
+ return TRUE;
+}
- if (sched_data.cur < 3 && split > 3)
- {
- split = 3;
- need_stop = 1;
- }
+/* The following function tries to issue INSN for the current
+ state without advancing the processor cycle. If it fails, the
+ function returns FALSE and frees the current state. */
- if (split > sched_data.cur)
- {
- int i;
- for (i = sched_data.cur; i < split; i++)
- {
- rtx t = sched_emit_insn (gen_nop_type (sched_data.packet->t[i]));
- sched_data.types[i] = sched_data.packet->t[i];
- sched_data.insns[i] = t;
- sched_data.stopbit[i] = 0;
- }
- sched_data.cur = split;
- }
+static int
+try_issue_insn (curr_state, insn)
+ struct bundle_state *curr_state;
+ rtx insn;
+{
+ if (insn && state_transition (curr_state->dfa_state, insn) >= 0)
+ {
+ free_bundle_state (curr_state);
+ return FALSE;
+ }
+ return TRUE;
+}
- if (! need_stop && sched_data.cur > 0 && sched_data.cur < 6
- && cycles_left > 1)
- {
- int i;
- for (i = sched_data.cur; i < 6; i++)
- {
- rtx t = sched_emit_insn (gen_nop_type (sched_data.packet->t[i]));
- sched_data.types[i] = sched_data.packet->t[i];
- sched_data.insns[i] = t;
- sched_data.stopbit[i] = 0;
- }
- sched_data.cur = 6;
- cycles_left--;
- need_stop = 1;
- }
+/* The following function tries to issue BEFORE_NOPS_NUM nops and INSN
+ starting from ORIGINATOR without advancing the processor cycle. If
+ TRY_BUNDLE_END_P is TRUE, the function also tries to issue nops to
+ fill the whole bundle. If it is successful, the function creates a new
+ bundle state and inserts it into the hash table and into
+ `index_to_bundle_states'. */
- if (need_stop || sched_data.cur == 6)
+static void
+issue_nops_and_insn (originator, before_nops_num, insn, try_bundle_end_p)
+ struct bundle_state *originator;
+ int before_nops_num;
+ rtx insn;
+ int try_bundle_end_p;
+{
+ struct bundle_state *curr_state;
+
+ curr_state = get_free_bundle_state ();
+ memcpy (curr_state->dfa_state, originator->dfa_state, dfa_state_size);
+ curr_state->insn = insn;
+ curr_state->insn_num = originator->insn_num + 1;
+ curr_state->cost = originator->cost;
+ curr_state->originator = originator;
+ curr_state->before_nops_num = before_nops_num;
+ curr_state->after_nops_num = 0;
+ curr_state->accumulated_insns_num
+ = originator->accumulated_insns_num + before_nops_num;
+ curr_state->branch_deviation = originator->branch_deviation;
+ if (insn == NULL_RTX)
+ abort ();
+ else if (INSN_CODE (insn) == CODE_FOR_insn_group_barrier)
+ {
+ if (GET_MODE (insn) == TImode)
+ abort ();
+ if (!try_issue_nops (curr_state, before_nops_num))
+ return;
+ if (!try_issue_insn (curr_state, insn))
+ return;
+ memcpy (temp_dfa_state, curr_state->dfa_state, dfa_state_size);
+ if (state_transition (temp_dfa_state, dfa_pre_cycle_insn) >= 0
+ && curr_state->accumulated_insns_num % 3 != 0)
{
- sched_emit_insn (gen_insn_group_barrier (GEN_INT (3)));
- did_stop = true;
+ free_bundle_state (curr_state);
+ return;
}
- maybe_rotate (dump);
}
-
- cycles_left--;
- while (cycles_left > 0)
+ else if (GET_MODE (insn) != TImode)
{
- sched_emit_insn (gen_bundle_selector (GEN_INT (0)));
- sched_emit_insn (gen_nop_type (TYPE_M));
- sched_emit_insn (gen_nop_type (TYPE_I));
- if (cycles_left > 1)
+ if (!try_issue_nops (curr_state, before_nops_num))
+ return;
+ if (!try_issue_insn (curr_state, insn))
+ return;
+ if (GET_CODE (PATTERN (insn)) != ASM_INPUT
+ && asm_noperands (PATTERN (insn)) < 0)
+ curr_state->accumulated_insns_num++;
+ if (ia64_safe_type (insn) == TYPE_L)
+ curr_state->accumulated_insns_num++;
+ }
+ else
+ {
+ state_transition (curr_state->dfa_state, dfa_pre_cycle_insn);
+ state_transition (curr_state->dfa_state, NULL);
+ curr_state->cost++;
+ if (!try_issue_nops (curr_state, before_nops_num))
+ return;
+ if (!try_issue_insn (curr_state, insn))
+ return;
+ if (GET_CODE (PATTERN (insn)) != ASM_INPUT
+ && asm_noperands (PATTERN (insn)) < 0)
+ curr_state->accumulated_insns_num++;
+ if (ia64_safe_type (insn) == TYPE_L)
+ curr_state->accumulated_insns_num++;
+ }
+ if (ia64_safe_type (insn) == TYPE_B)
+ curr_state->branch_deviation
+ += 2 - (curr_state->accumulated_insns_num - 1) % 3;
+ if (try_bundle_end_p && curr_state->accumulated_insns_num % 3 != 0)
+ {
+ if (insert_bundle_state (curr_state))
{
- sched_emit_insn (gen_insn_group_barrier (GEN_INT (2)));
- cycles_left--;
+ state_t dfa_state;
+ struct bundle_state *curr_state1;
+ struct bundle_state *allocated_states_chain;
+
+ curr_state1 = get_free_bundle_state ();
+ dfa_state = curr_state1->dfa_state;
+ allocated_states_chain = curr_state1->allocated_states_chain;
+ *curr_state1 = *curr_state;
+ curr_state1->dfa_state = dfa_state;
+ curr_state1->allocated_states_chain = allocated_states_chain;
+ memcpy (curr_state1->dfa_state, curr_state->dfa_state,
+ dfa_state_size);
+ curr_state = curr_state1;
}
- sched_emit_insn (gen_nop_type (TYPE_I));
- sched_emit_insn (gen_insn_group_barrier (GEN_INT (3)));
- did_stop = true;
- cycles_left--;
+ if (!try_issue_nops (curr_state,
+ 3 - curr_state->accumulated_insns_num % 3))
+ return;
+ curr_state->after_nops_num
+ = 3 - curr_state->accumulated_insns_num % 3;
+ curr_state->accumulated_insns_num
+ += 3 - curr_state->accumulated_insns_num % 3;
}
+ if (!insert_bundle_state (curr_state))
+ free_bundle_state (curr_state);
+ return;
+}
+
+/* The following function returns the position in the two-bundle issue
+ window for the given STATE. */
- if (did_stop)
- init_insn_group_barriers ();
+static int
+get_max_pos (state)
+ state_t state;
+{
+ if (cpu_unit_reservation_p (state, pos_6))
+ return 6;
+ else if (cpu_unit_reservation_p (state, pos_5))
+ return 5;
+ else if (cpu_unit_reservation_p (state, pos_4))
+ return 4;
+ else if (cpu_unit_reservation_p (state, pos_3))
+ return 3;
+ else if (cpu_unit_reservation_p (state, pos_2))
+ return 2;
+ else if (cpu_unit_reservation_p (state, pos_1))
+ return 1;
+ else
+ return 0;
}
-/* We are about to being issuing insns for this clock cycle.
- Override the default sort algorithm to better slot instructions. */
+/* The function returns the code of a possible template for the given
+ position and state. The function should be called only with position
+ values of 3 or 6. */
static int
-ia64_internal_sched_reorder (dump, sched_verbose, ready, pn_ready,
- reorder_type, clock_var)
- FILE *dump ATTRIBUTE_UNUSED;
- int sched_verbose ATTRIBUTE_UNUSED;
- rtx *ready;
- int *pn_ready;
- int reorder_type, clock_var;
+get_template (state, pos)
+ state_t state;
+ int pos;
{
- int n_asms;
- int n_ready = *pn_ready;
- rtx *e_ready = ready + n_ready;
- rtx *insnp;
-
- if (sched_verbose)
+ switch (pos)
{
- fprintf (dump, "// ia64_sched_reorder (type %d):\n", reorder_type);
- dump_current_packet (dump);
+ case 3:
+ if (cpu_unit_reservation_p (state, _0mii_))
+ return 0;
+ else if (cpu_unit_reservation_p (state, _0mmi_))
+ return 1;
+ else if (cpu_unit_reservation_p (state, _0mfi_))
+ return 2;
+ else if (cpu_unit_reservation_p (state, _0mmf_))
+ return 3;
+ else if (cpu_unit_reservation_p (state, _0bbb_))
+ return 4;
+ else if (cpu_unit_reservation_p (state, _0mbb_))
+ return 5;
+ else if (cpu_unit_reservation_p (state, _0mib_))
+ return 6;
+ else if (cpu_unit_reservation_p (state, _0mmb_))
+ return 7;
+ else if (cpu_unit_reservation_p (state, _0mfb_))
+ return 8;
+ else if (cpu_unit_reservation_p (state, _0mlx_))
+ return 9;
+ else
+ abort ();
+ case 6:
+ if (cpu_unit_reservation_p (state, _1mii_))
+ return 0;
+ else if (cpu_unit_reservation_p (state, _1mmi_))
+ return 1;
+ else if (cpu_unit_reservation_p (state, _1mfi_))
+ return 2;
+ else if (_1mmf_ >= 0 && cpu_unit_reservation_p (state, _1mmf_))
+ return 3;
+ else if (cpu_unit_reservation_p (state, _1bbb_))
+ return 4;
+ else if (cpu_unit_reservation_p (state, _1mbb_))
+ return 5;
+ else if (cpu_unit_reservation_p (state, _1mib_))
+ return 6;
+ else if (cpu_unit_reservation_p (state, _1mmb_))
+ return 7;
+ else if (cpu_unit_reservation_p (state, _1mfb_))
+ return 8;
+ else if (cpu_unit_reservation_p (state, _1mlx_))
+ return 9;
+ else
+ abort ();
+ default:
+ abort ();
}
+}
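For orientation, the template codes 0-9 returned above appear to correspond to the bundle names used by the selectors emitted later in this patch (code 9 is rewritten to 2, i.e. MLX to MFI, and code 0 is used for the filler .MII bundles). A hedged sketch of that presumed mapping; the identifier below is illustrative only, the real table is defined elsewhere in ia64.c:

/* Assumed mapping from template codes to bundle names (sketch only,
   not part of this hunk).  */
static const char *const bundle_template_names[] =
{
  ".mii", ".mmi", ".mfi", ".mmf", ".bbb",
  ".mbb", ".mib", ".mmb", ".mfb", ".mlx"
};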
- /* Work around the pipeline flush that will occurr if the results of
- an MM instruction are accessed before the result is ready. Intel
- documentation says this only happens with IALU, ISHF, ILOG, LD,
- and ST consumers, but experimental evidence shows that *any* non-MM
- type instruction will incurr the flush. */
- if (reorder_type == 0 && clock_var > 0 && ia64_final_schedule)
- {
- for (insnp = ready; insnp < e_ready; insnp++)
- {
- rtx insn = *insnp, link;
- enum attr_itanium_class t = ia64_safe_itanium_class (insn);
-
- if (t == ITANIUM_CLASS_MMMUL
- || t == ITANIUM_CLASS_MMSHF
- || t == ITANIUM_CLASS_MMSHFI)
- continue;
+/* The following function returns the first insn important for insn
+ bundling, searching from INSN up to (but not including) TAIL. */
- for (link = LOG_LINKS (insn); link; link = XEXP (link, 1))
- if (REG_NOTE_KIND (link) == 0)
- {
- rtx other = XEXP (link, 0);
- enum attr_itanium_class t0 = ia64_safe_itanium_class (other);
- if (t0 == ITANIUM_CLASS_MMSHF || t0 == ITANIUM_CLASS_MMMUL)
- {
- nop_cycles_until (clock_var, sched_verbose ? dump : NULL);
- goto out;
- }
- }
- }
- }
- out:
+static rtx
+get_next_important_insn (insn, tail)
+ rtx insn, tail;
+{
+ for (; insn && insn != tail; insn = NEXT_INSN (insn))
+ if (INSN_P (insn)
+ && ia64_safe_itanium_class (insn) != ITANIUM_CLASS_IGNORE
+ && GET_CODE (PATTERN (insn)) != USE
+ && GET_CODE (PATTERN (insn)) != CLOBBER)
+ return insn;
+ return NULL_RTX;
+}
- prev_first = sched_data.first_slot;
- prev_cycle = clock_var;
+/* The following function does insn bundling. The bundling algorithm is
+ based on dynamic programming. It tries inserting different numbers of
+ nop insns before/after the real insns. At the end of the EBB, it
+ chooses the best alternative and then, moving backwards through the
+ EBB, inserts templates for that alternative. The algorithm is driven
+ by information (changes of the simulated processor cycle) produced by
+ the 2nd insn scheduling pass. */
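A greatly simplified sketch of that dynamic programming, under the assumption that the only relevant state is the number of issue slots filled modulo 3 and the cost is the number of filler nops; the real pass additionally tracks DFA states, templates, stop bits and branch deviation, and the name below is illustrative only:

#include <limits.h>
#include <string.h>

/* Minimal number of filler nops needed to bundle N_INSNS insns when
   every bundle holds exactly three slots and each insn may be preceded
   by 0, 1 or 2 nops.  */
static int
min_filler_nops (int n_insns)
{
  int cost[3] = { 0, INT_MAX, INT_MAX };  /* cost indexed by slots % 3 */
  int next[3];
  int i, rem, nops, best;

  for (i = 0; i < n_insns; i++)
    {
      next[0] = next[1] = next[2] = INT_MAX;
      for (rem = 0; rem < 3; rem++)
        if (cost[rem] != INT_MAX)
          for (nops = 0; nops <= 2; nops++)
            {
              int new_rem = (rem + nops + 1) % 3;
              if (cost[rem] + nops < next[new_rem])
                next[new_rem] = cost[rem] + nops;
            }
      memcpy (cost, next, sizeof cost);
    }

  /* The last bundle must be completed, possibly with trailing nops.  */
  best = cost[0];
  if (cost[2] != INT_MAX && cost[2] + 1 < best)
    best = cost[2] + 1;
  if (cost[1] != INT_MAX && cost[1] + 2 < best)
    best = cost[1] + 2;
  return best;
}

With this toy model the answer degenerates to padding the insn count up to a multiple of 3; the value of the real pass lies in the DFA and template constraints that make some of these transitions illegal or more costly.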
- if (reorder_type == 0)
- maybe_rotate (sched_verbose ? dump : NULL);
+static void
+bundling (dump, verbose, prev_head_insn, tail)
+ FILE *dump;
+ int verbose;
+ rtx prev_head_insn, tail;
+{
+ struct bundle_state *curr_state, *next_state, *best_state;
+ rtx insn, next_insn;
+ int insn_num;
+ int i, bundle_end_p;
+ int pos, max_pos, template0, template1;
+ rtx b;
+ rtx nop;
+ enum attr_type type;
- /* First, move all USEs, CLOBBERs and other crud out of the way. */
- n_asms = 0;
- for (insnp = ready; insnp < e_ready; insnp++)
- if (insnp < e_ready)
+ insn_num = 0;
+ for (insn = NEXT_INSN (prev_head_insn);
+ insn && insn != tail;
+ insn = NEXT_INSN (insn))
+ if (INSN_P (insn))
+ insn_num++;
+ if (insn_num == 0)
+ return;
+ bundling_p = 1;
+ dfa_clean_insn_cache ();
+ initiate_bundle_state_table ();
+ index_to_bundle_states = xmalloc ((insn_num + 2)
+ * sizeof (struct bundle_state *));
+ /* First (forward) pass -- generates states. */
+ curr_state = get_free_bundle_state ();
+ curr_state->insn = NULL;
+ curr_state->before_nops_num = 0;
+ curr_state->after_nops_num = 0;
+ curr_state->insn_num = 0;
+ curr_state->cost = 0;
+ curr_state->accumulated_insns_num = 0;
+ curr_state->branch_deviation = 0;
+ curr_state->next = NULL;
+ curr_state->originator = NULL;
+ state_reset (curr_state->dfa_state);
+ index_to_bundle_states [0] = curr_state;
+ insn_num = 0;
+ for (insn = NEXT_INSN (prev_head_insn);
+ insn != tail;
+ insn = NEXT_INSN (insn))
+ if (INSN_P (insn)
+ && (ia64_safe_itanium_class (insn) == ITANIUM_CLASS_IGNORE
+ || GET_CODE (PATTERN (insn)) == USE
+ || GET_CODE (PATTERN (insn)) == CLOBBER)
+ && GET_MODE (insn) == TImode)
{
- rtx insn = *insnp;
- enum attr_type t = ia64_safe_type (insn);
- if (t == TYPE_UNKNOWN)
- {
- if (GET_CODE (PATTERN (insn)) == ASM_INPUT
- || asm_noperands (PATTERN (insn)) >= 0)
- {
- rtx lowest = ready[n_asms];
- ready[n_asms] = insn;
- *insnp = lowest;
- n_asms++;
- }
- else
- {
- rtx highest = ready[n_ready - 1];
- ready[n_ready - 1] = insn;
- *insnp = highest;
- if (ia64_final_schedule && group_barrier_needed_p (insn))
- {
- schedule_stop (sched_verbose ? dump : NULL);
- sched_data.last_was_stop = 1;
- maybe_rotate (sched_verbose ? dump : NULL);
- }
-
- return 1;
- }
- }
+ PUT_MODE (insn, VOIDmode);
+ for (next_insn = NEXT_INSN (insn);
+ next_insn != tail;
+ next_insn = NEXT_INSN (next_insn))
+ if (INSN_P (next_insn)
+ && ia64_safe_itanium_class (next_insn) != ITANIUM_CLASS_IGNORE
+ && GET_CODE (PATTERN (next_insn)) != USE
+ && GET_CODE (PATTERN (next_insn)) != CLOBBER)
+ {
+ PUT_MODE (next_insn, TImode);
+ break;
+ }
}
- if (n_asms < n_ready)
+ for (insn = get_next_important_insn (NEXT_INSN (prev_head_insn), tail);
+ insn != NULL_RTX;
+ insn = next_insn)
{
- /* Some normal insns to process. Skip the asms. */
- ready += n_asms;
- n_ready -= n_asms;
- }
- else if (n_ready > 0)
- {
- /* Only asm insns left. */
- if (ia64_final_schedule && group_barrier_needed_p (ready[n_ready - 1]))
+ if (!INSN_P (insn)
+ || ia64_safe_itanium_class (insn) == ITANIUM_CLASS_IGNORE
+ || GET_CODE (PATTERN (insn)) == USE
+ || GET_CODE (PATTERN (insn)) == CLOBBER)
+ abort ();
+ next_insn = get_next_important_insn (NEXT_INSN (insn), tail);
+ insn_num++;
+ index_to_bundle_states [insn_num] = NULL;
+ for (curr_state = index_to_bundle_states [insn_num - 1];
+ curr_state != NULL;
+ curr_state = next_state)
{
- schedule_stop (sched_verbose ? dump : NULL);
- sched_data.last_was_stop = 1;
- maybe_rotate (sched_verbose ? dump : NULL);
+ pos = curr_state->accumulated_insns_num % 3;
+ type = ia64_safe_type (insn);
+ next_state = curr_state->next;
+ bundle_end_p
+ = (next_insn == NULL_RTX
+ || (GET_MODE (next_insn) == TImode
+ && INSN_CODE (insn) != CODE_FOR_insn_group_barrier));
+ if (type == TYPE_F || type == TYPE_B || type == TYPE_L
+ || type == TYPE_S
+ /* We need to insert 2 Nops for cases like M_MII. */
+ || (type == TYPE_M && ia64_tune == PROCESSOR_ITANIUM
+ && !bundle_end_p && pos == 1))
+ issue_nops_and_insn (curr_state, 2, insn, bundle_end_p);
+ issue_nops_and_insn (curr_state, 1, insn, bundle_end_p);
+ issue_nops_and_insn (curr_state, 0, insn, bundle_end_p);
}
- cycle_end_fill_slots (sched_verbose ? dump : NULL);
- return 1;
+ if (index_to_bundle_states [insn_num] == NULL)
+ abort ();
+ for (curr_state = index_to_bundle_states [insn_num];
+ curr_state != NULL;
+ curr_state = curr_state->next)
+ if (verbose >= 2 && dump)
+ {
+ struct DFA_chip
+ {
+ unsigned short one_automaton_state;
+ unsigned short oneb_automaton_state;
+ unsigned short two_automaton_state;
+ unsigned short twob_automaton_state;
+ };
+
+ fprintf
+ (dump,
+ "// Bundle state %d (orig %d, cost %d, nops %d/%d, insns %d, branch %d, state %d) for %d\n",
+ curr_state->unique_num,
+ (curr_state->originator == NULL
+ ? -1 : curr_state->originator->unique_num),
+ curr_state->cost,
+ curr_state->before_nops_num, curr_state->after_nops_num,
+ curr_state->accumulated_insns_num, curr_state->branch_deviation,
+ (ia64_tune == PROCESSOR_ITANIUM
+ ? ((struct DFA_chip *) curr_state->dfa_state)->oneb_automaton_state
+ : ((struct DFA_chip *) curr_state->dfa_state)->twob_automaton_state),
+ INSN_UID (insn));
+ }
}
-
- if (ia64_final_schedule)
- {
- int nr_need_stop = 0;
-
- for (insnp = ready; insnp < e_ready; insnp++)
- if (safe_group_barrier_needed_p (*insnp))
- nr_need_stop++;
-
- /* Schedule a stop bit if
- - all insns require a stop bit, or
- - we are starting a new cycle and _any_ insns require a stop bit.
- The reason for the latter is that if our schedule is accurate, then
- the additional stop won't decrease performance at this point (since
- there's a split issue at this point anyway), but it gives us more
- freedom when scheduling the currently ready insns. */
- if ((reorder_type == 0 && nr_need_stop)
- || (reorder_type == 1 && n_ready == nr_need_stop))
+ if (index_to_bundle_states [insn_num] == NULL)
+ abort ();
+ /* Finding state with a minimal cost: */
+ best_state = NULL;
+ for (curr_state = index_to_bundle_states [insn_num];
+ curr_state != NULL;
+ curr_state = curr_state->next)
+ if (curr_state->accumulated_insns_num % 3 == 0
+ && (best_state == NULL || best_state->cost > curr_state->cost
+ || (best_state->cost == curr_state->cost
+ && (curr_state->accumulated_insns_num
+ < best_state->accumulated_insns_num
+ || (curr_state->accumulated_insns_num
+ == best_state->accumulated_insns_num
+ && curr_state->branch_deviation
+ < best_state->branch_deviation)))))
+ best_state = curr_state;
+ /* Second (backward) pass: adding nops and templates: */
+ insn_num = best_state->before_nops_num;
+ template0 = template1 = -1;
+ for (curr_state = best_state;
+ curr_state->originator != NULL;
+ curr_state = curr_state->originator)
+ {
+ insn = curr_state->insn;
+ insn_num++;
+ if (verbose >= 2 && dump)
{
- schedule_stop (sched_verbose ? dump : NULL);
- sched_data.last_was_stop = 1;
- maybe_rotate (sched_verbose ? dump : NULL);
- if (reorder_type == 1)
- return 0;
+ struct DFA_chip
+ {
+ unsigned short one_automaton_state;
+ unsigned short oneb_automaton_state;
+ unsigned short two_automaton_state;
+ unsigned short twob_automaton_state;
+ };
+
+ fprintf
+ (dump,
+ "// Best %d (orig %d, cost %d, nops %d/%d, insns %d, branch %d, state %d) for %d\n",
+ curr_state->unique_num,
+ (curr_state->originator == NULL
+ ? -1 : curr_state->originator->unique_num),
+ curr_state->cost,
+ curr_state->before_nops_num, curr_state->after_nops_num,
+ curr_state->accumulated_insns_num, curr_state->branch_deviation,
+ (ia64_tune == PROCESSOR_ITANIUM
+ ? ((struct DFA_chip *) curr_state->dfa_state)->oneb_automaton_state
+ : ((struct DFA_chip *) curr_state->dfa_state)->twob_automaton_state),
+ INSN_UID (insn));
}
- else
+ max_pos = get_max_pos (curr_state->dfa_state);
+ if (max_pos == 6 || (max_pos == 3 && template0 < 0))
{
- int deleted = 0;
- insnp = e_ready;
- /* Move down everything that needs a stop bit, preserving relative
- order. */
- while (insnp-- > ready + deleted)
- while (insnp >= ready + deleted)
- {
- rtx insn = *insnp;
- if (! safe_group_barrier_needed_p (insn))
- break;
- memmove (ready + 1, ready, (insnp - ready) * sizeof (rtx));
- *ready = insn;
- deleted++;
- }
- n_ready -= deleted;
- ready += deleted;
- if (deleted != nr_need_stop)
+ pos = max_pos;
+ if (max_pos == 3)
+ template0 = get_template (curr_state->dfa_state, 3);
+ else
+ {
+ template1 = get_template (curr_state->dfa_state, 3);
+ template0 = get_template (curr_state->dfa_state, 6);
+ }
+ }
+ if (max_pos > 3 && template1 < 0)
+ {
+ if (pos > 3)
abort ();
+ template1 = get_template (curr_state->dfa_state, 3);
+ pos += 3;
+ }
+ for (i = 0; i < curr_state->after_nops_num; i++)
+ {
+ nop = gen_nop ();
+ emit_insn_after (nop, insn);
+ pos--;
+ if (pos < 0)
+ abort ();
+ if (pos % 3 == 0)
+ {
+ if (template0 < 0)
+ abort ();
+ b = gen_bundle_selector (GEN_INT (template0));
+ ia64_emit_insn_before (b, nop);
+ template0 = template1;
+ template1 = -1;
+ }
+ }
+ if (INSN_CODE (insn) != CODE_FOR_insn_group_barrier
+ && GET_CODE (PATTERN (insn)) != ASM_INPUT
+ && asm_noperands (PATTERN (insn)) < 0)
+ pos--;
+ if (ia64_safe_type (insn) == TYPE_L)
+ pos--;
+ if (pos < 0)
+ abort ();
+ if (pos % 3 == 0
+ && INSN_CODE (insn) != CODE_FOR_insn_group_barrier
+ && GET_CODE (PATTERN (insn)) != ASM_INPUT
+ && asm_noperands (PATTERN (insn)) < 0)
+ {
+ if (template0 < 0)
+ abort ();
+ b = gen_bundle_selector (GEN_INT (template0));
+ ia64_emit_insn_before (b, insn);
+ b = PREV_INSN (insn);
+ insn = b;
+ template0 = template1;
+ template1 = -1;
+ }
+ for (i = 0; i < curr_state->before_nops_num; i++)
+ {
+ nop = gen_nop ();
+ ia64_emit_insn_before (nop, insn);
+ nop = PREV_INSN (insn);
+ insn = nop;
+ pos--;
+ if (pos < 0)
+ abort ();
+ if (pos % 3 == 0)
+ {
+ if (template0 < 0)
+ abort ();
+ b = gen_bundle_selector (GEN_INT (template0));
+ ia64_emit_insn_before (b, insn);
+ b = PREV_INSN (insn);
+ insn = b;
+ template0 = template1;
+ template1 = -1;
+ }
}
}
-
- return itanium_reorder (sched_verbose ? dump : NULL,
- ready, e_ready, reorder_type == 1);
+ if (ia64_tune == PROCESSOR_ITANIUM)
+ /* Insert additional cycles for MM-insns: */
+ for (insn = get_next_important_insn (NEXT_INSN (prev_head_insn), tail);
+ insn != NULL_RTX;
+ insn = next_insn)
+ {
+ if (!INSN_P (insn)
+ || ia64_safe_itanium_class (insn) == ITANIUM_CLASS_IGNORE
+ || GET_CODE (PATTERN (insn)) == USE
+ || GET_CODE (PATTERN (insn)) == CLOBBER)
+ abort ();
+ next_insn = get_next_important_insn (NEXT_INSN (insn), tail);
+ if (INSN_UID (insn) < clocks_length && add_cycles [INSN_UID (insn)])
+ {
+ rtx last;
+ int i, j, n;
+ int pred_stop_p;
+
+ last = prev_active_insn (insn);
+ pred_stop_p = recog_memoized (last) == CODE_FOR_insn_group_barrier;
+ if (pred_stop_p)
+ last = prev_active_insn (last);
+ n = 0;
+ for (;; last = prev_active_insn (last))
+ if (recog_memoized (last) == CODE_FOR_bundle_selector)
+ {
+ template0 = XINT (XVECEXP (PATTERN (last), 0, 0), 0);
+ if (template0 == 9)
+ PATTERN (last)
+ = gen_bundle_selector (GEN_INT (2)); /* -> MFI */
+ break;
+ }
+ else if (recog_memoized (last) != CODE_FOR_insn_group_barrier)
+ n++;
+ if ((pred_stop_p && n == 0) || n > 2
+ || (template0 == 9 && n != 0))
+ abort ();
+ for (j = 3 - n; j > 0; j --)
+ ia64_emit_insn_before (gen_nop (), insn);
+ add_cycles [INSN_UID (insn)]--;
+ if (!pred_stop_p || add_cycles [INSN_UID (insn)])
+ ia64_emit_insn_before (gen_insn_group_barrier (GEN_INT (3)),
+ insn);
+ if (pred_stop_p)
+ add_cycles [INSN_UID (insn)]--;
+ for (i = add_cycles [INSN_UID (insn)]; i > 0; i--)
+ {
+ /* Insert .MII bundle. */
+ ia64_emit_insn_before (gen_bundle_selector (GEN_INT (0)),
+ insn);
+ ia64_emit_insn_before (gen_nop (), insn);
+ ia64_emit_insn_before (gen_nop (), insn);
+ if (i > 1)
+ {
+ ia64_emit_insn_before
+ (gen_insn_group_barrier (GEN_INT (3)), insn);
+ i--;
+ }
+ ia64_emit_insn_before (gen_nop (), insn);
+ ia64_emit_insn_before (gen_insn_group_barrier (GEN_INT (3)),
+ insn);
+ }
+ ia64_emit_insn_before (gen_bundle_selector (GEN_INT (template0)),
+ insn);
+ for (j = n; j > 0; j --)
+ ia64_emit_insn_before (gen_nop (), insn);
+ if (pred_stop_p)
+ ia64_emit_insn_before (gen_insn_group_barrier (GEN_INT (3)),
+ insn);
+ }
+ }
+ free (index_to_bundle_states);
+ finish_bundle_state_table ();
+ bundling_p = 0;
+ dfa_clean_insn_cache ();
}
-static int
-ia64_sched_reorder (dump, sched_verbose, ready, pn_ready, clock_var)
+/* The following function is called at the end of scheduling a BB or
+ an EBB. After reload, it inserts stop bits and does insn bundling. */
+
+static void
+ia64_sched_finish (dump, sched_verbose)
FILE *dump;
int sched_verbose;
- rtx *ready;
- int *pn_ready;
- int clock_var;
{
- return ia64_internal_sched_reorder (dump, sched_verbose, ready,
- pn_ready, 0, clock_var);
+ if (sched_verbose)
+ fprintf (dump, "// Finishing schedule.\n");
+ if (!reload_completed)
+ return;
+ if (reload_completed)
+ {
+ final_emit_insn_group_barriers (dump);
+ bundling (dump, sched_verbose, current_sched_info->prev_head,
+ current_sched_info->next_tail);
+ if (sched_verbose && dump)
+ fprintf (dump, "// finishing %d-%d\n",
+ INSN_UID (NEXT_INSN (current_sched_info->prev_head)),
+ INSN_UID (PREV_INSN (current_sched_info->next_tail)));
+
+ return;
+ }
}
-/* Like ia64_sched_reorder, but called after issuing each insn.
- Override the default sort algorithm to better slot instructions. */
+/* The following function inserts stop bits in a scheduled BB or EBB. */
-static int
-ia64_sched_reorder2 (dump, sched_verbose, ready, pn_ready, clock_var)
+static void
+final_emit_insn_group_barriers (dump)
FILE *dump ATTRIBUTE_UNUSED;
- int sched_verbose ATTRIBUTE_UNUSED;
- rtx *ready;
- int *pn_ready;
- int clock_var;
{
- if (sched_data.last_was_stop)
- return 0;
+ rtx insn;
+ int need_barrier_p = 0;
+ rtx prev_insn = NULL_RTX;
- /* Detect one special case and try to optimize it.
- If we have 1.M;;MI 2.MIx, and slots 2.1 (M) and 2.2 (I) are both NOPs,
- then we can get better code by transforming this to 1.MFB;; 2.MIx. */
- if (sched_data.first_slot == 1
- && sched_data.stopbit[0]
- && ((sched_data.cur == 4
- && (sched_data.types[1] == TYPE_M || sched_data.types[1] == TYPE_A)
- && (sched_data.types[2] == TYPE_I || sched_data.types[2] == TYPE_A)
- && (sched_data.types[3] != TYPE_M && sched_data.types[3] != TYPE_A))
- || (sched_data.cur == 3
- && (sched_data.types[1] == TYPE_M
- || sched_data.types[1] == TYPE_A)
- && (sched_data.types[2] != TYPE_M
- && sched_data.types[2] != TYPE_I
- && sched_data.types[2] != TYPE_A))))
-
- {
- int i, best;
- rtx stop = sched_data.insns[1];
+ init_insn_group_barriers ();
- /* Search backward for the stop bit that must be there. */
- while (1)
+ for (insn = NEXT_INSN (current_sched_info->prev_head);
+ insn != current_sched_info->next_tail;
+ insn = NEXT_INSN (insn))
+ {
+ if (GET_CODE (insn) == BARRIER)
{
- int insn_code;
-
- stop = PREV_INSN (stop);
- if (GET_CODE (stop) != INSN)
- abort ();
- insn_code = recog_memoized (stop);
-
- /* Ignore .pred.rel.mutex.
+ rtx last = prev_active_insn (insn);
- ??? Update this to ignore cycle display notes too
- ??? once those are implemented */
- if (insn_code == CODE_FOR_pred_rel_mutex
- || insn_code == CODE_FOR_prologue_use)
+ if (! last)
continue;
+ if (GET_CODE (last) == JUMP_INSN
+ && GET_CODE (PATTERN (last)) == ADDR_DIFF_VEC)
+ last = prev_active_insn (last);
+ if (recog_memoized (last) != CODE_FOR_insn_group_barrier)
+ emit_insn_after (gen_insn_group_barrier (GEN_INT (3)), last);
- if (insn_code == CODE_FOR_insn_group_barrier)
- break;
- abort ();
+ init_insn_group_barriers ();
+ need_barrier_p = 0;
+ prev_insn = NULL_RTX;
}
-
- /* Adjust the stop bit's slot selector. */
- if (INTVAL (XVECEXP (PATTERN (stop), 0, 0)) != 1)
- abort ();
- XVECEXP (PATTERN (stop), 0, 0) = GEN_INT (3);
-
- sched_data.stopbit[0] = 0;
- sched_data.stopbit[2] = 1;
-
- sched_data.types[5] = sched_data.types[3];
- sched_data.types[4] = sched_data.types[2];
- sched_data.types[3] = sched_data.types[1];
- sched_data.insns[5] = sched_data.insns[3];
- sched_data.insns[4] = sched_data.insns[2];
- sched_data.insns[3] = sched_data.insns[1];
- sched_data.stopbit[5] = sched_data.stopbit[4] = sched_data.stopbit[3] = 0;
- sched_data.cur += 2;
- sched_data.first_slot = 3;
- for (i = 0; i < NR_PACKETS; i++)
+ else if (INSN_P (insn))
{
- const struct ia64_packet *p = packets + i;
- if (p->t[0] == TYPE_M && p->t[1] == TYPE_F && p->t[2] == TYPE_B)
+ if (recog_memoized (insn) == CODE_FOR_insn_group_barrier)
{
- sched_data.packet = p;
- break;
+ init_insn_group_barriers ();
+ need_barrier_p = 0;
+ prev_insn = NULL_RTX;
}
- }
- rotate_one_bundle (sched_verbose ? dump : NULL);
-
- best = 6;
- for (i = 0; i < NR_PACKETS; i++)
- {
- const struct ia64_packet *p = packets + i;
- int split = get_split (p, sched_data.first_slot);
- int next;
-
- /* Disallow multiway branches here. */
- if (p->t[1] == TYPE_B)
- continue;
-
- if (packet_matches_p (p, split, &next) && next < best)
+ else if (need_barrier_p || group_barrier_needed_p (insn))
{
- best = next;
- sched_data.packet = p;
- sched_data.split = split;
+ if (TARGET_EARLY_STOP_BITS)
+ {
+ rtx last;
+
+ for (last = insn;
+ last != current_sched_info->prev_head;
+ last = PREV_INSN (last))
+ if (INSN_P (last) && GET_MODE (last) == TImode
+ && stops_p [INSN_UID (last)])
+ break;
+ if (last == current_sched_info->prev_head)
+ last = insn;
+ last = prev_active_insn (last);
+ if (last
+ && recog_memoized (last) != CODE_FOR_insn_group_barrier)
+ emit_insn_after (gen_insn_group_barrier (GEN_INT (3)),
+ last);
+ init_insn_group_barriers ();
+ for (last = NEXT_INSN (last);
+ last != insn;
+ last = NEXT_INSN (last))
+ if (INSN_P (last))
+ group_barrier_needed_p (last);
+ }
+ else
+ {
+ emit_insn_before (gen_insn_group_barrier (GEN_INT (3)),
+ insn);
+ init_insn_group_barriers ();
+ }
+ group_barrier_needed_p (insn);
+ prev_insn = NULL_RTX;
}
+ else if (recog_memoized (insn) >= 0)
+ prev_insn = insn;
+ need_barrier_p = (GET_CODE (insn) == CALL_INSN
+ || GET_CODE (PATTERN (insn)) == ASM_INPUT
+ || asm_noperands (PATTERN (insn)) >= 0);
}
- if (best == 6)
- abort ();
}
+}
- if (*pn_ready > 0)
- {
- int more = ia64_internal_sched_reorder (dump, sched_verbose,
- ready, pn_ready, 1,
- clock_var);
- if (more)
- return more;
- /* Did we schedule a stop? If so, finish this cycle. */
- if (sched_data.cur == sched_data.first_slot)
- return 0;
- }
+
- if (sched_verbose)
- fprintf (dump, "// Can't issue more this cycle; updating type array.\n");
+/* If the following function returns TRUE, we will use the DFA
+ insn scheduler. */
- cycle_end_fill_slots (sched_verbose ? dump : NULL);
- if (sched_verbose)
- dump_current_packet (dump);
- return 0;
+static int
+ia64_use_dfa_pipeline_interface ()
+{
+ return 1;
}
-/* We are about to issue INSN. Return the number of insns left on the
- ready queue that can be issued this cycle. */
+/* The following function returns the lookahead depth used by the
+ multipass variant of the DFA insn scheduler on the first cycle. */
static int
-ia64_variable_issue (dump, sched_verbose, insn, can_issue_more)
- FILE *dump;
- int sched_verbose;
- rtx insn;
- int can_issue_more ATTRIBUTE_UNUSED;
+ia64_first_cycle_multipass_dfa_lookahead ()
{
- enum attr_type t = ia64_safe_type (insn);
+ return (reload_completed ? 6 : 4);
+}
- if (sched_data.last_was_stop)
- {
- int t = sched_data.first_slot;
- if (t == 0)
- t = 3;
- ia64_emit_insn_before (gen_insn_group_barrier (GEN_INT (t)), insn);
- init_insn_group_barriers ();
- sched_data.last_was_stop = 0;
- }
+/* The following function initializes the variables `dfa_pre_cycle_insn'
+ and `dfa_stop_insn' and sets up the DFA state buffers. */
- if (t == TYPE_UNKNOWN)
+static void
+ia64_init_dfa_pre_cycle_insn ()
+{
+ if (temp_dfa_state == NULL)
{
- if (sched_verbose)
- fprintf (dump, "// Ignoring type %s\n", type_names[t]);
- if (GET_CODE (PATTERN (insn)) == ASM_INPUT
- || asm_noperands (PATTERN (insn)) >= 0)
- {
- /* This must be some kind of asm. Clear the scheduling state. */
- rotate_two_bundles (sched_verbose ? dump : NULL);
- if (ia64_final_schedule)
- group_barrier_needed_p (insn);
- }
- return 1;
+ dfa_state_size = state_size ();
+ temp_dfa_state = xmalloc (dfa_state_size);
+ prev_cycle_state = xmalloc (dfa_state_size);
}
+ dfa_pre_cycle_insn = make_insn_raw (gen_pre_cycle ());
+ PREV_INSN (dfa_pre_cycle_insn) = NEXT_INSN (dfa_pre_cycle_insn) = NULL_RTX;
+ recog_memoized (dfa_pre_cycle_insn);
+ dfa_stop_insn = make_insn_raw (gen_insn_group_barrier (GEN_INT (3)));
+ PREV_INSN (dfa_stop_insn) = NEXT_INSN (dfa_stop_insn) = NULL_RTX;
+ recog_memoized (dfa_stop_insn);
+}
- /* This is _not_ just a sanity check. group_barrier_needed_p will update
- important state info. Don't delete this test. */
- if (ia64_final_schedule
- && group_barrier_needed_p (insn))
- abort ();
+/* The following function returns the pseudo insn DFA_PRE_CYCLE_INSN
+ used by the DFA insn scheduler. */
- sched_data.stopbit[sched_data.cur] = 0;
- sched_data.insns[sched_data.cur] = insn;
- sched_data.types[sched_data.cur] = t;
+static rtx
+ia64_dfa_pre_cycle_insn ()
+{
+ return dfa_pre_cycle_insn;
+}
- sched_data.cur++;
- if (sched_verbose)
- fprintf (dump, "// Scheduling insn %d of type %s\n",
- INSN_UID (insn), type_names[t]);
+/* The following function returns TRUE if PRODUCER (of type ilog or
+ ld) produces the address used by CONSUMER (of type st or stf). */
- if (GET_CODE (insn) == CALL_INSN && ia64_final_schedule)
- {
- schedule_stop (sched_verbose ? dump : NULL);
- sched_data.last_was_stop = 1;
- }
+int
+ia64_st_address_bypass_p (producer, consumer)
+ rtx producer;
+ rtx consumer;
+{
+ rtx dest, reg, mem;
- return 1;
+ if (producer == NULL_RTX || consumer == NULL_RTX)
+ abort ();
+ dest = ia64_single_set (producer);
+ if (dest == NULL_RTX || (reg = SET_DEST (dest)) == NULL_RTX
+ || (GET_CODE (reg) != REG && GET_CODE (reg) != SUBREG))
+ abort ();
+ if (GET_CODE (reg) == SUBREG)
+ reg = SUBREG_REG (reg);
+ dest = ia64_single_set (consumer);
+ if (dest == NULL_RTX || (mem = SET_DEST (dest)) == NULL_RTX
+ || GET_CODE (mem) != MEM)
+ abort ();
+ return reg_mentioned_p (reg, mem);
}
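A concrete case the check above is meant to accept (an assumed example, not taken from this patch); the ld variant below works the same way on the consumer's source operand:

/* producer:  add  r14 = r32, r33   (an ialu insn setting r14)
   consumer:  st8  [r14] = r35      (r14 appears inside the MEM)

   ia64_single_set extracts both SETs, any SUBREG around r14 is
   stripped, and reg_mentioned_p (r14, [r14]) is nonzero, so the
   store-address bypass applies.  */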
-/* Free data allocated by ia64_sched_init. */
+/* The following function returns TRUE if PRODUCER (of type ilog or
+ ld) produces the address used by CONSUMER (of type ld or fld). */
-static void
-ia64_sched_finish (dump, sched_verbose)
- FILE *dump;
- int sched_verbose;
+int
+ia64_ld_address_bypass_p (producer, consumer)
+ rtx producer;
+ rtx consumer;
{
- if (sched_verbose)
- fprintf (dump, "// Finishing schedule.\n");
- rotate_two_bundles (NULL);
- free (sched_types);
- free (sched_ready);
+ rtx dest, src, reg, mem;
+
+ if (producer == NULL_RTX || consumer == NULL_RTX)
+ abort ();
+ dest = ia64_single_set (producer);
+ if (dest == NULL_RTX || (reg = SET_DEST (dest)) == NULL_RTX
+ || (GET_CODE (reg) != REG && GET_CODE (reg) != SUBREG))
+ abort ();
+ if (GET_CODE (reg) == SUBREG)
+ reg = SUBREG_REG (reg);
+ src = ia64_single_set (consumer);
+ if (src == NULL_RTX || (mem = SET_SRC (src)) == NULL_RTX)
+ abort ();
+ if (GET_CODE (mem) == UNSPEC && XVECLEN (mem, 0) > 0)
+ mem = XVECEXP (mem, 0, 0);
+ while (GET_CODE (mem) == SUBREG || GET_CODE (mem) == ZERO_EXTEND)
+ mem = XEXP (mem, 0);
+ if (GET_CODE (mem) != MEM)
+ abort ();
+ return reg_mentioned_p (reg, mem);
+}
+
+/* The following function returns TRUE if INSN produces an address for a
+ load/store insn. We place such insns into an M slot because that
+ decreases their latency. */
+
+int
+ia64_produce_address_p (insn)
+ rtx insn;
+{
+ return insn->call;
}
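A note on the one-line body above, offered as an assumption rather than something this hunk states:

/* The insn's `call' bit appears to be reused here as a scratch flag;
   it is presumably set by the new ia64_dependencies_evaluation_hook
   for insns that feed load/store addresses, so a nonzero value does
   not mean the insn is an actual call.  */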
+
/* Emit pseudo-ops for the assembler to describe predicate relations.
At present this assumes that we only consider predicate pairs to
@@ -6887,111 +7006,6 @@ emit_predicate_relation_info ()
}
}
-/* Generate a NOP instruction of type T. We will never generate L type
- nops. */
-
-static rtx
-gen_nop_type (t)
- enum attr_type t;
-{
- switch (t)
- {
- case TYPE_M:
- return gen_nop_m ();
- case TYPE_I:
- return gen_nop_i ();
- case TYPE_B:
- return gen_nop_b ();
- case TYPE_F:
- return gen_nop_f ();
- case TYPE_X:
- return gen_nop_x ();
- default:
- abort ();
- }
-}
-
-/* After the last scheduling pass, fill in NOPs. It's easier to do this
- here than while scheduling. */
-
-static void
-ia64_emit_nops ()
-{
- rtx insn;
- const struct bundle *b = 0;
- int bundle_pos = 0;
-
- for (insn = get_insns (); insn; insn = NEXT_INSN (insn))
- {
- rtx pat;
- enum attr_type t;
- pat = INSN_P (insn) ? PATTERN (insn) : const0_rtx;
- if (GET_CODE (pat) == USE || GET_CODE (pat) == CLOBBER)
- continue;
- if ((GET_CODE (pat) == UNSPEC && XINT (pat, 1) == UNSPEC_BUNDLE_SELECTOR)
- || GET_CODE (insn) == CODE_LABEL)
- {
- if (b)
- while (bundle_pos < 3)
- {
- emit_insn_before (gen_nop_type (b->t[bundle_pos]), insn);
- bundle_pos++;
- }
- if (GET_CODE (insn) != CODE_LABEL)
- b = bundle + INTVAL (XVECEXP (pat, 0, 0));
- else
- b = 0;
- bundle_pos = 0;
- continue;
- }
- else if (GET_CODE (pat) == UNSPEC_VOLATILE
- && XINT (pat, 1) == UNSPECV_INSN_GROUP_BARRIER)
- {
- int t = INTVAL (XVECEXP (pat, 0, 0));
- if (b)
- while (bundle_pos < t)
- {
- emit_insn_before (gen_nop_type (b->t[bundle_pos]), insn);
- bundle_pos++;
- }
- continue;
- }
-
- if (bundle_pos == 3)
- b = 0;
-
- if (b && INSN_P (insn))
- {
- t = ia64_safe_type (insn);
- if (asm_noperands (PATTERN (insn)) >= 0
- || GET_CODE (PATTERN (insn)) == ASM_INPUT)
- {
- while (bundle_pos < 3)
- {
- emit_insn_before (gen_nop_type (b->t[bundle_pos]), insn);
- bundle_pos++;
- }
- continue;
- }
-
- if (t == TYPE_UNKNOWN)
- continue;
- while (bundle_pos < 3)
- {
- if (t == b->t[bundle_pos]
- || (t == TYPE_A && (b->t[bundle_pos] == TYPE_M
- || b->t[bundle_pos] == TYPE_I)))
- break;
-
- emit_insn_before (gen_nop_type (b->t[bundle_pos]), insn);
- bundle_pos++;
- }
- if (bundle_pos < 3)
- bundle_pos++;
- }
- }
-}
-
/* Perform machine dependent operations on the rtl chain INSNS. */
void
@@ -7014,14 +7028,91 @@ ia64_reorg (insns)
{
timevar_push (TV_SCHED2);
ia64_final_schedule = 1;
+
+ initiate_bundle_states ();
+ ia64_nop = make_insn_raw (gen_nop ());
+ PREV_INSN (ia64_nop) = NEXT_INSN (ia64_nop) = NULL_RTX;
+ recog_memoized (ia64_nop);
+ clocks_length = get_max_uid () + 1;
+ stops_p = (char *) xmalloc (clocks_length);
+ memset (stops_p, 0, clocks_length);
+ if (ia64_tune == PROCESSOR_ITANIUM)
+ {
+ clocks = (int *) xmalloc (clocks_length * sizeof (int));
+ memset (clocks, 0, clocks_length * sizeof (int));
+ add_cycles = (int *) xmalloc (clocks_length * sizeof (int));
+ memset (add_cycles, 0, clocks_length * sizeof (int));
+ }
+ if (ia64_tune == PROCESSOR_ITANIUM2)
+ {
+ pos_1 = get_cpu_unit_code ("2_1");
+ pos_2 = get_cpu_unit_code ("2_2");
+ pos_3 = get_cpu_unit_code ("2_3");
+ pos_4 = get_cpu_unit_code ("2_4");
+ pos_5 = get_cpu_unit_code ("2_5");
+ pos_6 = get_cpu_unit_code ("2_6");
+ _0mii_ = get_cpu_unit_code ("2b_0mii.");
+ _0mmi_ = get_cpu_unit_code ("2b_0mmi.");
+ _0mfi_ = get_cpu_unit_code ("2b_0mfi.");
+ _0mmf_ = get_cpu_unit_code ("2b_0mmf.");
+ _0bbb_ = get_cpu_unit_code ("2b_0bbb.");
+ _0mbb_ = get_cpu_unit_code ("2b_0mbb.");
+ _0mib_ = get_cpu_unit_code ("2b_0mib.");
+ _0mmb_ = get_cpu_unit_code ("2b_0mmb.");
+ _0mfb_ = get_cpu_unit_code ("2b_0mfb.");
+ _0mlx_ = get_cpu_unit_code ("2b_0mlx.");
+ _1mii_ = get_cpu_unit_code ("2b_1mii.");
+ _1mmi_ = get_cpu_unit_code ("2b_1mmi.");
+ _1mfi_ = get_cpu_unit_code ("2b_1mfi.");
+ _1mmf_ = get_cpu_unit_code ("2b_1mmf.");
+ _1bbb_ = get_cpu_unit_code ("2b_1bbb.");
+ _1mbb_ = get_cpu_unit_code ("2b_1mbb.");
+ _1mib_ = get_cpu_unit_code ("2b_1mib.");
+ _1mmb_ = get_cpu_unit_code ("2b_1mmb.");
+ _1mfb_ = get_cpu_unit_code ("2b_1mfb.");
+ _1mlx_ = get_cpu_unit_code ("2b_1mlx.");
+ }
+ else
+ {
+ pos_1 = get_cpu_unit_code ("1_1");
+ pos_2 = get_cpu_unit_code ("1_2");
+ pos_3 = get_cpu_unit_code ("1_3");
+ pos_4 = get_cpu_unit_code ("1_4");
+ pos_5 = get_cpu_unit_code ("1_5");
+ pos_6 = get_cpu_unit_code ("1_6");
+ _0mii_ = get_cpu_unit_code ("1b_0mii.");
+ _0mmi_ = get_cpu_unit_code ("1b_0mmi.");
+ _0mfi_ = get_cpu_unit_code ("1b_0mfi.");
+ _0mmf_ = get_cpu_unit_code ("1b_0mmf.");
+ _0bbb_ = get_cpu_unit_code ("1b_0bbb.");
+ _0mbb_ = get_cpu_unit_code ("1b_0mbb.");
+ _0mib_ = get_cpu_unit_code ("1b_0mib.");
+ _0mmb_ = get_cpu_unit_code ("1b_0mmb.");
+ _0mfb_ = get_cpu_unit_code ("1b_0mfb.");
+ _0mlx_ = get_cpu_unit_code ("1b_0mlx.");
+ _1mii_ = get_cpu_unit_code ("1b_1mii.");
+ _1mmi_ = get_cpu_unit_code ("1b_1mmi.");
+ _1mfi_ = get_cpu_unit_code ("1b_1mfi.");
+ _1mmf_ = get_cpu_unit_code ("1b_1mmf.");
+ _1bbb_ = get_cpu_unit_code ("1b_1bbb.");
+ _1mbb_ = get_cpu_unit_code ("1b_1mbb.");
+ _1mib_ = get_cpu_unit_code ("1b_1mib.");
+ _1mmb_ = get_cpu_unit_code ("1b_1mmb.");
+ _1mfb_ = get_cpu_unit_code ("1b_1mfb.");
+ _1mlx_ = get_cpu_unit_code ("1b_1mlx.");
+ }
schedule_ebbs (rtl_dump_file);
+ finish_bundle_states ();
+ if (ia64_tune == PROCESSOR_ITANIUM)
+ {
+ free (add_cycles);
+ free (clocks);
+ }
+ free (stops_p);
+ emit_insn_group_barriers (rtl_dump_file, insns);
+
ia64_final_schedule = 0;
timevar_pop (TV_SCHED2);
-
- /* This relies on the NOTE_INSN_BASIC_BLOCK notes to be in the same
- place as they were during scheduling. */
- emit_insn_group_barriers (rtl_dump_file, insns);
- ia64_emit_nops ();
}
else
emit_all_insn_group_barriers (rtl_dump_file, insns);