summaryrefslogtreecommitdiff
path: root/asmcomp/ia64/selection.ml
blob: 6be4a18cdb9e7665cb2456e01f48b12f2af99bb4 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
(***********************************************************************)
(*                                                                     *)
(*                           Objective Caml                            *)
(*                                                                     *)
(*            Xavier Leroy, projet Cristal, INRIA Rocquencourt         *)
(*                                                                     *)
(*  Copyright 2000 Institut National de Recherche en Informatique et   *)
(*  en Automatique.  All rights reserved.  This file is distributed    *)
(*  under the terms of the Q Public License version 1.0.               *)
(*                                                                     *)
(***********************************************************************)

(* $Id$ *)

(* Instruction selection for the IA64 processor *)

open Misc
open Cmm
open Reg
open Arch
open Mach

(* Helper function for add selection *)

(* Rearrange the operand list of an integer addition.  A leading integer
   constant is moved to the second position, and a constant found inside
   a nested Caddi operand is pulled out to become the last operand of
   the outer addition.  A nested addition appearing as the right operand
   is regrouped to the left.  Operand lists matching none of these
   shapes are returned unchanged.  Clauses are tried from top to bottom,
   and the alternatives of the or-pattern from left to right. *)
let reassociate_add args =
  match args with
  | [Cconst_int k; e] ->
      [e; Cconst_int k]
  | [Cop(Caddi, [e1; Cconst_int k]); e2]
  | [Cop(Caddi, [Cconst_int k; e1]); e2]
  | [e1; Cop(Caddi, [Cconst_int k; e2])] ->
      [Cop(Caddi, [e1; e2]); Cconst_int k]
  | [e1; Cop(Caddi, [e2; e3])] ->
      [Cop(Caddi, [e1; e2]); e3]
  | _ ->
      args

(* Helper function for mult-immediate selection *)

(* [count_one_bits n] is the number of bits set to 1 in the machine
   representation of [n].  The shift is logical (lsr), so the loop also
   terminates on negative arguments, whose sign bit is counted like any
   other bit. *)
let count_one_bits n =
  let rec loop ones remaining =
    if remaining = 0 then ones
    else loop (ones + (remaining land 1)) (remaining lsr 1)
  in
  loop 0 n

(* Instruction selector for the IA64 target.  It inherits from
   Selectgen.selector_generic and supplies the immediate-operand ranges,
   the addressing mode, the recognition of target-specific operations
   (select_operation) and a dedicated store sequence (emit_stores). *)
class selector = object (self)

inherit Selectgen.selector_generic as super

(* Range of immediate arguments:
     add                14-bit signed
     sub                turned into add
     sub reversed       8-bit signed
     mul                at most 16 "one" bits
     div, mod           powers of 2
     and, or, xor       8-bit signed
     lsl, lsr, asr      6-bit unsigned
     cmp                8-bit signed
   For is_immediate, we put 8-bit signed and treat adds specially
   (selectgen already does the right thing for shifts) *)

(* Generic immediate test: true for the 8-bit signed range -128 .. 127. *)
method is_immediate n = n >= -128 && n < 128

(* Immediate test used only for additions (and subtractions rewritten as
   additions): true for the 14-bit signed range -8192 .. 8191. *)
method is_immediate_add n = n >= -8192 && n < 8192

(* Addressing mode selection: always answers Iindexed and hands back the
   address expression unchanged, so nothing is folded into the mode. *)
method select_addressing arg = (Iindexed, arg)

(* Select a machine operation for the Cmm operation [op] applied to
   [args].  Returns a pair (machine operation, operand expressions).
   Address arithmetic (Cadda, Csuba) is first treated as integer
   arithmetic, and the operands of additions are rearranged by
   reassociate_add before matching.  The clauses below are tried in
   order, so the more specific shapes must stay ahead of the general
   ones.  Anything not recognized is delegated to the inherited method
   with the ORIGINAL [op] and [args], not the normalized ones. *)
method! select_operation op args =
  let norm_op =
    match op with Cadda -> Caddi | Csuba -> Csubi | _ -> op in
  let norm_args =
    match norm_op with Caddi -> reassociate_add args | _ -> args in
  match (norm_op, norm_args) with
  (* Recognize x + y + 1, (x lsl 1) + 1 and x - y - 1 *)
  | (Caddi, [Cop(Caddi, [arg1; arg2]); Cconst_int 1]) ->
      (Ispecific Iadd1, [arg1; arg2])
  (* NOTE(review): this clause gives Iadd1 a single operand; the consumer
     of Iadd1 is not visible here -- confirm it accepts that form. *)
  | (Caddi, [Cop(Clsl, [arg1; Cconst_int 1]); Cconst_int 1]) ->
      (Ispecific Iadd1, [arg1])
  | (Csubi, [Cop(Csubi, [arg1; arg2]); Cconst_int 1]) ->
      (Ispecific Isub1, [arg1; arg2])
  (* Same operation written as (x - 1) - y *)
  | (Csubi, [Cop(Csubi, [arg1; Cconst_int 1]); arg2]) ->
      (Ispecific Isub1, [arg1; arg2])
  (* Recognize add immediate *)
  | (Caddi, [arg; Cconst_int n]) when self#is_immediate_add n ->
      (Iintop_imm(Iadd, n), [arg])
  (* Turn sub immediate into add immediate; the range check is made on
     the negated constant, which is the value actually emitted *)
  | (Csubi, [arg; Cconst_int n]) when self#is_immediate_add (-n) ->
      (Iintop_imm(Iadd, -n), [arg])
  (* Recognize imm - arg (constant on the left, 8-bit signed range) *)
  | (Csubi, [Cconst_int n; arg]) when self#is_immediate n ->
      (Iintop_imm(Isub, n), [arg])
  (* Recognize shift-add operations: (arg1 lsl shift) + arg2 with a shift
     count of 1 to 4, in either operand order *)
  | (Caddi, [arg2; Cop(Clsl, [arg1; Cconst_int(1|2|3|4 as shift)])]) ->
      (Ispecific(Ishladd shift), [arg1; arg2])
  | (Caddi, [Cop(Clsl, [arg1; Cconst_int(1|2|3|4 as shift)]); arg2]) ->
      (Ispecific(Ishladd shift), [arg1; arg2])
  (* Recognize truncation/normalization of 64-bit integers to 32 bits,
     written as a left shift by 32 followed by an arithmetic right shift
     by 32 *)
  | (Casr, [Cop(Clsl, [arg; Cconst_int 32]); Cconst_int 32]) ->
      (Ispecific (Isignextend 4), [arg])
  (* Recognize x * cst and cst * x *)
  | (Cmuli, [arg; Cconst_int n]) ->
      self#select_imul_imm arg n
  | (Cmuli, [Cconst_int n; arg]) ->
      self#select_imul_imm arg n
  (* Prevent the recognition of (x / cst) and (x % cst) when cst is not
     a power of 2, which do not correspond to an instruction.
     Turn general division and modulus into calls to C library functions.
     The modulus clause additionally excludes cst = 1, which therefore
     goes through the library call as well. *)
  | (Cdivi, [arg; Cconst_int n]) when n = 1 lsl (Misc.log2 n) ->
      (Iintop_imm(Idiv, n), [arg])
  | (Cdivi, _) ->
      (Iextcall("__divdi3", false), args)
  | (Cmodi, [arg; Cconst_int n]) when n = 1 lsl (Misc.log2 n) && n <> 1 ->
      (Iintop_imm(Imod, n), [arg])
  | (Cmodi, _) ->
      (Iextcall("__moddi3", false), args)
  (* Recognize mult-add and mult-sub instructions.  In every case the
     operands are passed as [factor; factor; other term]:
       Imultaddf   arg1 * arg2 + arg3   (either operand order)
       Imultsubf   arg1 * arg2 - arg3
       Isubmultf   arg3 - arg1 * arg2 *)
  | (Caddf, [Cop(Cmulf, [arg1; arg2]); arg3]) ->
      (Ispecific Imultaddf, [arg1; arg2; arg3])
  | (Caddf, [arg3; Cop(Cmulf, [arg1; arg2])]) ->
      (Ispecific Imultaddf, [arg1; arg2; arg3])
  | (Csubf, [Cop(Cmulf, [arg1; arg2]); arg3]) ->
      (Ispecific Imultsubf, [arg1; arg2; arg3])
  | (Csubf, [arg3; Cop(Cmulf, [arg1; arg2])]) ->
      (Ispecific Isubmultf, [arg1; arg2; arg3])
  (* Use default selector otherwise *)
  | _ ->
      super#select_operation op args

(* Select a multiplication of [arg] by the constant [n].  When [n] has at
   most 16 bits set it is kept as an immediate operand of Imul; otherwise
   a general two-operand Imul is selected with the constant passed as an
   ordinary operand expression. *)
method private select_imul_imm arg n =
  if count_one_bits n <= 16
  then (Iintop_imm(Imul, n), [arg])
  else (Iintop Imul, [arg; Cconst_int n])

(* To palliate the lack of addressing with displacement, multiple
   stores to the address r are translated as follows
   (t1 and t2 are two temp regs)
      t1 := r - 8
      t2 := r
      compute data1 in reg1
      compute data2 in reg2
      store reg1 at t1 and increment t1 by 16
      store reg2 at t2 and increment t2 by 16
      compute data3 in reg3
      compute data4 in reg4
      store reg3 at t1 and increment t1 by 16
      store reg4 at t2 and increment t2 by 16
      ...
    Note: we use two temp regs and perform stores by groups of 2
    in order to expose more instruction-level parallelism.
    Arguments: [env] is passed through to emit_expr, [data] is the list
    of expressions to store, [regs_addr] holds the address r. *)
method! emit_stores env data regs_addr =
  let t1 = Reg.create Addr and t2 = Reg.create Addr in
  (* t1 := r - 8 and t2 := r *)
  self#insert (Iop(Iintop_imm(Iadd, -8))) regs_addr [|t1|];
  self#insert (Iop Imove) regs_addr [|t2|];
  (* Store components by batch of 2 *)
  (* backlog holds the first register of a pair until its partner is
     available *)
  let backlog = ref None in
  let do_store r =
    match !backlog with
      None -> (* keep it for later *)
        backlog := Some r
    | Some r' -> (* store r' at t1 and r at t2 *)
        self#insert (Iop(Ispecific(Istoreincr 16))) [| t1; r' |] [| t1 |];
        self#insert (Iop(Ispecific(Istoreincr 16))) [| t2; r  |] [| t2 |];
        backlog := None in
  (* Emit each data expression in list order and feed every register it
     produces to do_store; an expression that yields no result is
     treated as impossible here *)
  List.iter
    (fun exp ->
      match self#emit_expr env exp with
        None -> assert false
      | Some regs -> Array.iter do_store regs)
    data;
  (* Store the backlog if any: with an odd number of registers the last
     one is still pending and goes through t1 *)
  begin match !backlog with
    None -> ()
  | Some r -> self#insert (Iop(Ispecific(Istoreincr 16))) [| t1; r |] [| t1 |]
  end;
  (* Insert an init barrier *)
  self#insert (Iop(Ispecific Iinitbarrier)) [||] [||]
end

(* Entry point of instruction selection: run emit_fundecl on the function
   declaration [f] using a freshly created selector object, so that each
   call works with its own selector instance. *)
let fundecl f =
  let sel = new selector in
  sel#emit_fundecl f