gcc/config/sparc/m8.md


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242

;; Scheduling description for the SPARC M8.
;;   Copyright (C) 2017 Free Software Foundation, Inc.
;;
;; This file is part of GCC.
;;
;; GCC is free software; you can redistribute it and/or modify
;; it under the terms of the GNU General Public License as published by
;; the Free Software Foundation; either version 3, or (at your option)
;; any later version.
;;
;; GCC is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;; GNU General Public License for more details.
;;
;; You should have received a copy of the GNU General Public License
;; along with GCC; see the file COPYING3.  If not see
;; <http://www.gnu.org/licenses/>.

;; Things to improve:
;;
;; - Store instructions are implemented by micro-ops, one of which
;;   generates the store address and is executed in the store address
;;   generation unit in the slot0.  We need to model that.
;;
;; - There are two V3 pipes connected to different slots.  The current
;;   implementation assumes that all the instructions executing in a
;;   V3 pipe are issued to the unit in slot3.
;;
;; - Single-issue ALU operations incur an additional cycle of latency to
;;   slot 0 and slot 1 instructions.  This is not currently reflected
;;   in the DFA.

;; The single DFA automaton used by the M8 pipeline description; the
;; issue-slot units of this file are declared as members of it.
(define_automaton "m8_0")

;; The S5 core has two dual-issue queues, PQLS and PQEX.  Each queue
;; is divided into two slots: PQLS corresponds to slots 0 and 1, and
;; PQEX corresponds to slots 2 and 3.  The core can issue 4
;; instructions per-cycle, and up to 4 instructions are committed each
;; cycle.
;;
;;                            
;;                   m8_slot0  - Load Unit.
;;                             - Store address gen. Unit.
;;                                                       
;;                            
;;   === PQLS ==>    m8_slot1  - Store data unit.
;;                             - Branch unit.
;;                                            
;;                             
;;   === PQEX ==>    m8_slot2  - Integer Unit (EXU2).                     
;;                             - 3-cycles Crypto Unit (SPU2).
;;                                                     
;;                   m8_slot3  - Integer Unit (EXU3).
;;                             - 3-cycles Crypto Unit (SPU3).
;;                             - Floating-point and graphics unit (FPG).
;;                             - Long-latency Crypto Unit.
;;                             - Oracle Numbers Unit (ONU).

;; One CPU unit per issue slot, each a member of the "m8_0" automaton.
(define_cpu_unit "m8_slot0" "m8_0")
(define_cpu_unit "m8_slot1" "m8_0")
(define_cpu_unit "m8_slot2" "m8_0")
(define_cpu_unit "m8_slot3" "m8_0")

;; Some instructions stall the pipeline and prevent any other
;; instruction from being issued in the same cycle.  We assume the same
;; for multi-instruction insns.  The reservation below models this by
;; claiming all four slots at once.

(define_reservation "m8_single_issue" "m8_slot0 + m8_slot1 + m8_slot2 + m8_slot3")

;; Insns of the types listed here have a latency of 1 and use the
;; "m8_single_issue" reservation, i.e. they occupy every slot for the
;; cycle in which they issue.
(define_insn_reservation "m8_single" 1
  (and (eq_attr "cpu" "m8")
       (eq_attr "type" "multi,savew,flushw,trap,bmask"))
  "m8_single_issue")

;; Most of the instructions executing in the integer units have a
;; latency of 1.  They may issue in either of the two PQEX slots.
;;
;; NOTE(review): "bmask" is intentionally not listed here.  It is
;; already claimed by the "m8_single" reservation earlier in this
;; file, and naming the same type in two reservations for the same
;; cpu is ambiguous.  If bmask is in fact a plain integer-unit insn,
;; move it here and drop it from "m8_single" instead.

(define_insn_reservation "m8_integer" 1
  (and (eq_attr "cpu" "m8")
       (eq_attr "type" "ialu,ialuX,shift,cmove,compare"))
  "(m8_slot2 | m8_slot3)")

;; Flushing the instruction memory takes 27 cycles.  The insn issues
;; in slot2 or slot3; no unit is held during the remaining 26 cycles.

(define_insn_reservation "m8_iflush" 27
  (and (eq_attr "cpu" "m8")
       (eq_attr "type" "iflush"))
  "(m8_slot2 | m8_slot3), nothing*26")

;; The integer multiplication instructions have a latency of 10 cycles
;; and execute in integer units.
;;
;; Likewise for array*, edge* and pdistn instructions.
;;
;; However, the latency is only 9 cycles if the consumer of the
;; operation is also capable of 9 cycles latency.  We model this with
;; a bypass.
;;
;; The reservation is one issue cycle followed by (latency - 1) cycles
;; in which no unit is held, which is the convention followed by the
;; other multi-cycle reservations in this file.

(define_insn_reservation "m8_imul" 10
  (and (eq_attr "cpu" "m8")
       (eq_attr "type" "imul,array,edge,edgen,pdistn"))
  "(m8_slot2 | m8_slot3), nothing*9")

(define_bypass 9 "m8_imul" "m8_imul")

;; The integer division instructions `sdiv' and `udivx' have a latency
;; of 30 cycles and execute in integer units.  The insn issues in
;; slot2 or slot3; no unit is held during the remaining 29 cycles.

(define_insn_reservation "m8_idiv" 30
  (and (eq_attr "cpu" "m8")
       (eq_attr "type" "idiv"))
  "(m8_slot2 | m8_slot3), nothing*29")

;; Integer and floating-point loads alike have a latency of only 3
;; cycles, and execute in slot0.
;;
;; Misaligned load instructions feature a latency of 11 cycles.
;;
;; The prefetch instruction also executes in the load unit, but its
;; latency is only 1 cycle.
;;
;; This reservation covers "load" insns of subtype "regular" together
;; with every "sload" and "fpload" insn.

(define_insn_reservation "m8_load" 3
  (and (eq_attr "cpu" "m8")
       (ior (and (eq_attr "subtype" "regular")
                 (eq_attr "type" "load"))
            (eq_attr "type" "sload,fpload")))
  "m8_slot0, nothing*2")

;; (define_insn_reservation "m8_load_misalign" 11
;;  (and (eq_attr "cpu" "m8")
;;       (eq_attr "type" "load_mis,fpload_mis"))
;;  "m8_slot0, nothing*10")

;; "load" insns of subtype "prefetch": latency 1, holding slot0 for
;; the issue cycle.
(define_insn_reservation "m8_prefetch" 1
  (and (eq_attr "cpu" "m8")
       (and (eq_attr "type" "load")
            (eq_attr "subtype" "prefetch")))
  "m8_slot0")

;; Both integer and floating-point store instructions have a latency
;; of 1 cycle, and execute in the store data unit in slot1.
;;
;; However, misaligned store instructions feature a latency of 3
;; cycles.  They are not modelled separately by this reservation.

(define_insn_reservation "m8_store" 1
  (and (eq_attr "cpu" "m8")
       (eq_attr "type" "store,fpstore"))
  "m8_slot1")

;; (define_insn_reservation "m8_store_misalign" 3
;;   (and (eq_attr "cpu" "m8")
;;        (eq_attr "type" "store_mis,fpstore_mis"))
;;   "m8_slot1, nothing*2")

;; Control-transfer instructions (branches, compare-and-branch, calls
;; and returns) execute in the Branch Unit in slot1, with a latency of
;; 1 cycle.

(define_insn_reservation "m8_cti" 1
  (and (eq_attr "cpu" "m8")
       (ior (eq_attr "type" "cbcond,uncond_cbcond,branch,uncond_branch")
            (eq_attr "type" "call,sibcall,call_no_delay_slot,return")))
  "m8_slot1")

;; Many of the instructions handled by the floating-point and graphics
;; unit serving slot3 have a default latency of 9 cycles.  This covers
;; the listed insn types plus "fga" insns of subtype "fpu".

(define_insn_reservation "m8_fp" 9
  (and (eq_attr "cpu" "m8")
       (ior (eq_attr "type" "fp,fpmove,fpcmove,fpcrmove,fpcmp,fpmul")
            (eq_attr "type" "fgm_pack,fgm_mul,pdist")
            (and (eq_attr "type" "fga")
                 (eq_attr "subtype" "fpu"))))
  "m8_slot3, nothing*8")

;; Floating-point division and floating-point square-root instructions
;; have high latencies.  They execute in the FGU.
;;
;; Each insn type gets its own reservation because each has its own
;; latency.  All of them hold slot3 for the issue cycle only; the
;; remaining (latency - 1) cycles reserve no unit.

;; Type "fpdivs": 26 cycles.
(define_insn_reservation "m8_fpdivs" 26
  (and (eq_attr "cpu" "m8")
       (eq_attr "type" "fpdivs"))
  "m8_slot3, nothing*25")

;; Type "fpsqrts": 33 cycles.
(define_insn_reservation "m8_fpsqrts" 33
  (and (eq_attr "cpu" "m8")
       (eq_attr "type" "fpsqrts"))
  "m8_slot3, nothing*32")

;; Type "fpdivd": 30 cycles.
(define_insn_reservation "m8_fpdivd" 30
  (and (eq_attr "cpu" "m8")
       (eq_attr "type" "fpdivd"))
  "m8_slot3, nothing*29")

;; Type "fpsqrtd": 41 cycles.
(define_insn_reservation "m8_fpsqrtd" 41
  (and (eq_attr "cpu" "m8")
       (eq_attr "type" "fpsqrtd"))
  "m8_slot3, nothing*40")

;; SIMD VIS instructions executing in the Floating-point and graphics
;; unit (FPG) in slot3 usually have a latency of 5 cycles.
;;
;; For many of them, though, the latency drops to 3 cycles when the
;; consumer can also be executed in 3 cycles; a bypass models this.
;; In those cases the instructions are executed in one of the two
;; 3-cycle crypto units (SPU, also known as "v3-pipes") in slots 2 and
;; 3, although the reservation below only ever claims slot3.

(define_insn_reservation "m8_vis" 5
  (and (eq_attr "cpu" "m8")
       (ior (and (eq_attr "type" "visl")
                 (eq_attr "subtype" "single"))
            (and (eq_attr "type" "vismv")
                 (eq_attr "subtype" "single,movstouw"))
            (and (eq_attr "type" "fga")
                 (eq_attr "subtype" "maxmin,cmask,other"))
            (eq_attr "type" "lzd,viscmp")))
  "m8_slot3, nothing*4")

;; Producer and consumer both in "m8_vis": 3 cycles instead of 5.
(define_bypass 3 "m8_vis" "m8_vis")

;; "gsr" insns of subtype "alignaddr": latency 5, issued in slot3.
(define_insn_reservation "m8_gsr" 5
  (and (eq_attr "cpu" "m8")
       (and (eq_attr "type" "gsr")
            (eq_attr "subtype" "alignaddr")))
  "m8_slot3, nothing*4")

;; A handful of VIS instructions complete in a single cycle.  They
;; hold slot3 for the issue cycle.

(define_insn_reservation "m8_vis_1cycle" 1
  (and (eq_attr "cpu" "m8")
       (ior (and (eq_attr "type" "fga")
                 (eq_attr "subtype" "addsub64"))
            (and (eq_attr "type" "visl")
                 (eq_attr "subtype" "double"))
            (and (eq_attr "type" "vismv")
                 (eq_attr "subtype" "double,movxtod,movdtox"))))
  "m8_slot3")

;; Accesses (reads and writes) to the gsr register take more than 70
;; cycles; this is modelled with a latency of 70, issuing in slot3.

(define_insn_reservation "m8_gsr_reg" 70
  (and (eq_attr "cpu" "m8")
       (and (eq_attr "type" "gsr")
            (eq_attr "subtype" "reg")))
  "m8_slot3, nothing*69")