// Copyright 2014 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "src/compiler/backend/gap-resolver.h"

#include <algorithm>
#include <set>

#include "src/base/enum-set.h"
#include "src/codegen/register-configuration.h"

namespace v8 {
namespace internal {
namespace compiler {

namespace {

// Splits a FP move between two location operands into the equivalent series of
// moves between smaller sub-operands, e.g. a double move to two single moves.
// This helps reduce the number of cycles that would normally occur under FP
// aliasing, and makes swaps much easier to implement.
MoveOperands* Split(MoveOperands* move, MachineRepresentation smaller_rep,
ParallelMove* moves) {
DCHECK(kFPAliasing == AliasingKind::kCombine);
// Splitting is only possible when the slot size is the same as float size.
DCHECK_EQ(kSystemPointerSize, kFloatSize);
const LocationOperand& src_loc = LocationOperand::cast(move->source());
const LocationOperand& dst_loc = LocationOperand::cast(move->destination());
MachineRepresentation dst_rep = dst_loc.representation();
DCHECK_NE(smaller_rep, dst_rep);
auto src_kind = src_loc.location_kind();
auto dst_kind = dst_loc.location_kind();
int aliases =
1 << (ElementSizeLog2Of(dst_rep) - ElementSizeLog2Of(smaller_rep));
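  // For example, splitting a kFloat64 move (ElementSizeLog2Of == 3) into
  // kFloat32 fragments (ElementSizeLog2Of == 2) gives aliases = 1 << (3 - 2)
  // = 2, and splitting a kSimd128 move into kFloat32 fragments gives
  // 1 << (4 - 2) = 4.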
int base = -1;
USE(base);
DCHECK_EQ(aliases, RegisterConfiguration::Default()->GetAliases(
dst_rep, 0, smaller_rep, &base));
int src_index = -1;
int slot_size = (1 << ElementSizeLog2Of(smaller_rep)) / kSystemPointerSize;
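  // With kSystemPointerSize == kFloatSize (checked above), a kFloat32 fragment
  // occupies a single stack slot and a kFloat64 fragment occupies two.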
int src_step = 1;
if (src_kind == LocationOperand::REGISTER) {
src_index = src_loc.register_code() * aliases;
} else {
src_index = src_loc.index();
// For operands that occupy multiple slots, the index refers to the last
// slot. On little-endian architectures, we start at the high slot and use a
// negative step so that register-to-slot moves are in the correct order.
src_step = -slot_size;
}
int dst_index = -1;
int dst_step = 1;
if (dst_kind == LocationOperand::REGISTER) {
dst_index = dst_loc.register_code() * aliases;
} else {
dst_index = dst_loc.index();
dst_step = -slot_size;
}
// Reuse 'move' for the first fragment. It is not pending.
move->set_source(AllocatedOperand(src_kind, smaller_rep, src_index));
move->set_destination(AllocatedOperand(dst_kind, smaller_rep, dst_index));
// Add the remaining fragment moves.
for (int i = 1; i < aliases; ++i) {
src_index += src_step;
dst_index += dst_step;
moves->AddMove(AllocatedOperand(src_kind, smaller_rep, src_index),
AllocatedOperand(dst_kind, smaller_rep, dst_index));
}
// Return the first fragment.
return move;
}

enum MoveOperandKind : uint8_t { kConstant, kGpReg, kFpReg, kStack };

MoveOperandKind GetKind(const InstructionOperand& move) {
if (move.IsConstant()) return kConstant;
LocationOperand loc_op = LocationOperand::cast(move);
if (loc_op.location_kind() != LocationOperand::REGISTER) return kStack;
return IsFloatingPoint(loc_op.representation()) ? kFpReg : kGpReg;
}

} // namespace

void GapResolver::Resolve(ParallelMove* moves) {
base::EnumSet<MoveOperandKind, uint8_t> source_kinds;
base::EnumSet<MoveOperandKind, uint8_t> destination_kinds;
// Remove redundant moves, collect source kinds and destination kinds to
// detect simple non-overlapping moves, and collect FP move representations if
// aliasing is non-simple.
int fp_reps = 0;
size_t nmoves = moves->size();
for (size_t i = 0; i < nmoves;) {
MoveOperands* move = (*moves)[i];
if (move->IsRedundant()) {
nmoves--;
if (i < nmoves) (*moves)[i] = (*moves)[nmoves];
continue;
}
i++;
source_kinds.Add(GetKind(move->source()));
destination_kinds.Add(GetKind(move->destination()));
if (kFPAliasing == AliasingKind::kCombine &&
move->destination().IsFPRegister()) {
fp_reps |= RepresentationBit(
LocationOperand::cast(move->destination()).representation());
}
}
if (nmoves != moves->size()) moves->resize(nmoves);
if ((source_kinds & destination_kinds).empty() || moves->size() < 2) {
// Fast path for non-conflicting parallel moves.
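    // For example, if every source is a constant or register and every
    // destination is a stack slot, the kind sets are disjoint, so no move can
    // clobber another move's source and the moves can be assembled in any
    // order.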
for (MoveOperands* move : *moves) {
assembler_->AssembleMove(&move->source(), &move->destination());
}
return;
}
if (kFPAliasing == AliasingKind::kCombine) {
if (fp_reps && !base::bits::IsPowerOfTwo(fp_reps)) {
// Start with the smallest FP moves, so we never encounter smaller moves
// in the middle of a cycle of larger moves.
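      // I.e. moves whose destination is a kFloat32 register are resolved
      // first; any larger FP move that blocks them is broken into kFloat32
      // fragments by Split.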
if ((fp_reps & RepresentationBit(MachineRepresentation::kFloat32)) != 0) {
split_rep_ = MachineRepresentation::kFloat32;
for (size_t i = 0; i < moves->size(); ++i) {
auto move = (*moves)[i];
if (!move->IsEliminated() && move->destination().IsFloatRegister())
PerformMove(moves, move);
}
}
if ((fp_reps & RepresentationBit(MachineRepresentation::kFloat64)) != 0) {
split_rep_ = MachineRepresentation::kFloat64;
for (size_t i = 0; i < moves->size(); ++i) {
auto move = (*moves)[i];
if (!move->IsEliminated() && move->destination().IsDoubleRegister())
PerformMove(moves, move);
}
}
}
split_rep_ = MachineRepresentation::kSimd128;
}
for (size_t i = 0; i < moves->size(); ++i) {
auto move = (*moves)[i];
if (!move->IsEliminated()) PerformMove(moves, move);
}
}

void GapResolver::PerformMove(ParallelMove* moves, MoveOperands* move) {
// {PerformMoveHelper} assembles all the moves that block {move} but are not
// blocked by {move} (directly or indirectly). By the assumptions described in
// {PerformMoveHelper}, at most one cycle is left. If it exists, it is
// returned and we assemble it below.
auto cycle = PerformMoveHelper(moves, move);
if (!cycle.has_value()) return;
DCHECK_EQ(cycle->back(), move);
if (cycle->size() == 2) {
// A cycle of size two where the two moves have the same machine
// representation is a swap. For this case, call {AssembleSwap} which can
// generate better code than the generic algorithm below in some cases.
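    // For example, the two-move cycle {r0 <- r1, r1 <- r0} with matching
    // representations is emitted as a single swap instead of three moves
    // through a temporary location.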
MoveOperands* move2 = cycle->front();
MachineRepresentation rep =
LocationOperand::cast(move->source()).representation();
MachineRepresentation rep2 =
LocationOperand::cast(move2->source()).representation();
if (rep == rep2) {
InstructionOperand* source = &move->source();
InstructionOperand* destination = &move->destination();
// Ensure source is a register or both are stack slots, to limit swap
// cases.
if (source->IsAnyStackSlot()) {
std::swap(source, destination);
}
assembler_->AssembleSwap(source, destination);
move->Eliminate();
move2->Eliminate();
return;
}
}
// Generic move-cycle algorithm. The cycle of size n is ordered such that the
// move at index i % n blocks the move at index (i + 1) % n.
// - Move the source of the last move to a platform-specific temporary
// location.
// - Assemble the remaining moves from left to right. The first move was
// unblocked by the temporary location, and each move unblocks the next one.
// - Move the temporary location to the last move's destination, thereby
// completing the cycle.
// To ensure that the temporary location does not conflict with any scratch
// register used during the move cycle, the platform implements
// {SetPendingMove}, which marks the registers needed for the given moves.
// {MoveToTempLocation} will then choose the location accordingly.
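  // For example, for the cycle {a <- b, b <- c, c <- a}, ordered so that each
  // move blocks the next, the source a of the last move is saved to the
  // temporary location, a <- b and b <- c are then assembled, and the
  // temporary value is finally moved into c.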
MachineRepresentation rep =
LocationOperand::cast(move->source()).representation();
cycle->pop_back();
for (auto* pending_move : *cycle) {
assembler_->SetPendingMove(pending_move);
}
assembler_->MoveToTempLocation(&move->source());
InstructionOperand destination = move->destination();
move->Eliminate();
for (auto* unblocked_move : *cycle) {
assembler_->AssembleMove(&unblocked_move->source(),
&unblocked_move->destination());
unblocked_move->Eliminate();
}
assembler_->MoveTempLocationTo(&destination, rep);
}

base::Optional<std::vector<MoveOperands*>> GapResolver::PerformMoveHelper(
ParallelMove* moves, MoveOperands* move) {
// We interpret moves as nodes in a graph. x is a successor of y (x blocks y)
// if x.source() conflicts with y.destination(). We recursively assemble the
// moves in this graph in post-order using a DFS traversal, such that all
// blocking moves are assembled first.
// We also mark moves in the current DFS branch as pending. If a move is
// blocked by a pending move, this is a cycle. In this case we just return the
// cycle without assembling the moves, and the cycle is processed separately
// by {PerformMove}.
// We can show that there is at most one cycle in this connected component
// with the two following assumptions:
// - Two moves cannot have conflicting destinations (1)
// - Operand conflict is transitive (2)
  // From this, it follows that:
// - A move cannot block two or more moves (or the destinations of the blocked
// moves would conflict with each other by (2)).
// - Therefore the graph is a tree, except possibly for one cycle that goes
// back to the root
// (1) must hold by construction of parallel moves. (2) is generally true,
// except if this is a tail-call gap move and some operands span multiple
// stack slots. In this case, slots can partially overlap and interference is
// not transitive. In other cases, conflicting stack operands should have the
// same base address and machine representation.
// TODO(thibaudm): Fix the tail-call case.
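  // For example, given the moves {b <- c, a <- b, d <- a}, a DFS starting at
  // b <- c first recurses into a <- b (its source b blocks b <- c), then into
  // d <- a (its source a blocks a <- b), and the moves are assembled in
  // post-order: d <- a, a <- b, b <- c.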
DCHECK(!move->IsPending());
DCHECK(!move->IsRedundant());
// Clear this move's destination to indicate a pending move. The actual
// destination is saved on the side.
InstructionOperand source = move->source();
DCHECK(!source.IsInvalid()); // Or else it will look eliminated.
InstructionOperand destination = move->destination();
move->SetPending();
base::Optional<std::vector<MoveOperands*>> cycle;
// We may need to split moves between FP locations differently.
const bool is_fp_loc_move = kFPAliasing == AliasingKind::kCombine &&
destination.IsFPLocationOperand();
for (size_t i = 0; i < moves->size(); ++i) {
auto other = (*moves)[i];
if (other->IsEliminated()) continue;
if (other == move) continue;
if (other->source().InterferesWith(destination)) {
if (other->IsPending()) {
// The conflicting move is pending, we found a cycle. Build the list of
// moves that belong to the cycle on the way back.
cycle.emplace();
} else {
// Recursively perform the conflicting move.
if (is_fp_loc_move &&
LocationOperand::cast(other->source()).representation() >
split_rep_) {
// 'other' must also be an FP location move. Break it into fragments
// of the same size as 'move'. 'other' is set to one of the fragments,
// and the rest are appended to 'moves'.
other = Split(other, split_rep_, moves);
          // After the split, 'other' may no longer block the destination.
if (!other->source().InterferesWith(destination)) continue;
}
auto cycle_rec = PerformMoveHelper(moves, other);
if (cycle_rec.has_value()) {
// Check that our assumption that there is at most one cycle is true.
DCHECK(!cycle.has_value());
cycle = cycle_rec;
}
}
}
}
// We finished processing all the blocking moves and don't need this one
// marked as pending anymore, restore its destination.
move->set_destination(destination);
if (cycle.has_value()) {
// Do not assemble the moves in the cycle, just return it.
cycle->push_back(move);
return cycle;
}
assembler_->AssembleMove(&source, &destination);
move->Eliminate();
return {};
}

} // namespace compiler
} // namespace internal
} // namespace v8