1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
|
/* cilk-abi-cilk-for.cpp -*-C++-*-
*
*************************************************************************
*
* @copyright
* Copyright (C) 2011, 2013, Intel Corporation
* All rights reserved.
*
* @copyright
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* @copyright
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
* WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************/
/* Implementation of cilk_for ABI.
*
* This file must be C++, not C, in order to handle C++ exceptions correctly
* from within the body of the cilk_for loop
*/
#include "internal/abi.h"
#include "metacall_impl.h"
#include "global_state.h"
// Icky macros to determine if we're compiled with optimization. Based on
// the declaration of __CILKRTS_ASSERT in common.h
#if defined(_WIN32)
# if defined (_DEBUG)
# define CILKRTS_OPTIMIZED 0 // Assumes /MDd is always used with /Od
# else
# define CILKRTS_OPTIMIZED 1
# endif // defined(_DEBUG)
#else
# if defined(__OPTIMIZE__)
# define CILKRTS_OPTIMIZED 1
# else
# define CILKRTS_OPTIMIZED 0
# endif
#endif
template <typename count_t>
static inline int grainsize(int req, count_t count)
{
// A positive requested grain size comes from the user. A very high grain
// size risks losing parallelism, but the user told us what they want for
// grainsize. Who are we to argue?
if (req > 0)
return req;
// At present, a negative requested grain size is treated the same way as
// a zero grain size, i.e., the runtime computes the actual grainsize
// using a hueristic. In the future, the compiler may give us additional
// information about the size of the cilk_for body by passing a negative
// grain size.
// Avoid generating a zero grainsize, even for empty loops.
if (count < 1)
return 1;
global_state_t* g = cilkg_get_global_state();
if (g->under_ptool)
{
// Grainsize = 1, when running under PIN, and when the grainsize has
// not explicitly been set by the user.
return 1;
}
else
{
// Divide loop count by 8 times the worker count and round up.
const int Px8 = g->P * 8;
count_t n = (count + Px8 - 1) / Px8;
// 2K should be enough to amortize the cost of the cilk_for. Any
// larger grainsize risks losing parallelism.
if (n > 2048)
return 2048;
return (int) n; // n <= 2048, so no loss of precision on cast to int
}
}
/*
* call_cilk_for_loop_body
*
* Centralizes the code to call the loop body. The compiler should be
* inlining this code
*
* low - Low loop index we're considering in this portion of the algorithm
* high - High loop index we're considering in this portion of the algorithm
* body - lambda function for the cilk_for loop body
* data - data used by the lambda function
* w - __cilkrts_worker we're currently executing on
* loop_root_pedigree - __cilkrts_pedigree node we generated for the root of
* the cilk_for loop to flatten out the internal nodes
*/
template <typename count_t, typename F>
inline static
void call_cilk_for_loop_body(count_t low, count_t high,
F body, void *data,
__cilkrts_worker *w,
__cilkrts_pedigree *loop_root_pedigree)
{
// Cilkscreen should not report this call in a stack trace
NOTIFY_ZC_INTRINSIC((char *)"cilkscreen_hide_call", 0);
// The worker is only valid until the first spawn. Fetch the
// __cilkrts_stack_frame out of the worker, since it will be stable across
// steals. The sf pointer actually points to the *parent's*
// __cilkrts_stack_frame, since this function is a non-spawning function
// and therefore has no cilk stack frame of its own.
__cilkrts_stack_frame *sf = w->current_stack_frame;
// Save the pedigree node pointed to by the worker. We'll need to restore
// that when we exit since the spawn helpers in the cilk_for call tree
// will assume that it's valid
const __cilkrts_pedigree *saved_next_pedigree_node = w->pedigree.parent;
// Add the leaf pedigree node to the chain. The parent is the root node
// to flatten the tree regardless of the DAG branches in the cilk_for
// divide-and-conquer recursion.
//
// The rank is initialized to the low index. The user is
// expected to call __cilkrts_bump_loop_rank at the end of the cilk_for
// loop body.
__cilkrts_pedigree loop_leaf_pedigree;
loop_leaf_pedigree.rank = (uint64_t)low;
loop_leaf_pedigree.parent = loop_root_pedigree;
// The worker's pedigree always starts with a rank of 0
w->pedigree.rank = 0;
w->pedigree.parent = &loop_leaf_pedigree;
// Call the compiler generated cilk_for loop body lambda function
body(data, low, high);
// The loop body may have included spawns, so we must refetch the worker
// from the __cilkrts_stack_frame, which is stable regardless of which
// worker we're executing on.
w = sf->worker;
// Restore the pedigree chain. It must be valid because the spawn helpers
// generated by the cilk_for implementation will access it.
w->pedigree.parent = saved_next_pedigree_node;
}
/* capture_spawn_arg_stack_frame
*
* Efficiently get the address of the caller's __cilkrts_stack_frame. The
* preconditons are that 'w' is the worker at the time of the call and
* 'w->current_stack_frame' points to the __cilkrts_stack_frame within the
* spawn helper. This function should be called only within the argument list
* of a function that is being spawned because that is the only situation in
* which these preconditions hold. This function returns the worker
* (unchanged) after storing the captured stack frame pointer is stored in the
* sf argument.
*
* The purpose of this function is to get the caller's stack frame in a
* context where the caller's worker is known but its stack frame is not
* necessarily initialized. The "shrink wrap" optimization delays
* initializing the contents of a spawning function's '__cilkrts_stack_frame'
* as well as the 'current_stack_frame' pointer within the worker. By calling
* this function within a spawning function's argument list, we can ensure
* that these initializations have occured but that a detach (which would
* invalidate the worker pointer in the caller) has not yet occured. Once the
* '__cilkrts_stack_frame' has been retrieved in this way, it is stable for the
* remainder of the caller's execution, and becomes an efficient way to get
* the worker (much more efficient than calling '__cilkrts_get_tls_worker()'),
* even after a spawn or sync.
*/
inline __cilkrts_worker*
capture_spawn_arg_stack_frame(__cilkrts_stack_frame* &sf, __cilkrts_worker* w)
{
// Get current stack frame
sf = w->current_stack_frame;
#ifdef __INTEL_COMPILER
# if __INTEL_COMPILER <= 1300 && __INTEL_COMPILER_BUILD_DATE < 20130101
// In older compilers 'w->current_stack_frame' points to the
// spawn-helper's stack frame. In newer compiler's however, it points
// directly to the pointer's stack frame. (This change was made to avoid
// having the spawn helper in the frame list when evaluating function
// arguments, thus avoiding corruption when those arguments themselves
// contain cilk_spawns.)
// w->current_stack_frame is the spawn helper's stack frame.
// w->current_stack_frame->call_parent is the caller's stack frame.
sf = sf->call_parent;
# endif
#endif
return w;
}
/*
* cilk_for_recursive
*
* Templatized function to implement the recursive divide-and-conquer
* algorithm that's how we implement a cilk_for.
*
* low - Low loop index we're considering in this portion of the algorithm
* high - High loop index we're considering in this portion of the algorithm
* body - lambda function for the cilk_for loop body
* data - data used by the lambda function
* grain - grain size (0 if it should be computed)
* w - __cilkrts_worker we're currently executing on
* loop_root_pedigree - __cilkrts_pedigree node we generated for the root of
* the cilk_for loop to flatten out the internal nodes
*/
template <typename count_t, typename F>
static
void cilk_for_recursive(count_t low, count_t high,
F body, void *data, int grain,
__cilkrts_worker *w,
__cilkrts_pedigree *loop_root_pedigree)
{
tail_recurse:
// Cilkscreen should not report this call in a stack trace
// This needs to be done everytime the worker resumes
NOTIFY_ZC_INTRINSIC((char *)"cilkscreen_hide_call", 0);
count_t count = high - low;
// Invariant: count > 0, grain >= 1
if (count > grain)
{
// Invariant: count >= 2
count_t mid = low + count / 2;
// The worker is valid only until the first spawn and is expensive to
// retrieve (using '__cilkrts_get_tls_worker') after the spawn. The
// '__cilkrts_stack_frame' is more stable, but isn't initialized until
// the first spawn. Thus, we want to grab the address of the
// '__cilkrts_stack_frame' after it is initialized but before the
// spawn detaches. The only place we can do that is within the
// argument list of the spawned function, hence the call to
// capture_spawn_arg_stack_frame().
__cilkrts_stack_frame *sf;
#if defined(__GNUC__) && ! defined(__INTEL_COMPILER) && ! defined(__clang__)
// The current version of gcc initializes the sf structure eagerly.
// We can take advantage of this fact to avoid calling
// `capture_spawn_arg_stack_frame` when compiling with gcc.
// Remove this if the "shrink-wrap" optimization is implemented.
sf = w->current_stack_frame;
_Cilk_spawn cilk_for_recursive(low, mid, body, data, grain, w,
loop_root_pedigree);
#else
_Cilk_spawn cilk_for_recursive(low, mid, body, data, grain,
capture_spawn_arg_stack_frame(sf, w),
loop_root_pedigree);
#endif
w = sf->worker;
low = mid;
goto tail_recurse;
}
// Call the cilk_for loop body lambda function passed in by the compiler to
// execute one grain
call_cilk_for_loop_body(low, high, body, data, w, loop_root_pedigree);
}
static void noop() { }
/*
* cilk_for_root
*
* Templatized function to implement the top level of a cilk_for loop.
*
* body - lambda function for the cilk_for loop body
* data - data used by the lambda function
* count - trip count for loop
* grain - grain size (0 if it should be computed)
*/
template <typename count_t, typename F>
static void cilk_for_root(F body, void *data, count_t count, int grain)
{
// Cilkscreen should not report this call in a stack trace
NOTIFY_ZC_INTRINSIC((char *)"cilkscreen_hide_call", 0);
// Pedigree computation:
//
// If the last pedigree node on entry to the _Cilk_for has value X,
// then at the start of each iteration of the loop body, the value of
// the last pedigree node should be 0, the value of the second-to-last
// node should equal the loop counter, and the value of the
// third-to-last node should be X. On return from the _Cilk_for, the
// value of the last pedigree should be incremented to X+2. The
// pedigree within the loop is thus flattened, such that the depth of
// recursion does not affect the results either inside or outside of
// the loop. Note that the pedigree after the loop exists is the same
// as if a single spawn and sync were executed within this function.
// TBD: Since the shrink-wrap optimization was turned on in the compiler,
// it is not possible to get the current stack frame without actually
// forcing a call to bind-thread. This spurious spawn is a temporary
// stopgap until the correct intrinsics are added to give us total control
// over frame initialization.
_Cilk_spawn noop();
// Fetch the current worker. From that we can get the current stack frame
// which will be constant even if we're stolen
__cilkrts_worker *w = __cilkrts_get_tls_worker();
__cilkrts_stack_frame *sf = w->current_stack_frame;
// Decrement the rank by one to undo the pedigree change from the
// _Cilk_spawn
--w->pedigree.rank;
// Save the current worker pedigree into loop_root_pedigree, which will be
// the root node for our flattened pedigree.
__cilkrts_pedigree loop_root_pedigree = w->pedigree;
// Don't splice the loop_root node in yet. It will be done when we
// call the loop body lambda function
// w->pedigree.rank = 0;
// w->pedigree.next = &loop_root_pedigree;
/* Spawn is necessary at top-level to force runtime to start up.
* Runtime must be started in order to call the grainsize() function.
*/
int gs = grainsize(grain, count);
cilk_for_recursive((count_t) 0, count, body, data, gs, w,
&loop_root_pedigree);
// Need to refetch the worker after calling a spawning function.
w = sf->worker;
// Restore the pedigree in the worker.
w->pedigree = loop_root_pedigree;
// Bump the worker pedigree.
++w->pedigree.rank;
// Implicit sync will increment the pedigree leaf rank again, for a total
// of two increments. If the noop spawn above is removed, then we'll need
// to re-enable the following code:
// // If this is an optimized build, then the compiler will have optimized
// // out the increment of the worker's pedigree in the implied sync. We
// // need to add one to make the pedigree_loop test work correctly.
// #if CILKRTS_OPTIMIZED
// ++sf->worker->pedigree.rank;
// #endif
}
// Use extern "C" to suppress name mangling of __cilkrts_cilk_for_32 and
// __cilkrts_cilk_for_64.
extern "C" {
/*
* __cilkrts_cilk_for_32
*
* Implementation of cilk_for for 32-bit trip counts (regardless of processor
* word size). Assumes that the range is 0 - count.
*
* body - lambda function for the cilk_for loop body
* data - data used by the lambda function
* count - trip count for loop
* grain - grain size (0 if it should be computed)
*/
CILK_ABI_THROWS_VOID __cilkrts_cilk_for_32(__cilk_abi_f32_t body, void *data,
cilk32_t count, int grain)
{
// Cilkscreen should not report this call in a stack trace
NOTIFY_ZC_INTRINSIC((char *)"cilkscreen_hide_call", 0);
// Check for an empty range here as an optimization - don't need to do any
// __cilkrts_stack_frame initialization
if (count > 0)
cilk_for_root(body, data, count, grain);
}
/*
* __cilkrts_cilk_for_64
*
* Implementation of cilk_for for 64-bit trip counts (regardless of processor
* word size). Assumes that the range is 0 - count.
*
* body - lambda function for the cilk_for loop body
* data - data used by the lambda function
* count - trip count for loop
* grain - grain size (0 if it should be computed)
*/
CILK_ABI_THROWS_VOID __cilkrts_cilk_for_64(__cilk_abi_f64_t body, void *data,
cilk64_t count, int grain)
{
// Check for an empty range here as an optimization - don't need to do any
// __cilkrts_stack_frame initialization
if (count > 0)
cilk_for_root(body, data, count, grain);
}
} // end extern "C"
/* End cilk-abi-cilk-for.cpp */
|