/* cilk-abi-cilk-for.cpp -*-C++-*- * ************************************************************************* * * @copyright * Copyright (C) 2011, 2013, Intel Corporation * All rights reserved. * * @copyright * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * * Neither the name of Intel Corporation nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * @copyright * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * **************************************************************************/ /* Implementation of cilk_for ABI. * * This file must be C++, not C, in order to handle C++ exceptions correctly * from within the body of the cilk_for loop */ #include "internal/abi.h" #include "metacall_impl.h" #include "global_state.h" // Icky macros to determine if we're compiled with optimization. Based on // the declaration of __CILKRTS_ASSERT in common.h #if defined(_WIN32) # if defined (_DEBUG) # define CILKRTS_OPTIMIZED 0 // Assumes /MDd is always used with /Od # else # define CILKRTS_OPTIMIZED 1 # endif // defined(_DEBUG) #else # if defined(__OPTIMIZE__) # define CILKRTS_OPTIMIZED 1 # else # define CILKRTS_OPTIMIZED 0 # endif #endif template static inline int grainsize(int req, count_t count) { // A positive requested grain size comes from the user. A very high grain // size risks losing parallelism, but the user told us what they want for // grainsize. Who are we to argue? if (req > 0) return req; // At present, a negative requested grain size is treated the same way as // a zero grain size, i.e., the runtime computes the actual grainsize // using a hueristic. In the future, the compiler may give us additional // information about the size of the cilk_for body by passing a negative // grain size. // Avoid generating a zero grainsize, even for empty loops. if (count < 1) return 1; global_state_t* g = cilkg_get_global_state(); if (g->under_ptool) { // Grainsize = 1, when running under PIN, and when the grainsize has // not explicitly been set by the user. return 1; } else { // Divide loop count by 8 times the worker count and round up. const int Px8 = g->P * 8; count_t n = (count + Px8 - 1) / Px8; // 2K should be enough to amortize the cost of the cilk_for. Any // larger grainsize risks losing parallelism. if (n > 2048) return 2048; return (int) n; // n <= 2048, so no loss of precision on cast to int } } /* * call_cilk_for_loop_body * * Centralizes the code to call the loop body. The compiler should be * inlining this code * * low - Low loop index we're considering in this portion of the algorithm * high - High loop index we're considering in this portion of the algorithm * body - lambda function for the cilk_for loop body * data - data used by the lambda function * w - __cilkrts_worker we're currently executing on * loop_root_pedigree - __cilkrts_pedigree node we generated for the root of * the cilk_for loop to flatten out the internal nodes */ template inline static void call_cilk_for_loop_body(count_t low, count_t high, F body, void *data, __cilkrts_worker *w, __cilkrts_pedigree *loop_root_pedigree) { // Cilkscreen should not report this call in a stack trace NOTIFY_ZC_INTRINSIC((char *)"cilkscreen_hide_call", 0); // The worker is only valid until the first spawn. Fetch the // __cilkrts_stack_frame out of the worker, since it will be stable across // steals. The sf pointer actually points to the *parent's* // __cilkrts_stack_frame, since this function is a non-spawning function // and therefore has no cilk stack frame of its own. __cilkrts_stack_frame *sf = w->current_stack_frame; // Save the pedigree node pointed to by the worker. We'll need to restore // that when we exit since the spawn helpers in the cilk_for call tree // will assume that it's valid const __cilkrts_pedigree *saved_next_pedigree_node = w->pedigree.parent; // Add the leaf pedigree node to the chain. The parent is the root node // to flatten the tree regardless of the DAG branches in the cilk_for // divide-and-conquer recursion. // // The rank is initialized to the low index. The user is // expected to call __cilkrts_bump_loop_rank at the end of the cilk_for // loop body. __cilkrts_pedigree loop_leaf_pedigree; loop_leaf_pedigree.rank = (uint64_t)low; loop_leaf_pedigree.parent = loop_root_pedigree; // The worker's pedigree always starts with a rank of 0 w->pedigree.rank = 0; w->pedigree.parent = &loop_leaf_pedigree; // Call the compiler generated cilk_for loop body lambda function body(data, low, high); // The loop body may have included spawns, so we must refetch the worker // from the __cilkrts_stack_frame, which is stable regardless of which // worker we're executing on. w = sf->worker; // Restore the pedigree chain. It must be valid because the spawn helpers // generated by the cilk_for implementation will access it. w->pedigree.parent = saved_next_pedigree_node; } /* capture_spawn_arg_stack_frame * * Efficiently get the address of the caller's __cilkrts_stack_frame. The * preconditons are that 'w' is the worker at the time of the call and * 'w->current_stack_frame' points to the __cilkrts_stack_frame within the * spawn helper. This function should be called only within the argument list * of a function that is being spawned because that is the only situation in * which these preconditions hold. This function returns the worker * (unchanged) after storing the captured stack frame pointer is stored in the * sf argument. * * The purpose of this function is to get the caller's stack frame in a * context where the caller's worker is known but its stack frame is not * necessarily initialized. The "shrink wrap" optimization delays * initializing the contents of a spawning function's '__cilkrts_stack_frame' * as well as the 'current_stack_frame' pointer within the worker. By calling * this function within a spawning function's argument list, we can ensure * that these initializations have occured but that a detach (which would * invalidate the worker pointer in the caller) has not yet occured. Once the * '__cilkrts_stack_frame' has been retrieved in this way, it is stable for the * remainder of the caller's execution, and becomes an efficient way to get * the worker (much more efficient than calling '__cilkrts_get_tls_worker()'), * even after a spawn or sync. */ inline __cilkrts_worker* capture_spawn_arg_stack_frame(__cilkrts_stack_frame* &sf, __cilkrts_worker* w) { // Get current stack frame sf = w->current_stack_frame; #ifdef __INTEL_COMPILER # if __INTEL_COMPILER <= 1300 && __INTEL_COMPILER_BUILD_DATE < 20130101 // In older compilers 'w->current_stack_frame' points to the // spawn-helper's stack frame. In newer compiler's however, it points // directly to the pointer's stack frame. (This change was made to avoid // having the spawn helper in the frame list when evaluating function // arguments, thus avoiding corruption when those arguments themselves // contain cilk_spawns.) // w->current_stack_frame is the spawn helper's stack frame. // w->current_stack_frame->call_parent is the caller's stack frame. sf = sf->call_parent; # endif #endif return w; } /* * cilk_for_recursive * * Templatized function to implement the recursive divide-and-conquer * algorithm that's how we implement a cilk_for. * * low - Low loop index we're considering in this portion of the algorithm * high - High loop index we're considering in this portion of the algorithm * body - lambda function for the cilk_for loop body * data - data used by the lambda function * grain - grain size (0 if it should be computed) * w - __cilkrts_worker we're currently executing on * loop_root_pedigree - __cilkrts_pedigree node we generated for the root of * the cilk_for loop to flatten out the internal nodes */ template static void cilk_for_recursive(count_t low, count_t high, F body, void *data, int grain, __cilkrts_worker *w, __cilkrts_pedigree *loop_root_pedigree) { tail_recurse: // Cilkscreen should not report this call in a stack trace // This needs to be done everytime the worker resumes NOTIFY_ZC_INTRINSIC((char *)"cilkscreen_hide_call", 0); count_t count = high - low; // Invariant: count > 0, grain >= 1 if (count > grain) { // Invariant: count >= 2 count_t mid = low + count / 2; // The worker is valid only until the first spawn and is expensive to // retrieve (using '__cilkrts_get_tls_worker') after the spawn. The // '__cilkrts_stack_frame' is more stable, but isn't initialized until // the first spawn. Thus, we want to grab the address of the // '__cilkrts_stack_frame' after it is initialized but before the // spawn detaches. The only place we can do that is within the // argument list of the spawned function, hence the call to // capture_spawn_arg_stack_frame(). __cilkrts_stack_frame *sf; _Cilk_spawn cilk_for_recursive(low, mid, body, data, grain, capture_spawn_arg_stack_frame(sf, w), loop_root_pedigree); w = sf->worker; low = mid; goto tail_recurse; } // Call the cilk_for loop body lambda function passed in by the compiler to // execute one grain call_cilk_for_loop_body(low, high, body, data, w, loop_root_pedigree); } static void noop() { } /* * cilk_for_root * * Templatized function to implement the top level of a cilk_for loop. * * body - lambda function for the cilk_for loop body * data - data used by the lambda function * count - trip count for loop * grain - grain size (0 if it should be computed) */ template static void cilk_for_root(F body, void *data, count_t count, int grain) { // Cilkscreen should not report this call in a stack trace NOTIFY_ZC_INTRINSIC((char *)"cilkscreen_hide_call", 0); // Pedigree computation: // // If the last pedigree node on entry to the _Cilk_for has value X, // then at the start of each iteration of the loop body, the value of // the last pedigree node should be 0, the value of the second-to-last // node should equal the loop counter, and the value of the // third-to-last node should be X. On return from the _Cilk_for, the // value of the last pedigree should be incremented to X+2. The // pedigree within the loop is thus flattened, such that the depth of // recursion does not affect the results either inside or outside of // the loop. Note that the pedigree after the loop exists is the same // as if a single spawn and sync were executed within this function. // TBD: Since the shrink-wrap optimization was turned on in the compiler, // it is not possible to get the current stack frame without actually // forcing a call to bind-thread. This spurious spawn is a temporary // stopgap until the correct intrinsics are added to give us total control // over frame initialization. _Cilk_spawn noop(); // Fetch the current worker. From that we can get the current stack frame // which will be constant even if we're stolen __cilkrts_worker *w = __cilkrts_get_tls_worker(); __cilkrts_stack_frame *sf = w->current_stack_frame; // Decrement the rank by one to undo the pedigree change from the // _Cilk_spawn --w->pedigree.rank; // Save the current worker pedigree into loop_root_pedigree, which will be // the root node for our flattened pedigree. __cilkrts_pedigree loop_root_pedigree = w->pedigree; // Don't splice the loop_root node in yet. It will be done when we // call the loop body lambda function // w->pedigree.rank = 0; // w->pedigree.next = &loop_root_pedigree; /* Spawn is necessary at top-level to force runtime to start up. * Runtime must be started in order to call the grainsize() function. */ int gs = grainsize(grain, count); cilk_for_recursive((count_t) 0, count, body, data, gs, w, &loop_root_pedigree); // Need to refetch the worker after calling a spawning function. w = sf->worker; // Restore the pedigree in the worker. w->pedigree = loop_root_pedigree; // Bump the worker pedigree. ++w->pedigree.rank; // Implicit sync will increment the pedigree leaf rank again, for a total // of two increments. If the noop spawn above is removed, then we'll need // to re-enable the following code: // // If this is an optimized build, then the compiler will have optimized // // out the increment of the worker's pedigree in the implied sync. We // // need to add one to make the pedigree_loop test work correctly. // #if CILKRTS_OPTIMIZED // ++sf->worker->pedigree.rank; // #endif } // Use extern "C" to suppress name mangling of __cilkrts_cilk_for_32 and // __cilkrts_cilk_for_64. extern "C" { /* * __cilkrts_cilk_for_32 * * Implementation of cilk_for for 32-bit trip counts (regardless of processor * word size). Assumes that the range is 0 - count. * * body - lambda function for the cilk_for loop body * data - data used by the lambda function * count - trip count for loop * grain - grain size (0 if it should be computed) */ CILK_ABI_THROWS_VOID __cilkrts_cilk_for_32(__cilk_abi_f32_t body, void *data, cilk32_t count, int grain) { // Cilkscreen should not report this call in a stack trace NOTIFY_ZC_INTRINSIC((char *)"cilkscreen_hide_call", 0); // Check for an empty range here as an optimization - don't need to do any // __cilkrts_stack_frame initialization if (count > 0) cilk_for_root(body, data, count, grain); } /* * __cilkrts_cilk_for_64 * * Implementation of cilk_for for 64-bit trip counts (regardless of processor * word size). Assumes that the range is 0 - count. * * body - lambda function for the cilk_for loop body * data - data used by the lambda function * count - trip count for loop * grain - grain size (0 if it should be computed) */ CILK_ABI_THROWS_VOID __cilkrts_cilk_for_64(__cilk_abi_f64_t body, void *data, cilk64_t count, int grain) { // Check for an empty range here as an optimization - don't need to do any // __cilkrts_stack_frame initialization if (count > 0) cilk_for_root(body, data, count, grain); } } // end extern "C" /* End cilk-abi-cilk-for.cpp */