Grappa  r3821, hash 22cd626d567a91ead5b23302066d1e9469f45c66
Loops

Macros

#define FORALL_HERE_OVERLOAD(...)
 Overload. More...
 
#define FORALL_OVERLOAD(...)
 

Functions

GlobalCompletionEvent & Grappa::default_gce ()
 
template<SyncMode S = SyncMode::Blocking, TaskMode B = TaskMode::Bound, GlobalCompletionEvent * GCE = nullptr, int64_t Threshold = impl::USE_LOOP_THRESHOLD_FLAG, typename F = decltype(nullptr)>
void Grappa::forall_here (int64_t start, int64_t iters, F loop_body)
 
 Grappa::FORALL_HERE_OVERLOAD (TaskMode B, SyncMode S=SyncMode::Blocking, GlobalCompletionEvent *GCE=nullptr, int64_t Threshold=impl::USE_LOOP_THRESHOLD_FLAG)
 
 Grappa::FORALL_HERE_OVERLOAD (SyncMode S, GlobalCompletionEvent *GCE, TaskMode B=TaskMode::Bound, int64_t Threshold=impl::USE_LOOP_THRESHOLD_FLAG)
 
 Grappa::FORALL_HERE_OVERLOAD (SyncMode S, GlobalCompletionEvent *GCE, int64_t Threshold, TaskMode B=TaskMode::Bound)
 
 Grappa::FORALL_HERE_OVERLOAD (GlobalCompletionEvent *GCE, int64_t Threshold=impl::USE_LOOP_THRESHOLD_FLAG, TaskMode B=TaskMode::Bound, SyncMode S=SyncMode::Blocking)
 
 Grappa::FORALL_HERE_OVERLOAD (int64_t Threshold, GlobalCompletionEvent *GCE=nullptr, TaskMode B=TaskMode::Bound, SyncMode S=SyncMode::Blocking)
 
 Grappa::FORALL_OVERLOAD (TaskMode B=TaskMode::Bound, SyncMode S=SyncMode::Blocking, GlobalCompletionEvent *C=&impl::local_gce, int64_t Threshold=impl::USE_LOOP_THRESHOLD_FLAG)
 
 Grappa::FORALL_OVERLOAD (SyncMode S, TaskMode B=TaskMode::Bound, GlobalCompletionEvent *C=&impl::local_gce, int64_t Threshold=impl::USE_LOOP_THRESHOLD_FLAG)
 
 Grappa::FORALL_OVERLOAD (GlobalCompletionEvent *C, int64_t Threshold=impl::USE_LOOP_THRESHOLD_FLAG, TaskMode B=TaskMode::Bound, SyncMode S=SyncMode::Blocking)
 
 Grappa::FORALL_OVERLOAD (int64_t Threshold, GlobalCompletionEvent *C=&impl::local_gce, TaskMode B=TaskMode::Bound, SyncMode S=SyncMode::Blocking)
 
 Grappa::FORALL_OVERLOAD (TaskMode B, GlobalCompletionEvent *C, SyncMode S=SyncMode::Blocking, int64_t Threshold=impl::USE_LOOP_THRESHOLD_FLAG)
 
template<typename T >
std::pair< Core, Core > Grappa::cores_with_elements (GlobalAddress< T > base, size_t nelem)
 Return range of cores that have elements for the given linear address range. More...
 
template<GlobalCompletionEvent * GCE = &impl::local_gce, int64_t Threshold = impl::USE_LOOP_THRESHOLD_FLAG, typename T = decltype(nullptr), typename F = decltype(nullptr)>
void Grappa::on_cores_localized_async (GlobalAddress< T > base, int64_t nelems, F do_on_core)
 Run privateTasks on each core that contains elements of the given region of global memory. More...
 
template<TaskMode B = TaskMode::Bound, SyncMode S = SyncMode::Blocking, GlobalCompletionEvent * GCE = &impl::local_gce, int64_t Threshold = impl::USE_LOOP_THRESHOLD_FLAG, typename T = decltype(nullptr), typename F = decltype(nullptr)>
void Grappa::forall (GlobalAddress< T > base, int64_t nelems, F loop_body)
 Parallel loop over a global array. More...
 

Detailed Description

Macro Definition Documentation

#define FORALL_HERE_OVERLOAD (   ...)
Value:
template< __VA_ARGS__, typename F = decltype(nullptr) > \
void forall_here(int64_t start, int64_t iters, F loop_body) { \
impl::forall_here<B,S,GCE,Threshold>(start, iters, loop_body); \
}
void forall_here(int64_t start, int64_t iters, F loop_body)

Overload.

Definition at line 202 of file ParallelLoop.hpp.

#define FORALL_OVERLOAD (   ...)
Value:
template< __VA_ARGS__, typename F = decltype(nullptr) > \
void forall(int64_t start, int64_t iters, F loop_body) { \
impl::forall<B,S,C,Threshold>(start, iters, loop_body, &F::operator()); \
}
void forall(GlobalAddress< T > base, int64_t nelems, F loop_body)
Parallel loop over a global array.

Definition at line 288 of file ParallelLoop.hpp.

Function Documentation

template<typename T >
std::pair<Core,Core> Grappa::cores_with_elements ( GlobalAddress< T >  base,
size_t  nelem 
)

Return range of cores that have elements for the given linear address range.

Definition at line 340 of file ParallelLoop.hpp.

GlobalCompletionEvent& Grappa::default_gce ( )
inline

Definition at line 71 of file ParallelLoop.hpp.

template<TaskMode B = TaskMode::Bound, SyncMode S = SyncMode::Blocking, GlobalCompletionEvent * GCE = &impl::local_gce, int64_t Threshold = impl::USE_LOOP_THRESHOLD_FLAG, typename T = decltype(nullptr), typename F = decltype(nullptr)>
void Grappa::forall ( GlobalAddress< T >  base,
int64_t  nelems,
F  loop_body
)

Parallel loop over a global array.

Overload for specifying GCE only.

Overload to allow using default GCE but specifying threshold.

Overload for specifying just SyncMode (or SyncMode first)

Spawned from a single core, fans out and runs tasks on elements that are local to each core.

Subject to "may-parallelism",

See also
loop_threshold.

Takes an optional pointer to a global static GlobalCompletionEvent as a template parameter to allow for programmer-specified task joining (to potentially allow more than one in flight simultaneously, though this call is itself sync).

Takes a lambda/functor that operates on a range of iterations: void(int64_t first_index, int64_t niters, T * first_element)

Warning
You cannot simply increment first_index niters times and get the correct global index because a single task may span more than one block.

Example:

// GlobalCompletionEvent gce declared in global scope
forall<&gce>(array, N, [dest](int64_t start, int64_t niters, double * first){
for (int64_t i=0; i<niters; i++) {
delegate::write<async,&gce>(dest+start+i, 2.0*first[i]);
}
});

Alternatively, forall can take a lambda/functor with signature: void(int64_t index, T& element) (internally wraps this call in a loop and passes to the other version of forall)

This is meant to make it easy to make a loop where you don't care about amortizing anything for a single task. If you would like to do something that will be used by multiple iterations, use the other version of Grappa::forall that takes a lambda that operates on a range.

Example:

// GlobalCompletionEvent gce declared in global scope
forall<&gce>(array, N, [dest](int64_t i, double& v){
delegate::write<async,&gce>(dest+i, 2.0*v);
});

Definition at line 509 of file ParallelLoop.hpp.

template<SyncMode S = SyncMode::Blocking, TaskMode B = TaskMode::Bound, GlobalCompletionEvent * GCE = nullptr, int64_t Threshold = impl::USE_LOOP_THRESHOLD_FLAG, typename F = decltype(nullptr)>
void Grappa::forall_here ( int64_t  start,
int64_t  iters,
F  loop_body
)

Definition at line 197 of file ParallelLoop.hpp.

Grappa::FORALL_HERE_OVERLOAD ( TaskMode  B,
SyncMode  S = SyncMode::Blocking,
GlobalCompletionEvent * GCE = nullptr,
int64_t  Threshold = impl::USE_LOOP_THRESHOLD_FLAG 
)
Grappa::FORALL_HERE_OVERLOAD ( SyncMode  S,
GlobalCompletionEvent * GCE,
TaskMode  B = TaskMode::Bound,
int64_t  Threshold = impl::USE_LOOP_THRESHOLD_FLAG 
)
Grappa::FORALL_HERE_OVERLOAD ( SyncMode  S,
GlobalCompletionEvent * GCE,
int64_t  Threshold,
TaskMode  B = TaskMode::Bound 
)
Grappa::FORALL_HERE_OVERLOAD ( GlobalCompletionEvent * GCE,
int64_t  Threshold = impl::USE_LOOP_THRESHOLD_FLAG,
TaskMode  B = TaskMode::Bound,
SyncMode  S = SyncMode::Blocking 
)
Grappa::FORALL_HERE_OVERLOAD ( int64_t  Threshold,
GlobalCompletionEvent * GCE = nullptr,
TaskMode  B = TaskMode::Bound,
SyncMode  S = SyncMode::Blocking 
)
Grappa::FORALL_OVERLOAD ( TaskMode  B = TaskMode::Bound,
SyncMode  S = SyncMode::Blocking,
GlobalCompletionEvent * C = &impl::local_gce,
int64_t  Threshold = impl::USE_LOOP_THRESHOLD_FLAG 
)
Grappa::FORALL_OVERLOAD ( SyncMode  S,
TaskMode  B = TaskMode::Bound,
GlobalCompletionEvent * C = &impl::local_gce,
int64_t  Threshold = impl::USE_LOOP_THRESHOLD_FLAG 
)
Grappa::FORALL_OVERLOAD ( GlobalCompletionEvent * C,
int64_t  Threshold = impl::USE_LOOP_THRESHOLD_FLAG,
TaskMode  B = TaskMode::Bound,
SyncMode  S = SyncMode::Blocking 
)
Grappa::FORALL_OVERLOAD ( int64_t  Threshold,
GlobalCompletionEvent * C = &impl::local_gce,
TaskMode  B = TaskMode::Bound,
SyncMode  S = SyncMode::Blocking 
)
Grappa::FORALL_OVERLOAD ( TaskMode  B,
GlobalCompletionEvent * C,
SyncMode  S = SyncMode::Blocking,
int64_t  Threshold = impl::USE_LOOP_THRESHOLD_FLAG 
)
template<GlobalCompletionEvent * GCE = &impl::local_gce, int64_t Threshold = impl::USE_LOOP_THRESHOLD_FLAG, typename T = decltype(nullptr), typename F = decltype(nullptr)>
void Grappa::on_cores_localized_async ( GlobalAddress< T >  base,
int64_t  nelems,
F  do_on_core
)

Run privateTasks on each core that contains elements of the given region of global memory.

do_on_core: void(T* local_base, size_t nlocal) Internally creates privateTask with 2*8-byte words, so do_on_core can be 8 bytes and not cause heap allocation.

Definition at line 374 of file ParallelLoop.hpp.