Macros | |
#define | FORALL_HERE_OVERLOAD(...) |
Overload. More... | |
#define | FORALL_OVERLOAD(...) |
Functions | |
GlobalCompletionEvent & | Grappa::default_gce () |
template<SyncMode S = SyncMode::Blocking, TaskMode B = TaskMode::Bound, GlobalCompletionEvent * GCE = nullptr, int64_t Threshold = impl::USE_LOOP_THRESHOLD_FLAG, typename F = decltype(nullptr)> | |
void | Grappa::forall_here (int64_t start, int64_t iters, F loop_body) |
Grappa::FORALL_HERE_OVERLOAD (TaskMode B, SyncMode S=SyncMode::Blocking, GlobalCompletionEvent *GCE=nullptr, int64_t Threshold=impl::USE_LOOP_THRESHOLD_FLAG) | |
Grappa::FORALL_HERE_OVERLOAD (SyncMode S, GlobalCompletionEvent *GCE, TaskMode B=TaskMode::Bound, int64_t Threshold=impl::USE_LOOP_THRESHOLD_FLAG) | |
Grappa::FORALL_HERE_OVERLOAD (SyncMode S, GlobalCompletionEvent *GCE, int64_t Threshold, TaskMode B=TaskMode::Bound) | |
Grappa::FORALL_HERE_OVERLOAD (GlobalCompletionEvent *GCE, int64_t Threshold=impl::USE_LOOP_THRESHOLD_FLAG, TaskMode B=TaskMode::Bound, SyncMode S=SyncMode::Blocking) | |
Grappa::FORALL_HERE_OVERLOAD (int64_t Threshold, GlobalCompletionEvent *GCE=nullptr, TaskMode B=TaskMode::Bound, SyncMode S=SyncMode::Blocking) | |
Grappa::FORALL_OVERLOAD (TaskMode B=TaskMode::Bound, SyncMode S=SyncMode::Blocking, GlobalCompletionEvent *C=&impl::local_gce, int64_t Threshold=impl::USE_LOOP_THRESHOLD_FLAG) | |
Grappa::FORALL_OVERLOAD (SyncMode S, TaskMode B=TaskMode::Bound, GlobalCompletionEvent *C=&impl::local_gce, int64_t Threshold=impl::USE_LOOP_THRESHOLD_FLAG) | |
Grappa::FORALL_OVERLOAD (GlobalCompletionEvent *C, int64_t Threshold=impl::USE_LOOP_THRESHOLD_FLAG, TaskMode B=TaskMode::Bound, SyncMode S=SyncMode::Blocking) | |
Grappa::FORALL_OVERLOAD (int64_t Threshold, GlobalCompletionEvent *C=&impl::local_gce, TaskMode B=TaskMode::Bound, SyncMode S=SyncMode::Blocking) | |
Grappa::FORALL_OVERLOAD (TaskMode B, GlobalCompletionEvent *C, SyncMode S=SyncMode::Blocking, int64_t Threshold=impl::USE_LOOP_THRESHOLD_FLAG) | |
template<typename T > | |
std::pair< Core, Core > | Grappa::cores_with_elements (GlobalAddress< T > base, size_t nelem) |
Return range of cores that have elements for the given linear address range. More... | |
template<GlobalCompletionEvent * GCE = &impl::local_gce, int64_t Threshold = impl::USE_LOOP_THRESHOLD_FLAG, typename T = decltype(nullptr), typename F = decltype(nullptr)> | |
void | Grappa::on_cores_localized_async (GlobalAddress< T > base, int64_t nelems, F do_on_core) |
Run privateTasks on each core that contains elements of the given region of global memory. More... | |
template<TaskMode B = TaskMode::Bound, SyncMode S = SyncMode::Blocking, GlobalCompletionEvent * GCE = &impl::local_gce, int64_t Threshold = impl::USE_LOOP_THRESHOLD_FLAG, typename T = decltype(nullptr), typename F = decltype(nullptr)> | |
void | Grappa::forall (GlobalAddress< T > base, int64_t nelems, F loop_body) |
Parallel loop over a global array. More... | |
#define FORALL_HERE_OVERLOAD | ( | ... | ) |
Overload.
Definition at line 202 of file ParallelLoop.hpp.
#define FORALL_OVERLOAD | ( | ... | ) |
Definition at line 288 of file ParallelLoop.hpp.
std::pair<Core,Core> Grappa::cores_with_elements | ( | GlobalAddress< T > | base, |
size_t | nelem | ||
) |
Return range of cores that have elements for the given linear address range.
Definition at line 340 of file ParallelLoop.hpp.
|
inline |
Definition at line 71 of file ParallelLoop.hpp.
void Grappa::forall | ( | GlobalAddress< T > | base, |
int64_t | nelems, | ||
F | loop_body | ||
) |
Parallel loop over a global array.
Overload for specifying GCE only.
Overload to allow using default GCE but specifying threshold.
Overload for specifying just SyncMode (or SyncMode first).
Spawned from a single core, fans out and runs tasks on elements that are local to each core.
Subject to "may-parallelism"; see loop_threshold.
Takes an optional pointer to a global static GlobalCompletionEvent as a template parameter to allow for programmer-specified task joining (to potentially allow more than one in flight simultaneously, though this call is itself sync).
Takes a lambda/functor that operates on a range of iterations: void(int64_t first_index, int64_t niters, T * first_element)
Note: you cannot simply increment first_index niters times and get the correct global index, because a single task may span more than one block.
Example:
Alternatively, forall can take a lambda/functor with signature: void(int64_t index, T& element) (internally wraps this call in a loop and passes to the other version of forall).
This is meant to make it easy to make a loop where you don't care about amortizing anything for a single task. If you would like to do something that will be used by multiple iterations, use the other version of Grappa::forall that takes a lambda that operates on a range.
Example:
Definition at line 509 of file ParallelLoop.hpp.
void Grappa::forall_here | ( | int64_t | start, |
int64_t | iters, | ||
F | loop_body | ||
) |
Definition at line 197 of file ParallelLoop.hpp.
Grappa::FORALL_HERE_OVERLOAD | ( | TaskMode | B, |
SyncMode | S = SyncMode::Blocking , |
||
GlobalCompletionEvent * | GCE = nullptr , |
||
int64_t | Threshold = impl::USE_LOOP_THRESHOLD_FLAG |
||
) |
Grappa::FORALL_HERE_OVERLOAD | ( | SyncMode | S, |
GlobalCompletionEvent * | GCE, | ||
TaskMode | B = TaskMode::Bound , |
||
int64_t | Threshold = impl::USE_LOOP_THRESHOLD_FLAG |
||
) |
Grappa::FORALL_HERE_OVERLOAD | ( | SyncMode | S, |
GlobalCompletionEvent * | GCE, | ||
int64_t | Threshold, | ||
TaskMode | B = TaskMode::Bound |
||
) |
Grappa::FORALL_HERE_OVERLOAD | ( | GlobalCompletionEvent * | GCE, |
int64_t | Threshold = impl::USE_LOOP_THRESHOLD_FLAG , |
||
TaskMode | B = TaskMode::Bound , |
||
SyncMode | S = SyncMode::Blocking |
||
) |
Grappa::FORALL_HERE_OVERLOAD | ( | int64_t | Threshold, |
GlobalCompletionEvent * | GCE = nullptr , |
||
TaskMode | B = TaskMode::Bound , |
||
SyncMode | S = SyncMode::Blocking |
||
) |
Grappa::FORALL_OVERLOAD | ( | TaskMode | B = TaskMode::Bound , |
SyncMode | S = SyncMode::Blocking , |
||
GlobalCompletionEvent * | C = &impl::local_gce , |
||
int64_t | Threshold = impl::USE_LOOP_THRESHOLD_FLAG |
||
) |
Grappa::FORALL_OVERLOAD | ( | SyncMode | S, |
TaskMode | B = TaskMode::Bound , |
||
GlobalCompletionEvent * | C = &impl::local_gce , |
||
int64_t | Threshold = impl::USE_LOOP_THRESHOLD_FLAG |
||
) |
Grappa::FORALL_OVERLOAD | ( | GlobalCompletionEvent * | C, |
int64_t | Threshold = impl::USE_LOOP_THRESHOLD_FLAG , |
||
TaskMode | B = TaskMode::Bound , |
||
SyncMode | S = SyncMode::Blocking |
||
) |
Grappa::FORALL_OVERLOAD | ( | int64_t | Threshold, |
GlobalCompletionEvent * | C = &impl::local_gce , |
||
TaskMode | B = TaskMode::Bound , |
||
SyncMode | S = SyncMode::Blocking |
||
) |
Grappa::FORALL_OVERLOAD | ( | TaskMode | B, |
GlobalCompletionEvent * | C, | ||
SyncMode | S = SyncMode::Blocking , |
||
int64_t | Threshold = impl::USE_LOOP_THRESHOLD_FLAG |
||
) |
void Grappa::on_cores_localized_async | ( | GlobalAddress< T > | base, |
int64_t | nelems, | ||
F | do_on_core | ||
) |
Run privateTasks on each core that contains elements of the given region of global memory.
do_on_core: void(T* local_base, size_t nlocal). Internally creates a privateTask with 2*8-byte words, so do_on_core can be 8 bytes and not cause heap allocation.
Definition at line 374 of file ParallelLoop.hpp.