Program Listing for File memcopy_padding.hpp
↰ Return to documentation for file (include/ripple/container/memcopy_padding.hpp)
#ifndef RIPPLE_CONTAINER_MEMCOPY_PADDING_HPP
#define RIPPLE_CONTAINER_MEMCOPY_PADDING_HPP
#include "block_traits.hpp"
namespace ripple {
/*==--- [faces between blocks] ---------------------------------------------==*/
enum class FaceLocation : uint8_t {
start = 0, //< Face at the start of the domain.
end = 1    //< Face at the end of the domain.
};
enum class Mapping : int {
domain = 0,  //< Map to the domain (interior) cells.
padding = 1  //< Map to the padding cells.
};
/* Specifies a face for copying: the dimension of the face, its location, and
 * whether the pointer should map to the domain or the padding cells. */
template <size_t Dim, FaceLocation Location, Mapping Map = Mapping::domain>
struct CopySpecifier {};
/*==--- [copy specifier aliases] -------------------------------------------==*/
template <FaceLocation Location>
using CopySpecifierX = CopySpecifier<DimX::value, Location>;
template <FaceLocation Location>
using CopySpecifierY = CopySpecifier<DimY::value, Location>;
/*==--- [padding utilities] ------------------------------------------------==*/
struct OffsetParam {
int dimension = -1;       //< Dimension to offset in (-1 means no offset).
int amount = 0;           //< Number of cells to offset by in each dimension.
int dimension_other = -1; //< Second dimension to offset in (-1 means none).
};
template <typename Block, size_t Dim, FaceLocation Location, Mapping Map>
auto storage_ptr(
Block& block,
CopySpecifier<Dim, Location, Map>,
OffsetParam offset = OffsetParam()) noexcept {
constexpr auto dim = Dimension<Dim>();
constexpr bool pad = Map == Mapping::padding;
int shift = 0;
if constexpr (Location == FaceLocation::start) {
// If a pointer into the padding is required then we need to offset backwards
// by the padding amount, since the iterator points to the first valid cell.
shift = (pad ? -1 : 0) * block.padding();
} else {
// If a pointer into the domain is required then we need to subtract the
// padding amount from the size of the domain, since the size of the block
// gives the end of the domain (or the start of the padding):
shift = block.size(dim) + (pad ? 0 : -1) * block.padding();
}
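// For example, with block.size(dim) == N and block.padding() == P, the shift
// computed above resolves to:
//   start / domain  :  0      (first interior cell)
//   start / padding : -P      (first padding cell before the domain)
//   end   / domain  :  N - P  (first of the last P interior cells)
//   end   / padding :  N      (first padding cell after the domain)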
// Since begin() gives a pointer to the first **internal** cell, we need to
// offset either **into** the internal region, or **away** from it and
// **into** the padding:
auto it = block.begin().offset(dim, shift);
if (offset.dimension != -1) {
it.shift(offset.dimension, offset.amount);
}
if (offset.dimension_other != -1) {
it.shift(offset.dimension_other, offset.amount);
}
if constexpr (is_storage_accessor_v<decltype(it.storage())>) {
return it.storage();
} else {
return it.data();
}
}
/*==--- [1D face padding copy] ---------------------------------------------==*/
template <
typename SrcBlock,
typename DstBlock,
FaceLocation SrcLocation,
FaceLocation DstLocation,
block_1d_enable_t<SrcBlock> = 0>
auto memcopy_padding(
const SrcBlock& src_block,
DstBlock& dst_block,
CopySpecifierX<SrcLocation> src_specifier,
CopySpecifierX<DstLocation> dst_specifier) -> void {
if (dst_block.padding() == 0) {
return;
}
using Allocator = typename block_traits_t<SrcBlock>::Allocator;
const auto copy_type = src_block.template get_copy_type<DstBlock>();
const auto copy_size = Allocator::allocation_size(dst_block.padding());
const void* src_ptr = storage_ptr(src_block, src_specifier);
void* dst_ptr = storage_ptr(dst_block, dst_specifier);
// TODO: Add implementation for when there is no CUDA ...
ripple_check_cuda_result(cudaMemcpyAsync(
dst_ptr,
src_ptr,
copy_size,
copy_type,
is_device_block_v<SrcBlock> ? src_block.stream()
: is_device_block_v<DstBlock> ? dst_block.stream()
: 0));
}
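/* A minimal usage sketch (the block names here are assumed, not part of the
 * original header): copying the cells at the end face of `src` into the start
 * face region of `dst` for two 1D blocks could look like:
 *
 *   memcopy_padding(
 *     src,
 *     dst,
 *     CopySpecifierX<FaceLocation::end>(),
 *     CopySpecifierX<FaceLocation::start>());
 *
 * The copy is a no-op when `dst.padding() == 0`. */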
/*==--- [2D face padding copy] ---------------------------------------------==*/
template <
typename SrcBlock,
typename DstBlock,
size_t Dim,
FaceLocation SrcLocation,
FaceLocation DstLocation,
Mapping SrcMapping,
Mapping DstMapping,
block_2d_enable_t<SrcBlock> = 0>
auto memcopy_padding(
const SrcBlock& src_block,
DstBlock& dst_block,
CopySpecifier<Dim, SrcLocation, SrcMapping> src_specifier,
CopySpecifier<Dim, DstLocation, DstMapping> dst_specifier,
GpuStream stream = 0) -> void {
using Allocator = typename block_traits_t<SrcBlock>::Allocator;
static_assert(Dim <= DimY::value, "Invalid dimension!");
if (dst_block.padding() == 0) {
return;
}
OffsetParam p{
Dim == dimx() ? DimY::value : DimX::value,
-1 * static_cast<int>(src_block.padding())};
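/* The offset pulls the starting pointer back by the padding amount in the
 * dimension orthogonal to the face, so that the copied strip spans the full
 * padded extent of that dimension (corner cells included). */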
constexpr size_t num_types = Allocator::strided_types();
constexpr bool is_accessor =
is_storage_accessor_v<decltype(storage_ptr(src_block, src_specifier, p))>;
std::vector<const void*> src_ptrs;
std::vector<void*> dst_ptrs;
/* If the storage is a storage accessor then it may be strided and have
 * multiple types for which the memory needs to be copied, so here we create a
 * vector of all the pointers to copy, from which we can then perform the
 * generic implementation. */
if constexpr (is_accessor) {
for (auto* ptr : storage_ptr(src_block, src_specifier, p).data_ptrs()) {
src_ptrs.push_back(ptr);
}
for (auto* ptr : storage_ptr(dst_block, dst_specifier, p).data_ptrs()) {
dst_ptrs.push_back(ptr);
}
} else {
src_ptrs.push_back(storage_ptr(src_block, src_specifier, p));
dst_ptrs.push_back(storage_ptr(dst_block, dst_specifier, p));
}
const auto type = src_block.template get_copy_type<DstBlock>();
unrolled_for<num_types>([&](auto i) {
constexpr size_t bytes = Allocator::template element_byte_size<i>();
constexpr size_t elems = Allocator::template num_elements<i>();
/* Pitch is always the number of elements (including padding elements)
* multiplied by the number of bytes. */
const size_t pitch = bytes * src_block.pitch(dimx());
/* If copying in the x dimension we just need to copy the padding width,
 * otherwise, in the y dimension, we need to copy the whole width of the
 * x dimension: */
const size_t width =
bytes * (Dim == dimx() ? src_block.padding() : src_block.pitch(dimx()));
/* If copying in the y dimension it is the opposite of the above. We need to
 * copy the whole size of the y dimension for the height if the face is
 * in the x dimension, otherwise the height is the amount of padding.
 *
 * For the case that the data is strided, we are using the pitch for
 * a single element, so we need to multiply by the number of elements to
 * copy all the data. */
const size_t height =
(Dim == dimx() ? src_block.pitch(dimy()) : src_block.padding()) * elems;
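/* For example, with padding P and `elems` elements for the current strided
 * type:
 *   x face: `pitch(y) * elems` rows of P cells each, successive rows one
 *           padded x row (the pitch) apart;
 *   y face: `P * elems` full x rows, which are contiguous because the width
 *           equals the pitch. */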
ripple_check_cuda_result(cudaMemcpy2DAsync(
dst_ptrs[i], pitch, src_ptrs[i], pitch, width, height, type, stream));
});
}
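/* A minimal usage sketch (the block and stream names are assumed, not part of
 * the original header): copying the end-of-domain cells of `src` in y into
 * the start padding of `dst` on stream `s` could look like:
 *
 *   memcopy_padding(
 *     src,
 *     dst,
 *     CopySpecifier<DimY::value, FaceLocation::end, Mapping::domain>(),
 *     CopySpecifier<DimY::value, FaceLocation::start, Mapping::padding>(),
 *     s);
 */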
/*==--- [3D face padding copy] ---------------------------------------------==*/
template <
typename SrcBlock,
typename DstBlock,
size_t Dim,
FaceLocation SrcLocation,
FaceLocation DstLocation,
Mapping SrcMapping,
Mapping DstMapping,
block_3d_enable_t<SrcBlock> = 0>
auto memcopy_padding(
const SrcBlock& src_block,
DstBlock& dst_block,
CopySpecifier<Dim, SrcLocation, SrcMapping> src_specifier,
CopySpecifier<Dim, DstLocation, DstMapping> dst_specifier,
GpuStream stream = 0) -> void {
using Allocator = typename block_traits_t<SrcBlock>::Allocator;
static_assert(Dim <= DimZ::value, "Invalid dimension!");
if (dst_block.padding() == 0) {
return;
}
OffsetParam p{
(Dim + 1) % 3, -1 * static_cast<int>(src_block.padding()), (Dim + 2) % 3};
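/* Here the pointer is pulled back by the padding amount in both dimensions
 * orthogonal to the face, so the copied region spans the full padded extent
 * of those dimensions (edge and corner cells included). */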
constexpr size_t num_types = Allocator::strided_types();
constexpr bool is_accessor =
is_storage_accessor_v<decltype(storage_ptr(src_block, src_specifier, p))>;
std::vector<const void*> src_ptrs;
std::vector<void*> dst_ptrs;
// If the storage is a storage accessor then it may be strided and have
// multiple types for which the memory needs to be copied, so here we create a
// vector of all the pointers to copy, from which we can then perform the
// generic implementation.
if constexpr (is_accessor) {
for (auto* ptr : storage_ptr(src_block, src_specifier, p).data_ptrs()) {
src_ptrs.push_back(ptr);
}
for (auto* ptr : storage_ptr(dst_block, dst_specifier, p).data_ptrs()) {
dst_ptrs.push_back(ptr);
}
} else {
src_ptrs.push_back(storage_ptr(src_block, src_specifier, p));
dst_ptrs.push_back(storage_ptr(dst_block, dst_specifier, p));
}
const auto type = src_block.template get_copy_type<DstBlock>();
unrolled_for<num_types>([&](auto i) {
constexpr size_t bytes = Allocator::template element_byte_size<i>();
constexpr size_t elems = Allocator::template num_elements<i>();
// NOTE: This only seems to work for the z dimension, which is strange ...
// need to fix!
/* The pitch here is tricky. At the very least, the pitch is the number of
 * elements (including padding elements) multiplied by the number of bytes:
 *
 * - bytes per element * elements in dim x
 *
 * then, if we are copying in y or z, we multiply by the number of elements
 * for the strided case.
 *
 * Lastly, for the z case, we want to copy `padding` x-y planes, for which
 * the data in each plane is contiguous, and each plane of the padding is
 * also contiguous, so we can just set the pitch to be the entire region to
 * copy, which is better for performance.
 */
const size_t pitch = bytes * src_block.pitch(dimx()) *
(Dim == dimx() ? 1 : elems) *
(Dim == dimz() ? src_block.pitch(dimy()) : 1);
/* If copying in the x dimension we just need to copy the padding width; in
 * the y dimension we need to copy the whole width of the x dimension; and
 * in the z dimension the width is the entire pitch (whole x-y planes): */
const size_t width =
Dim == dimz() ? pitch
: bytes * (Dim == dimx() ? src_block.padding()
: src_block.pitch(dimx()) * elems);
/* For a z face the height is simply the padding amount: we copy that many
 * whole x-y planes. Otherwise the height spans the z dimension, and for a
 * face in x it additionally spans the y dimension (multiplied by the number
 * of elements for strided data), since each row of the copy is then a run
 * along the x dimension. */
const size_t height =
Dim == dimz() ? src_block.padding()
: src_block.pitch(dimz()) *
(Dim != dimx() ? 1 : src_block.pitch(dimy()) * elems);
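/* For example, for a z face the pitch covers one whole padded x-y plane, the
 * width equals the pitch, and the height is the padding amount, so the copy
 * moves `padding` contiguous x-y planes in a single call. For an x face the
 * copy is `pitch(y) * pitch(z) * elems` rows of `padding` cells, each row one
 * padded x row apart. */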
ripple_check_cuda_result(ripple_if_cuda(cudaMemcpy2DAsync(
dst_ptrs[i], pitch, src_ptrs[i], pitch, width, height, type, stream)));
});
}
} // namespace ripple
#endif // RIPPLE_CONTAINER_MEMCOPY_PADDING_HPP