Program Listing for File memcopy_padding.hpp
↰ Return to documentation for file (include/ripple/container/memcopy_padding.hpp)
#ifndef RIPPLE_CONTAINER_MEMCOPY_PADDING_HPP
#define RIPPLE_CONTAINER_MEMCOPY_PADDING_HPP
#include "block_traits.hpp"
namespace ripple {
/*==--- [faces between blocks] ---------------------------------------------==*/
enum class FaceLocation : uint8_t {
start = 0, //< Face at the start of the domain.
end = 1    //< Face at the end of the domain.
};
enum class Mapping : int {
domain = 0,  //< Map to the domain (interior) cells.
padding = 1  //< Map to the padding cells.
};
/* Specifies a face for copying: the dimension of the face, its location, and
 * whether the pointer should map to the domain or the padding cells. */
template <size_t Dim, FaceLocation Location, Mapping Map = Mapping::domain>
struct CopySpecifier {};
/*==--- [copy specifier aliases] -------------------------------------------==*/
template <FaceLocation Location>
using CopySpecifierX = CopySpecifier<DimX::value, Location>;
template <FaceLocation Location>
using CopySpecifierY = CopySpecifier<DimY::value, Location>;
/*==--- [padding utilities] ------------------------------------------------==*/
struct OffsetParam {
int dimension = -1;       //< Dimension to offset in (-1 means no offset).
int amount = 0;           //< Number of cells to offset by in each dimension.
int dimension_other = -1; //< Second dimension to offset in (-1 means none).
};
template <typename Block, size_t Dim, FaceLocation Location, Mapping Map>
auto storage_ptr(
Block& block,
CopySpecifier<Dim, Location, Map>,
OffsetParam offset = OffsetParam()) noexcept {
constexpr auto dim = Dimension<Dim>();
constexpr bool pad = Map == Mapping::padding;
int shift = 0;
if constexpr (Location == FaceLocation::start) {
// If a pointer into the padding is required then we need to offset backwards
// by the padding amount, since the iterator points to the first valid cell.
shift = (pad ? -1 : 0) * block.padding();
} else {
// If a pointer into the domain is required then we need to subtract the
// padding amount from the size of the domain, since the size of the block
// gives the end of the domain (or the start of the padding):
shift = block.size(dim) + (pad ? 0 : -1) * block.padding();
}
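// For example, with block.size(dim) == N and block.padding() == P, the shift
// computed above resolves to:
//   start / domain  :  0      (first interior cell)
//   start / padding : -P      (first padding cell before the domain)
//   end   / domain  :  N - P  (first of the last P interior cells)
//   end   / padding :  N      (first padding cell after the domain)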
// Since begin() gives a pointer to the first **internal** cell, we need to
// offset either **into** the internal region, or **away** from it and
// **into** the padding:
auto it = block.begin().offset(dim, shift);
if (offset.dimension != -1) {
it.shift(offset.dimension, offset.amount);
}
if (offset.dimension_other != -1) {
it.shift(offset.dimension_other, offset.amount);
}
if constexpr (is_storage_accessor_v<decltype(it.storage())>) {
return it.storage();
} else {
return it.data();
}
}
/*==--- [1D face padding copy] ---------------------------------------------==*/
template <
typename SrcBlock,
typename DstBlock,
FaceLocation SrcLocation,
FaceLocation DstLocation,
block_1d_enable_t<SrcBlock> = 0>
auto memcopy_padding(
const SrcBlock& src_block,
DstBlock& dst_block,
CopySpecifierX<SrcLocation> src_specifier,
CopySpecifierX<DstLocation> dst_specifier) -> void {
if (dst_block.padding() == 0) {
return;
}
using Allocator = typename block_traits_t<SrcBlock>::Allocator;
const auto copy_type = src_block.template get_copy_type<DstBlock>();
const auto copy_size = Allocator::allocation_size(dst_block.padding());
const void* src_ptr = storage_ptr(src_block, src_specifier);
void* dst_ptr = storage_ptr(dst_block, dst_specifier);
// TODO: Add implementation for when there is no CUDA ...
ripple_check_cuda_result(cudaMemcpyAsync(
dst_ptr,
src_ptr,
copy_size,
copy_type,
is_device_block_v<SrcBlock> ? src_block.stream()
: is_device_block_v<DstBlock> ? dst_block.stream()
: 0));
}
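/* A minimal usage sketch (the block names here are assumed, not part of the
 * original header): copying the cells at the end face of `src` into the start
 * face region of `dst` for two 1D blocks could look like:
 *
 *   memcopy_padding(
 *     src,
 *     dst,
 *     CopySpecifierX<FaceLocation::end>(),
 *     CopySpecifierX<FaceLocation::start>());
 *
 * The copy is a no-op when `dst.padding() == 0`. */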
/*==--- [2D face padding copy] ---------------------------------------------==*/
template <
typename SrcBlock,
typename DstBlock,
size_t Dim,
FaceLocation SrcLocation,
FaceLocation DstLocation,
Mapping SrcMapping,
Mapping DstMapping,
block_2d_enable_t<SrcBlock> = 0>
auto memcopy_padding(
const SrcBlock& src_block,
DstBlock& dst_block,
CopySpecifier<Dim, SrcLocation, SrcMapping> src_specifier,
CopySpecifier<Dim, DstLocation, DstMapping> dst_specifier,
GpuStream stream = 0) -> void {
using Allocator = typename block_traits_t<SrcBlock>::Allocator;
static_assert(Dim <= DimY::value, "Invalid dimension!");
if (dst_block.padding() == 0) {
return;
}
OffsetParam p{
Dim == dimx() ? DimY::value : DimX::value,
-1 * static_cast<int>(src_block.padding())};
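/* The offset pulls the starting pointer back by the padding amount in the
 * dimension orthogonal to the face, so that the copied strip spans the full
 * padded extent of that dimension (corner cells included). */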
constexpr size_t num_types = Allocator::strided_types();
constexpr bool is_accessor =
is_storage_accessor_v<decltype(storage_ptr(src_block, src_specifier, p))>;
std::vector<const void*> src_ptrs;
std::vector<void*> dst_ptrs;
/* If the storage is a storage accessor then it may be strided and have
 * multiple types for which the memory needs to be copied, so here we create a
 * vector of all the pointers to copy, from which we can then perform the
 * generic implementation. */
if constexpr (is_accessor) {
for (auto* ptr : storage_ptr(src_block, src_specifier, p).data_ptrs()) {
src_ptrs.push_back(ptr);
}
for (auto* ptr : storage_ptr(dst_block, dst_specifier, p).data_ptrs()) {
dst_ptrs.push_back(ptr);
}
} else {
src_ptrs.push_back(storage_ptr(src_block, src_specifier, p));
dst_ptrs.push_back(storage_ptr(dst_block, dst_specifier, p));
}
const auto type = src_block.template get_copy_type<DstBlock>();
unrolled_for<num_types>([&](auto i) {
constexpr size_t bytes = Allocator::template element_byte_size<i>();
constexpr size_t elems = Allocator::template num_elements<i>();
/* Pitch is always the number of elements (including padding elements)
* multiplied by the number of bytes. */
const size_t pitch = bytes * src_block.pitch(dimx());
/* If copying in the x dimension we just need to copy the padding width,
 * otherwise, in the y dimension, we need to copy the whole width of the
 * x dimension: */
const size_t width =
bytes * (Dim == dimx() ? src_block.padding() : src_block.pitch(dimx()));
/* If copying in the y dimension it is the opposite of the above. We need to
 * copy the whole size of the y dimension for the height if the face is
 * in the x dimension, otherwise the height is the amount of padding.
 *
 * For the case that the data is strided, we are using the pitch for
 * a single element, so we need to multiply by the number of elements to
 * copy all the data. */
const size_t height =
(Dim == dimx() ? src_block.pitch(dimy()) : src_block.padding()) * elems;
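/* For example, with padding P and `elems` elements for the current strided
 * type:
 *   x face: `pitch(y) * elems` rows of P cells each, successive rows one
 *           padded x row (the pitch) apart;
 *   y face: `P * elems` full x rows, which are contiguous because the width
 *           equals the pitch. */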
ripple_check_cuda_result(cudaMemcpy2DAsync(
dst_ptrs[i], pitch, src_ptrs[i], pitch, width, height, type, stream));
});
}
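/* A minimal usage sketch (the block and stream names are assumed, not part of
 * the original header): copying the end-of-domain cells of `src` in y into
 * the start padding of `dst` on stream `s` could look like:
 *
 *   memcopy_padding(
 *     src,
 *     dst,
 *     CopySpecifier<DimY::value, FaceLocation::end, Mapping::domain>(),
 *     CopySpecifier<DimY::value, FaceLocation::start, Mapping::padding>(),
 *     s);
 */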
/*==--- [3D face padding copy] ---------------------------------------------==*/
template <
typename SrcBlock,
typename DstBlock,
size_t Dim,
FaceLocation SrcLocation,
FaceLocation DstLocation,
Mapping SrcMapping,
Mapping DstMapping,
block_3d_enable_t<SrcBlock> = 0>
auto memcopy_padding(
const SrcBlock& src_block,
DstBlock& dst_block,
CopySpecifier<Dim, SrcLocation, SrcMapping> src_specifier,
CopySpecifier<Dim, DstLocation, DstMapping> dst_specifier,
GpuStream stream = 0) -> void {
using Allocator = typename block_traits_t<SrcBlock>::Allocator;
static_assert(Dim <= DimZ::value, "Invalid dimension!");
if (dst_block.padding() == 0) {
return;
}
OffsetParam p{
(Dim + 1) % 3, -1 * static_cast<int>(src_block.padding()), (Dim + 2) % 3};
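/* Here the pointer is pulled back by the padding amount in both dimensions
 * orthogonal to the face, so the copied region spans the full padded extent
 * of those dimensions (edge and corner cells included). */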
constexpr size_t num_types = Allocator::strided_types();
constexpr bool is_accessor =
is_storage_accessor_v<decltype(storage_ptr(src_block, src_specifier, p))>;
std::vector<const void*> src_ptrs;
std::vector<void*> dst_ptrs;
// If the storage is a storage accessor then it may be strided and have
// multiple types for which the memory needs to be copied, so here we create a
// vector of all the pointers to copy, from which we can then perform the
// generic implementation.
if constexpr (is_accessor) {
for (auto* ptr : storage_ptr(src_block, src_specifier, p).data_ptrs()) {
src_ptrs.push_back(ptr);
}
for (auto* ptr : storage_ptr(dst_block, dst_specifier, p).data_ptrs()) {
dst_ptrs.push_back(ptr);
}
} else {
src_ptrs.push_back(storage_ptr(src_block, src_specifier, p));
dst_ptrs.push_back(storage_ptr(dst_block, dst_specifier, p));
}
const auto type = src_block.template get_copy_type<DstBlock>();
unrolled_for<num_types>([&](auto i) {
constexpr size_t bytes = Allocator::template element_byte_size<i>();
constexpr size_t elems = Allocator::template num_elements<i>();
// NOTE: This only seems to work for the z dimension, which is strange ...
// need to fix!
/* The pitch here is tricky. At the very least, the pitch is the number of
 * elements (including padding elements) multiplied by the number of bytes:
 *
 * - bytes per element * elements in dim x
 *
 * then, if we are copying in y or z, we multiply by the number of elements
 * for the strided case.
 *
 * Lastly, for the z case, we want to copy `padding` x-y planes, for which
 * the data in each plane is contiguous, and each plane of the padding is
 * also contiguous, so we can just set the pitch to be the entire region to
 * copy, which is better for performance.
 */
const size_t pitch = bytes * src_block.pitch(dimx()) *
(Dim == dimx() ? 1 : elems) *
(Dim == dimz() ? src_block.pitch(dimy()) : 1);
/* If copying in the x dimension we just need to copy the padding width; in
 * the y dimension we need to copy the whole width of the x dimension; and
 * in the z dimension the width is the entire pitch (whole x-y planes): */
const size_t width =
Dim == dimz() ? pitch
: bytes * (Dim == dimx() ? src_block.padding()
: src_block.pitch(dimx()) * elems);
/* For a z face the height is simply the padding amount: we copy that many
 * whole x-y planes. Otherwise the height spans the z dimension, and for a
 * face in x it additionally spans the y dimension (multiplied by the number
 * of elements for strided data), since each row of the copy is then a run
 * along the x dimension. */
const size_t height =
Dim == dimz() ? src_block.padding()
: src_block.pitch(dimz()) *
(Dim != dimx() ? 1 : src_block.pitch(dimy()) * elems);
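/* For example, for a z face the pitch covers one whole padded x-y plane, the
 * width equals the pitch, and the height is the padding amount, so the copy
 * moves `padding` contiguous x-y planes in a single call. For an x face the
 * copy is `pitch(y) * pitch(z) * elems` rows of `padding` cells, each row one
 * padded x row apart. */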
ripple_check_cuda_result(ripple_if_cuda(cudaMemcpy2DAsync(
dst_ptrs[i], pitch, src_ptrs[i], pitch, width, height, type, stream)));
});
}
} // namespace ripple
#endif // RIPPLE_CONTAINER_MEMCOPY_PADDING_HPP