include/cuda/experimental/__stf/internal/slice.cuh

File members: include/cuda/experimental/__stf/internal/slice.cuh

//===----------------------------------------------------------------------===//
//
// Part of CUDASTF in CUDA C++ Core Libraries,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#pragma once

#include <cuda/__cccl_config>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
#  pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
#  pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
#  pragma system_header
#endif // no system header

#include <cuda/std/mdspan>

#include <cuda/experimental/__stf/utility/core.cuh>
#include <cuda/experimental/__stf/utility/dimensions.cuh>
#include <cuda/experimental/__stf/utility/hash.cuh>
#include <cuda/experimental/__stf/utility/memory.cuh>

#include <iostream>

namespace cuda::experimental::stf
{
using ::cuda::std::mdspan;
}

namespace cuda::experimental::stf
{

template <typename T>
class shape_of;

#if defined(CUDASTF_BOUNDSCHECK) && defined(NDEBUG)
#  error "CUDASTF_BOUNDSCHECK requires that NDEBUG is not defined."
#endif

struct layout_stride : ::cuda::std::layout_stride
{
  template <class Extents>
  struct mapping : ::cuda::std::layout_stride::mapping<Extents>
  {
    constexpr mapping() = default;

    template <typename... A>
    constexpr _CCCL_HOST_DEVICE mapping(A&&... a)
        : ::cuda::std::layout_stride::mapping<Extents>(::std::forward<A>(a)...)
    {}

    template <typename... is_t>
    constexpr _CCCL_HOST_DEVICE auto operator()(is_t&&... is) const
    {
#ifdef CUDASTF_BOUNDSCHECK
      each_in_pack(
        [&](auto r, const auto& i) {
          _CCCL_ASSERT(i < this->extents().extent(r), "Index out of bounds.");
        },
        is...);
#endif
      return ::cuda::std::layout_stride::mapping<Extents>::operator()(::std::forward<is_t>(is)...);
    }
  };
};

template <typename T, size_t dimensions = 1>
using slice = mdspan<T, ::cuda::std::dextents<size_t, dimensions>, layout_stride>;

template <typename S>
size_t contiguous_dims(const S& span)
{
  if (span.rank() < 2)
  {
    return span.rank();
  }

  // Repeat until size != stride
  size_t contiguous_dims = 1;
  size_t prod_strides    = 1;
  while (contiguous_dims < span.rank()
         && span.extent(contiguous_dims - 1) * prod_strides == span.stride(contiguous_dims))
  {
    prod_strides *= span.stride(contiguous_dims);
    contiguous_dims++;
  }

  assert(contiguous_dims <= span.rank());
  return contiguous_dims;
}

template <typename ElementType, typename... Extents, typename... Strides>
_CCCL_HOST_DEVICE auto make_slice(ElementType* data, const ::std::tuple<Extents...>& extents, const Strides&... strides)
{
  static_assert(sizeof...(Extents) == sizeof...(Strides) + 1);
  using Result = slice<ElementType, sizeof...(Extents)>;
  auto sizes   = ::std::apply(
    [&](auto&&... e) {
      return ::cuda::std::array<size_t, sizeof...(Extents)>{size_t(e)...};
    },
    extents);
  ::cuda::std::array<size_t, Result::rank()> mdspan_strides{1, size_t(strides)...};
  return Result(data, typename Result::mapping_type(sizes, mdspan_strides));
}

template <typename T, typename... Extents>
_CCCL_HOST_DEVICE auto make_slice(T* data, const Extents&... extents)
{
  using Result = slice<T, sizeof...(Extents)>;
  static_assert(sizeof...(Extents) == Result::rank());

  if constexpr (sizeof...(Extents) == 0)
  {
    return Result(data, typename Result::mapping_type());
  }
  else
  {
    ::cuda::std::array<size_t, Result::rank()> sizes{size_t(extents)...}, strides;
    for (size_t i = 1; i < strides.size(); ++i)
    {
      strides[i] = sizes[i - 1];
    }

    strides[0] = sizes.size() != 0 && sizes.begin()[0] != 0;
    for (size_t i = 2; i < strides.size(); ++i)
    {
      strides.begin()[i] *= strides.begin()[i - 1];
    }
    return Result(data, typename Result::mapping_type(sizes, strides));
  }
}

namespace reserved
{

template <typename View, typename... Whatevs>
auto make_mdview(Whatevs&&... whatevs)
{
  return make_slice(::std::forward<Whatevs>(whatevs)...);
}

} // namespace reserved

#ifdef UNITTESTED_FILE
#  ifdef STF_HAS_UNITTEST_WITH_ARGS
UNITTEST("slice usual suspects: default ctor, copy, move, assign", (slice<double, 2>()))
{
  using View = ::std::remove_reference_t<decltype(unittest_param)>;
  UNITTEST("should work for several sizes", 1u, 13u)
  {
    View s;
    EXPECT(s.extent(0) == 0);
    EXPECT(s.extent(1) == 0);
    EXPECT(s.size() == 0);

    double data[200 * 100];
    s = reserved::make_mdview<View>(data, ::std::make_tuple(200, unittest_param), 200);
    EXPECT(s.extent(0) == 200);
    EXPECT(s.extent(1) == unittest_param);
    EXPECT(s.size() == 200 * unittest_param);
  };
};

UNITTEST("2D slice basics", (slice<double, 2>()))
{
  using View = ::std::remove_reference_t<decltype(unittest_param)>;
  // Bidimensional array of 200 rows of 100 elements each
  double data[200 * 100];
  // Access the first 13 elements of each row in that array
  auto s = reserved::make_mdview<View>(data, ::std::make_tuple(200, 13), 200);
  EXPECT(s.extent(0) == 200);
  EXPECT(s.extent(1) == 13);
  EXPECT(s.size() == 200 * 13);
  EXPECT(&s(0, 0) == data);
  EXPECT(&s(1, 0) == data + 1);
  EXPECT(&s(2, 0) == data + 2);
  EXPECT(&s(2, 3) == data + 2 + 3 * 200);
};

UNITTEST("3D slice basics", (slice<double, 3>()))
{
  using View = ::std::remove_reference_t<decltype(unittest_param)>;
  // 3-dimensional array of 200 by 100 by 27
  double data[14 * 27 * 300];
  // Access the first 13*14 elements of each row in that array
  auto s = reserved::make_mdview<View>(data, ::std::tuple{200, 13, 14}, 300, 27 * 300);
  EXPECT(s.extent(0) == 200);
  EXPECT(s.extent(1) == 13);
  EXPECT(s.extent(2) == 14);
  EXPECT(s.size() == 200 * 13 * 14);
  EXPECT(&s(0, 0, 0) == data);
  EXPECT(&s(1, 0, 0) == data + 1);
  EXPECT(&s(2, 3, 0) == data + 2 + 3 * 300);
  EXPECT(&s(3, 5, 7) == data + 3 + 5 * 300 + 7 * 27 * 300);
};

UNITTEST("2D tiles", (slice<int, 2>()))
{
  using View = ::std::remove_reference_t<decltype(unittest_param)>;
  // 6x4 matrix, 4 tiles of size 3x2, make sure we touch all entries of the
  // original matrix once when iterating over the tiles
  int data[6 * 4] = {0};
  auto s00        = reserved::make_mdview<View>(data + 0, ::std::tuple{3, 2}, 6);
  auto s10        = reserved::make_mdview<View>(data + 3, ::std::tuple{3, 2}, 6);
  auto s01        = reserved::make_mdview<View>(data + 12, ::std::tuple{3, 2}, 6);
  auto s11        = reserved::make_mdview<View>(data + 15, ::std::tuple{3, 2}, 6);

  for (size_t j = 0; j < s00.extent(1); j++)
  {
    for (size_t i = 0; i < s00.extent(0); i++)
    {
      s00(i, j) = 42;
    }
  }

  for (size_t j = 0; j < s10.extent(1); j++)
  {
    for (size_t i = 0; i < s10.extent(0); i++)
    {
      s10(i, j) = 42;
    }
  }

  for (size_t j = 0; j < s01.extent(1); j++)
  {
    for (size_t i = 0; i < s01.extent(0); i++)
    {
      s01(i, j) = 42;
    }
  }

  for (size_t j = 0; j < s11.extent(1); j++)
  {
    for (size_t i = 0; i < s11.extent(0); i++)
    {
      s11(i, j) = 42;
    }
  }

  for (size_t i = 0; i < 4 * 6; i++)
  {
    EXPECT(data[i] == 42);
  }
};

UNITTEST("contiguous_dims 1D", (slice<int, 1>()))
{
  using View = ::std::remove_reference_t<decltype(unittest_param)>;
  int a[10];
  auto s = reserved::make_mdview<View>(a, 10);
  EXPECT(contiguous_dims(s) == 1);
};

UNITTEST("contiguous_dims 2D", (slice<int, 2>()))
{
  using View = ::std::remove_reference_t<decltype(unittest_param)>;
  // This is non-contiguous
  int a[3 * 2];
  auto s = reserved::make_mdview<View>(a, ::std::tuple{3, 2}, 6);
  EXPECT(contiguous_dims(s) == 1);

  // This is contiguous
  int b[6 * 2];
  auto s2 = reserved::make_mdview<View>(b, ::std::tuple{6, 2}, 6);
  EXPECT(contiguous_dims(s2) == 2);
};

UNITTEST("contiguous_dims 3D", (slice<int, 3>()))
{
  using View = ::std::remove_reference_t<decltype(unittest_param)>;
  int a[3 * 2 * 10];
  auto s = reserved::make_mdview<View>(a, ::std::tuple{3, 2, 10}, 6, 12);
  EXPECT(contiguous_dims(s) == 1);

  auto s2 = reserved::make_mdview<View>(a, ::std::tuple{3, 2, 10}, 3, 3 * 2);
  EXPECT(contiguous_dims(s2) == 3);

  auto s3 = reserved::make_mdview<View>(a, ::std::tuple{3, 2, 10}, 3, 10);
  EXPECT(contiguous_dims(s3) == 2);
};

UNITTEST("implicit contiguous strides", (slice<int, 3>()))
{
  using View = ::std::remove_reference_t<decltype(unittest_param)>;
  int a[3 * 2 * 10];
  auto s = reserved::make_mdview<View>(a, 3, 2, 10);
  EXPECT(s.stride(0) == 1);
  EXPECT(s.stride(1) == 3);
  EXPECT(s.stride(2) == 3 * 2);
  EXPECT(contiguous_dims(s) == 3);
};

#  endif // STF_HAS_UNITTEST_WITH_ARGS
#endif // UNITTESTED_FILE

template <typename T, typename... P>
class shape_of<mdspan<T, P...>>
{
public:
  using described_type = mdspan<T, P...>;
  using coords_t       = array_tuple<size_t, described_type::rank()>;

  _CCCL_HOST_DEVICE static constexpr size_t rank()
  {
    return described_type::rank();
  }

  // Functions to create the begin and end iterators
  _CCCL_HOST_DEVICE auto begin()
  {
    ::std::array<size_t, rank()> sizes;
    unroll<rank()>([&](auto i) {
      sizes[i] = extents.extent(i);
    });
    return box<rank()>(sizes).begin();
  }

  _CCCL_HOST_DEVICE auto end()
  {
    ::std::array<size_t, rank()> sizes;
    unroll<rank()>([&](auto i) {
      sizes[i] = extents.extent(i);
    });
    return box<rank()>(sizes).end();
  }

  shape_of() = default;

  shape_of(const shape_of&) = default;

  explicit _CCCL_HOST_DEVICE shape_of(const described_type& x)
      : extents(x.extents())
      , strides(x.mapping().strides())
  {}

  explicit shape_of(const coords_t& sizes)
      : extents(reserved::to_cuda_array(sizes))
  {
    size_t product_sizes = 1;
    unroll<rank()>([&](auto i) {
      if (i == 0)
      {
        strides[i] = ::std::get<0>(sizes) != 0;
      }
      else
      {
        product_sizes *= extent(i - 1);
        strides[i] = product_sizes;
      }
    });
  }

  template <typename... Sizes>
  explicit shape_of(size_t size0, Sizes&&... sizes)
      : extents(size0, ::std::forward<Sizes>(sizes)...)
  {
    static_assert(sizeof...(sizes) + 1 == rank(), "Wrong number of arguments passed to shape_of.");

    strides[0]           = size0 != 0;
    size_t product_sizes = 1;
    for (size_t i = 1; i < rank(); ++i)
    {
      product_sizes *= extent(i - 1);
      strides[i] = product_sizes;
    }
  }

  explicit shape_of(const ::std::array<size_t, rank()>& sizes)
      : extents(reserved::convert_to_cuda_array(sizes))
  {}

  explicit shape_of(const ::std::array<size_t, rank()>& sizes, const ::std::array<size_t, rank()>& _strides)
      : shape_of(sizes)
  {
    if constexpr (rank() > 1)
    {
      size_t n = 0;
      for (auto i = _strides.begin(); i != _strides.end(); ++i)
      {
        strides[n++] = *i;
      }
      for (auto i = strides.rbegin() + 1; i != strides.rend(); ++i)
      {
        *i *= i[1];
      }
    }
  }

  bool operator==(const shape_of& other) const
  {
    return extents == other.extents && strides == other.strides;
  }

  constexpr _CCCL_HOST_DEVICE size_t extent(size_t dim) const
  {
    assert(dim < rank());
    return extents.extent(dim);
  }

  constexpr _CCCL_HOST_DEVICE size_t stride(size_t dim) const
  {
    assert(dim < rank());
    return strides[dim];
  }

  _CCCL_HOST_DEVICE size_t size() const
  {
    size_t result = 1;
    for (size_t i = 0; i != rank(); ++i)
    {
      result *= extents.extent(i);
    }
    return result;
  }

  _CCCL_HOST_DEVICE ::std::array<size_t, rank()> get_sizes() const
  {
    ::std::array<size_t, rank()> result;
    for (size_t i = 0; i < rank(); ++i)
    {
      result[i] = extents.extent(i);
    }
    return result;
  }

  dim4 get_data_dims() const
  {
    static_assert(rank() < 5);

    dim4 dims(0);
    if constexpr (rank() >= 1)
    {
      dims.x = static_cast<int>(extents.extent(0));
    }
    if constexpr (rank() >= 2)
    {
      dims.y = static_cast<int>(extents.extent(1));
    }
    if constexpr (rank() >= 3)
    {
      dims.z = static_cast<int>(extents.extent(2));
    }
    if constexpr (rank() >= 4)
    {
      dims.t = static_cast<int>(extents.extent(3));
    }
    return dims;
  }

  void set_contiguous_strides(const ::std::array<size_t, rank()>& sizes);

  described_type create(T* base) const
  {
    return described_type(base, typename described_type::mapping_type(extents, strides));
  }

  // This transforms a tuple of (shape, 1D index) into a coordinate
  _CCCL_HOST_DEVICE coords_t index_to_coords(size_t index) const
  {
    ::std::array<size_t, shape_of::rank()> coordinates{};
    // for (::std::ptrdiff_t i = _dimensions - 1; i >= 0; i--)
    for (auto i : each(0, shape_of::rank()))
    {
      coordinates[i] = index % extent(i);
      index /= extent(i);
    }

    return ::std::apply(
      [](const auto&... e) {
        return ::std::make_tuple(e...);
      },
      coordinates);
  }

private:
  typename described_type::extents_type extents{};
  ::cuda::std::array<typename described_type::index_type, described_type::rank()> strides{};
};

#ifdef UNITTESTED_FILE
#  ifdef STF_HAS_UNITTEST_WITH_ARGS
UNITTEST("shape_of for slice and mdspan", (slice<double, 3>()))
{
  using View = ::std::remove_reference_t<decltype(unittest_param)>;
  using s    = shape_of<View>;

  static_assert(s::rank() == 3);
  EXPECT(s::rank() == 3);

  s shape_obj1;
  assert(shape_obj1.get_sizes() == (::std::array<size_t, 3>{0, 0, 0}));
  EXPECT(shape_obj1.size() == 0);
  EXPECT(shape_obj1.extent(0) == 0);
  EXPECT(shape_obj1.extent(1) == 0);
  EXPECT(shape_obj1.extent(2) == 0);
  EXPECT(shape_obj1.stride(0) == 0);
  EXPECT(shape_obj1.stride(1) == 0);
  EXPECT(shape_obj1.stride(2) == 0);

  s shape_obj2{2, 3, 4};
  assert(shape_obj2.get_sizes() == (::std::array<size_t, 3>{2, 3, 4}));
  EXPECT(shape_obj2.size() == 2 * 3 * 4);
  EXPECT(shape_obj2.extent(0) == 2);
  EXPECT(shape_obj2.extent(1) == 3);
  EXPECT(shape_obj2.extent(2) == 4);
  EXPECT(shape_obj2.stride(0) == 1);
  EXPECT(shape_obj2.stride(1) == 2);
  EXPECT(shape_obj2.stride(2) == 2 * 3);

  s shape_obj3{{200, 13, 14}, {300, 27 * 300}};
  assert(shape_obj3.get_sizes() == (::std::array<size_t, 3>{200, 13, 14}));
  EXPECT(shape_obj3.size() == 200 * 13 * 14);
  EXPECT(shape_obj3.extent(0) == 200);
  EXPECT(shape_obj3.extent(1) == 13);
  EXPECT(shape_obj3.extent(2) == 14);

  for (auto i : shape_obj3)
  {
    EXPECT(i[0] < shape_obj3.extent(0));
    EXPECT(i[1] < shape_obj3.extent(1));
    EXPECT(i[2] < shape_obj3.extent(2));
  }
};

UNITTEST("3D slice should be similar to 3D mdspan", (slice<double, 3>()))
{
  // 3-dimensional array of 3 by 4 by 5
  using View = ::std::remove_reference_t<decltype(unittest_param)>;

  double data[3 * 4 * 5];
  for (size_t i = 0; i < sizeof(data) / sizeof(data[0]); ++i)
  {
    data[i] = 1.0 * i;
  }
  // Access each element of the array
  auto m = reserved::make_mdview<View>(data, ::std::tuple{3, 4, 5}, 3, 3 * 4);

  EXPECT(m.extent(0) == 3);
  EXPECT(m.extent(1) == 4);
  EXPECT(m.extent(2) == 5);

  for (size_t i = 0; i < m.extent(0); ++i)
  {
    for (size_t j = 0; j < m.extent(1); ++j)
    {
      for (size_t k = 0; k < m.extent(2); ++k)
      {
        EXPECT(m(i, j, k) == i + j * 3 + k * 3 * 4);
      }
    }
  }

  // An array without explicit strides should behave just the same as the one above
  m = reserved::make_mdview<View>(data, 3, 4, 5);

  EXPECT(m.extent(0) == 3);
  EXPECT(m.extent(1) == 4);
  EXPECT(m.extent(2) == 5);

  for (size_t i = 0; i < m.extent(0); ++i)
  {
    for (size_t j = 0; j < m.extent(1); ++j)
    {
      for (size_t k = 0; k < m.extent(2); ++k)
      {
        EXPECT(m(i, j, k) == i + j * 3 + k * 3 * 4);
      }
    }
  }

  // To access a subset of the array, use the same strides but different extents

  auto m2 = reserved::make_mdview<View>(data, ::std::tuple{3, 2, 1}, 3, 3 * 4);

  EXPECT(m2.extent(0) == 3);
  EXPECT(m2.extent(1) == 2);
  EXPECT(m2.extent(2) == 1);

  for (size_t i = 0; i < m2.extent(0); ++i)
  {
    for (size_t j = 0; j < m2.extent(1); ++j)
    {
      for (size_t k = 0; k < m2.extent(2); ++k)
      {
        // printf("m2(%zu, %zu, %zu) = %g\n", i, j, k, m2(i, j, k));
        m2(i, j, k) = -1;
      }
    }
  }

  // first dim is the fastest moving
  double witness[3 * 4 * 5] = {
    /*0, 0, 0*/ -1,
    /*1, 0, 0*/ -1,
    /*2, 0, 0*/ -1,
    /*0, 1, 0*/ -1,
    /*1, 1, 0*/ -1,
    /*2, 1, 0*/ -1,
    6,
    7,
    8,
    9,
    10,
    11,
    12,
    13,
    14,
    15,
    16,
    17,
    18,
    19,
    20,
    21,
    22,
    23,
    24,
    25,
    26,
    27,
    28,
    29,
    30,
    31,
    32,
    33,
    34,
    35,
    36,
    37,
    38,
    39,
    40,
    41,
    42,
    43,
    44,
    45,
    46,
    47,
    48,
    49,
    50,
    51,
    52,
    53,
    54,
    55,
    56,
    57,
    58,
    59};
  EXPECT(::std::equal(data, data + 3 * 4 * 5, witness));
};

#  endif // STF_HAS_UNITTEST_WITH_ARGS
#endif // UNITTESTED_FILE

template <typename T, typename... P>
bool pin(mdspan<T, P...>& s)
{
  // We need the rank as a constexpr value
  constexpr auto rank = mdspan<T, P...>::extents_type::rank();

  if (address_is_pinned(s.data_handle()))
  {
    return false;
  }

  if constexpr (rank == 0)
  {
    cuda_safe_call(pin_memory(s.data_handle(), 1));
  }
  else if constexpr (rank == 1)
  {
    cuda_safe_call(pin_memory(s.data_handle(), s.extent(0)));
  }
  else if constexpr (rank == 2)
  {
    switch (contiguous_dims(s))
    {
      case 1:
        for (size_t index_1 = 0; index_1 < s.extent(1); index_1++)
        {
          cuda_safe_call(pin_memory(&s(0, index_1) + index_1 * s.stride(1), s.extent(0)));
        }
        break;
      case 2:
        // fprintf(stderr, "PIN 2D - contiguous\n");
        cuda_safe_call(pin_memory(s.data_handle(), s.extent(0) * s.extent(1)));
        break;
      default:
        assert(false);
        abort();
    }
  }
  else
  {
    static_assert(rank == 3, "Dimensionality not supported.");
    switch (contiguous_dims(s))
    {
      case 1:
        for (size_t index_2 = 0; index_2 < s.extent(2); index_2++)
        {
          for (size_t index_1 = 0; index_1 < s.extent(1); index_1++)
          {
            // fprintf(stderr, "ADDR %d,%d,0 = %p \n", index_2, index_1, &s(index_2, index_1, 0));
            cuda_safe_call(pin_memory(&s(0, index_1, index_2), s.extent(0)));
          }
        }
        break;
      case 2:
        for (size_t index_2 = 0; index_2 < s.extent(2); index_2++)
        {
          cuda_safe_call(pin_memory(&s(0, 0, index_2), s.extent(0) * s.extent(1)));
        }
        break;
      case 3:
        // fprintf(stderr, "PIN 3D - contiguous\n");
        cuda_safe_call(pin_memory(s.data_handle(), s.extent(0) * s.extent(1) * s.extent(2)));
        break;
      default:
        assert(false);
        abort();
    }
  }

  return true;
}

template <typename T, typename... P>
void unpin(mdspan<T, P...>& s)
{
  // We need the rank as a constexpr value
  constexpr auto rank = mdspan<T, P...>::extents_type::rank();

  if constexpr (rank == 0)
  {
    unpin_memory(s.data_handle());
  }
  else if constexpr (rank == 1)
  {
    unpin_memory(s.data_handle());
  }
  else if constexpr (rank == 2)
  {
    switch (contiguous_dims(s))
    {
      case 1:
        for (size_t index_1 = 0; index_1 < s.extent(1); index_1++)
        {
          unpin_memory(&s(0, index_1) + index_1 * s.extent(0));
        }
        break;
      case 2:
        // fprintf(stderr, "PIN 2D - contiguous\n");
        unpin_memory(s.data_handle());
        break;
      default:
        assert(false);
        abort();
    }
  }
  else
  {
    static_assert(rank == 3, "Dimensionality not supported.");
    switch (contiguous_dims(s))
    {
      case 1:
        for (size_t index_2 = 0; index_2 < s.extent(2); index_2++)
        {
          for (size_t index_1 = 0; index_1 < s.extent(1); index_1++)
          {
            // fprintf(stderr, "ADDR %d,%d,0 = %p \n", index_2, index_1, &s(index_2, index_1, 0));
            unpin_memory(&s(0, index_1, index_2));
          }
        }
        break;
      case 2:
        for (size_t index_2 = 0; index_2 < s.extent(2); index_2++)
        {
          unpin_memory(&s(0, 0, index_2));
        }
        break;
      case 3:
        // fprintf(stderr, "PIN 3D - contiguous\n");
        unpin_memory(s.data_handle());
        break;
      default:
        assert(false);
        abort();
    }
  }
}

_CCCL_DIAG_PUSH
_CCCL_DIAG_SUPPRESS_MSVC(4702) // unreachable code

template <typename E, typename X, typename L, typename A, size_t... i>
size_t data_hash([[maybe_unused]] mdspan<E, X, L, A> s, ::std::index_sequence<i...> = ::std::index_sequence<>())
{
  using Slice = mdspan<E, X, L, A>;
  if constexpr (!reserved::has_std_hash_v<E> && !reserved::has_cudastf_hash_v<E>)
  {
    fprintf(stderr, "Error: cannot compute data_hash on a mdspan<E, ...> if ::std::hash<E> is not defined.\n");
    abort();
    return 0;
  }
  else
  {
    if constexpr (sizeof...(i) != Slice::rank())
    {
      return data_hash(s, ::std::make_index_sequence<Slice::rank()>());
    }
    else
    {
      if (s.size() == 0)
      {
        return 0;
      }

      size_t h          = 0;
      auto content_hash = [&](auto... indices) -> void {
        for (;;)
        {
          cuda::experimental::stf::hash_combine(h, s(indices...));

          bool bump = true;
          each_in_pack(
            [&](auto current_dim, size_t& index) {
              if (!bump)
              {
                return;
              }
              ++index;
              if (index >= s.extent(current_dim))
              {
                index = 0;
              }
              else
              {
                bump = false;
              }
            },
            indices...);
          if (bump)
          {
            // Done with all dimensions
            break;
          }
        }
      };
      content_hash((i * 0)...);
      return h;
    }
  }
}
_CCCL_DIAG_POP

template <typename E, typename X, typename L, typename A, size_t... i>
void data_dump([[maybe_unused]] mdspan<E, X, L, A> s,
               ::std::ostream& file        = ::std::cerr,
               ::std::index_sequence<i...> = ::std::index_sequence<>())
{
  using Slice = mdspan<E, X, L, A>;
  if constexpr (reserved::has_ostream_operator<E>::value)
  {
    if constexpr (sizeof...(i) != Slice::rank())
    {
      return data_dump(s, file, ::std::make_index_sequence<Slice::rank()>());
    }
    else
    {
      if (s.size() == 0)
      {
        return;
      }

      auto print_element = [&](auto... indices) -> void {
        for (;;)
        {
          file << s(indices...) << ' ';
          bool bump = true;
          each_in_pack(
            [&](auto current_dim, size_t& index) {
              if (!bump)
              {
                return;
              }
              ++index;
              if (index >= s.extent(current_dim))
              {
                // Done printing last element on the current dimension, move to the next one
                if (current_dim < Slice::rank() - 1)
                {
                  file << "; ";
                }
                index = 0;
              }
              else
              {
                bump = false;
              }
            },
            indices...);
          if (bump)
          {
            // Done with all dimensions
            break;
          }
        }
      };

      print_element((i * 0)...);
      file << '\n';
    }
  }
  else
  {
    ::std::cerr << "Unsupported typed." << ::std::endl;
  }
}

#ifdef UNITTESTED_FILE
UNITTEST("data_dump")
{
  double data[4] = {1, 2, 3, 4};
  auto s         = make_slice(data, 4);
  ::std::cerr << type_name<decltype(s)> << '\n';
  data_dump(s, ::std::cerr);
  auto s1 = make_slice(data, 2, 2);
  ::std::cerr << type_name<decltype(s1)> << '\n';
  data_dump(s1, ::std::cerr);
};

UNITTEST("data_hash")
{
  // Select a random filename
  double data[4] = {1, 2, 3, 4};
  auto s         = make_slice(data, 4);

  double same_data[4] = {1, 2, 3, 4};
  auto same_s         = make_slice(same_data, 4);

  EXPECT(data_hash(s) == data_hash(same_s));

  double other_data[5] = {1, 2, 3, 4, 5};
  auto other_s         = make_slice(other_data, 5);

  EXPECT(data_hash(s) != data_hash(other_s));
};
#endif

// Note this is implementing it in the cudastf namespace, not std::hash
template <typename... P>
struct hash<mdspan<P...>>
{
  ::std::size_t operator()(mdspan<P...> const& s) const noexcept
  {
    static constexpr auto _dimensions = mdspan<P...>::rank();
    // Combine hashes from the ptr, sizes and strides
    size_t h = 0;
    hash_combine(h, s.data_handle());

    if constexpr (_dimensions > 0)
    {
      for (size_t i = 0; i < _dimensions; i++)
      {
        hash_combine(h, s.extent(i));
      }
    }

    if constexpr (_dimensions > 1)
    {
      for (size_t i = 1; i < _dimensions; i++)
      {
        hash_combine(h, s.stride(i));
      }
    }

    return h;
  }
};

#ifdef UNITTESTED_FILE

UNITTEST("slice hash")
{
  double A[5 * 2];
  auto s  = make_slice(A, ::std::tuple{5, 2}, 5);
  auto s2 = make_slice(A, ::std::tuple{4, 2}, 5);

  size_t h  = hash<slice<double, 2>>{}(s);
  size_t h2 = hash<slice<double, 2>>{}(s2);

  EXPECT(h != h2);
};

UNITTEST("slice hash 3D")
{
  double A[5 * 2 * 40];

  // contiguous
  auto s = make_slice(A, ::std::tuple{5, 2, 40}, 5, 5 * 2);

  // non-contiguous
  auto s2 = make_slice(A, ::std::tuple{4, 2, 40}, 5, 5 * 2);

  size_t h  = hash<slice<double, 3>>{}(s);
  size_t h2 = hash<slice<double, 3>>{}(s2);

  EXPECT(h != h2);
};

UNITTEST("shape_of<slice> basics")
{
  using namespace cuda::experimental::stf;
  auto s1 = shape_of<slice<double, 3>>(1, 2, 3);
  auto s2 = shape_of<slice<double, 3>>(::std::tuple(1, 2, 3));
  EXPECT(s1 == s2);
};

#endif // UNITTESTED_FILE

} // namespace cuda::experimental::stf