From 79f9c21116c58291f21487984c747446117341bf Mon Sep 17 00:00:00 2001 From: Daniel Juenger Date: Tue, 14 Apr 2026 16:52:46 -0700 Subject: [PATCH 1/8] Add CCCL patch for 128bit atomics --- CMakeLists.txt | 3 +- cmake/cccl_override.json | 17 ++ cmake/patches/cccl_fix_128bit_cas.patch | 238 ++++++++++++++++++++++++ 3 files changed, 257 insertions(+), 1 deletion(-) create mode 100644 cmake/cccl_override.json create mode 100644 cmake/patches/cccl_fix_128bit_cas.patch diff --git a/CMakeLists.txt b/CMakeLists.txt index 884df20f6..483ee5ba8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -16,6 +16,7 @@ cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR) set(rapids-cmake-version 26.04) +set(rapids-cmake-branch "release/${rapids-cmake-version}") if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/CUCO_RAPIDS.cmake) file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/release/${rapids-cmake-version}/RAPIDS.cmake @@ -74,7 +75,7 @@ rapids_find_package( ################################################################################################### # - find packages we depend on -------------------------------------------------------------------- -rapids_cpm_init() +rapids_cpm_init(OVERRIDE "${CMAKE_CURRENT_SOURCE_DIR}/cmake/cccl_override.json") include(cmake/thirdparty/get_cccl.cmake) diff --git a/cmake/cccl_override.json b/cmake/cccl_override.json new file mode 100644 index 000000000..32dc723ca --- /dev/null +++ b/cmake/cccl_override.json @@ -0,0 +1,17 @@ +{ + "packages": { + "CCCL": { + "version": "3.3.0", + "git_url": "https://github.com/NVIDIA/cccl.git", + "git_tag": "09094af138841ef521de1adbbdd18ab8b3dad47b", + "git_shallow": false, + "patches": [ + { + "file": "${current_json_dir}/patches/cccl_fix_128bit_cas.patch", + "issue": "Fix 128-bit atomic CAS operand indices [https://github.com/NVIDIA/cccl/issues/8402]", + "fixed_in": "3.3.2" + } + ] + } + } +} diff --git a/cmake/patches/cccl_fix_128bit_cas.patch b/cmake/patches/cccl_fix_128bit_cas.patch new file mode 100644 index 000000000..fc17651e4 --- /dev/null +++ b/cmake/patches/cccl_fix_128bit_cas.patch @@ -0,0 +1,238 @@ +From 1898944000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Daniel Juenger +Date: Mon, 14 Apr 2026 00:00:00 +0000 +Subject: [PATCH] Fix codegen in 128bit atomic CAS (#8403) + +Fix wrong inline asm operand indices in all atom.cas.*.b128 variants. +See https://github.com/NVIDIA/cccl/issues/8402 +--- + .../cuda/std/__atomic/functions/cuda_ptx_generated.h | 80 ++++++++++---------- + 1 file changed, 40 insertions(+), 40 deletions(-) + +diff --git a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated.h b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated.h +index f3e30d53039..479815f4136 100644 +--- a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated.h ++++ b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated.h +@@ -1585,8 +1585,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + { + .reg .b128 _d; + .reg .b128 _v; +- mov.b128 _d, {%0, %1}; +- mov.b128 _v, {%4, %5}; ++ mov.b128 _d, {%3, %4}; ++ mov.b128 _v, {%5, %6}; + atom.cas.acquire.cta.b128 _d,[%2],_d,_v; + mov.b128 {%0, %1}, _d; + } +@@ -1604,8 +1604,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + { + .reg .b128 _d; + .reg .b128 _v; +- mov.b128 _d, {%0, %1}; +- mov.b128 _v, {%4, %5}; ++ mov.b128 _d, {%3, %4}; ++ mov.b128 _v, {%5, %6}; + atom.cas.acquire.cluster.b128 _d,[%2],_d,_v; + mov.b128 {%0, %1}, _d; + } +@@ -1623,8 +1623,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + { + .reg .b128 _d; + .reg .b128 _v; +- mov.b128 _d, {%0, %1}; +- mov.b128 _v, {%4, %5}; ++ mov.b128 _d, {%3, %4}; ++ mov.b128 _v, {%5, %6}; + atom.cas.acquire.gpu.b128 _d,[%2],_d,_v; + mov.b128 {%0, %1}, _d; + } +@@ -1642,8 +1642,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + { + .reg .b128 _d; + .reg .b128 _v; +- mov.b128 _d, {%0, %1}; +- mov.b128 _v, {%4, %5}; ++ mov.b128 _d, {%3, %4}; ++ mov.b128 _v, {%5, %6}; + atom.cas.acquire.sys.b128 _d,[%2],_d,_v; + mov.b128 {%0, %1}, _d; + } +@@ -1661,8 +1661,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + { + .reg .b128 _d; + .reg .b128 _v; +- mov.b128 _d, {%0, %1}; +- mov.b128 _v, {%4, %5}; ++ mov.b128 _d, {%3, %4}; ++ mov.b128 _v, {%5, %6}; + atom.cas.relaxed.cta.b128 _d,[%2],_d,_v; + mov.b128 {%0, %1}, _d; + } +@@ -1680,8 +1680,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + { + .reg .b128 _d; + .reg .b128 _v; +- mov.b128 _d, {%0, %1}; +- mov.b128 _v, {%4, %5}; ++ mov.b128 _d, {%3, %4}; ++ mov.b128 _v, {%5, %6}; + atom.cas.relaxed.cluster.b128 _d,[%2],_d,_v; + mov.b128 {%0, %1}, _d; + } +@@ -1699,8 +1699,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + { + .reg .b128 _d; + .reg .b128 _v; +- mov.b128 _d, {%0, %1}; +- mov.b128 _v, {%4, %5}; ++ mov.b128 _d, {%3, %4}; ++ mov.b128 _v, {%5, %6}; + atom.cas.relaxed.gpu.b128 _d,[%2],_d,_v; + mov.b128 {%0, %1}, _d; + } +@@ -1718,8 +1718,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + { + .reg .b128 _d; + .reg .b128 _v; +- mov.b128 _d, {%0, %1}; +- mov.b128 _v, {%4, %5}; ++ mov.b128 _d, {%3, %4}; ++ mov.b128 _v, {%5, %6}; + atom.cas.relaxed.sys.b128 _d,[%2],_d,_v; + mov.b128 {%0, %1}, _d; + } +@@ -1737,8 +1737,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + { + .reg .b128 _d; + .reg .b128 _v; +- mov.b128 _d, {%0, %1}; +- mov.b128 _v, {%4, %5}; ++ mov.b128 _d, {%3, %4}; ++ mov.b128 _v, {%5, %6}; + atom.cas.release.cta.b128 _d,[%2],_d,_v; + mov.b128 {%0, %1}, _d; + } +@@ -1756,8 +1756,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + { + .reg .b128 _d; + .reg .b128 _v; +- mov.b128 _d, {%0, %1}; +- mov.b128 _v, {%4, %5}; ++ mov.b128 _d, {%3, %4}; ++ mov.b128 _v, {%5, %6}; + atom.cas.release.cluster.b128 _d,[%2],_d,_v; + mov.b128 {%0, %1}, _d; + } +@@ -1775,8 +1775,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + { + .reg .b128 _d; + .reg .b128 _v; +- mov.b128 _d, {%0, %1}; +- mov.b128 _v, {%4, %5}; ++ mov.b128 _d, {%3, %4}; ++ mov.b128 _v, {%5, %6}; + atom.cas.release.gpu.b128 _d,[%2],_d,_v; + mov.b128 {%0, %1}, _d; + } +@@ -1794,8 +1794,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + { + .reg .b128 _d; + .reg .b128 _v; +- mov.b128 _d, {%0, %1}; +- mov.b128 _v, {%4, %5}; ++ mov.b128 _d, {%3, %4}; ++ mov.b128 _v, {%5, %6}; + atom.cas.release.sys.b128 _d,[%2],_d,_v; + mov.b128 {%0, %1}, _d; + } +@@ -1813,8 +1813,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + { + .reg .b128 _d; + .reg .b128 _v; +- mov.b128 _d, {%0, %1}; +- mov.b128 _v, {%4, %5}; ++ mov.b128 _d, {%3, %4}; ++ mov.b128 _v, {%5, %6}; + atom.cas.acq_rel.cta.b128 _d,[%2],_d,_v; + mov.b128 {%0, %1}, _d; + } +@@ -1832,8 +1832,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + { + .reg .b128 _d; + .reg .b128 _v; +- mov.b128 _d, {%0, %1}; +- mov.b128 _v, {%4, %5}; ++ mov.b128 _d, {%3, %4}; ++ mov.b128 _v, {%5, %6}; + atom.cas.acq_rel.cluster.b128 _d,[%2],_d,_v; + mov.b128 {%0, %1}, _d; + } +@@ -1851,8 +1851,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + { + .reg .b128 _d; + .reg .b128 _v; +- mov.b128 _d, {%0, %1}; +- mov.b128 _v, {%4, %5}; ++ mov.b128 _d, {%3, %4}; ++ mov.b128 _v, {%5, %6}; + atom.cas.acq_rel.gpu.b128 _d,[%2],_d,_v; + mov.b128 {%0, %1}, _d; + } +@@ -1870,8 +1870,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + { + .reg .b128 _d; + .reg .b128 _v; +- mov.b128 _d, {%0, %1}; +- mov.b128 _v, {%4, %5}; ++ mov.b128 _d, {%3, %4}; ++ mov.b128 _v, {%5, %6}; + atom.cas.acq_rel.sys.b128 _d,[%2],_d,_v; + mov.b128 {%0, %1}, _d; + } +@@ -1889,8 +1889,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + { + .reg .b128 _d; + .reg .b128 _v; +- mov.b128 _d, {%0, %1}; +- mov.b128 _v, {%4, %5}; ++ mov.b128 _d, {%3, %4}; ++ mov.b128 _v, {%5, %6}; + atom.cas.cta.b128 _d,[%2],_d,_v; + mov.b128 {%0, %1}, _d; + } +@@ -1908,8 +1908,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + { + .reg .b128 _d; + .reg .b128 _v; +- mov.b128 _d, {%0, %1}; +- mov.b128 _v, {%4, %5}; ++ mov.b128 _d, {%3, %4}; ++ mov.b128 _v, {%5, %6}; + atom.cas.cluster.b128 _d,[%2],_d,_v; + mov.b128 {%0, %1}, _d; + } +@@ -1927,8 +1927,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + { + .reg .b128 _d; + .reg .b128 _v; +- mov.b128 _d, {%0, %1}; +- mov.b128 _v, {%4, %5}; ++ mov.b128 _d, {%3, %4}; ++ mov.b128 _v, {%5, %6}; + atom.cas.gpu.b128 _d,[%2],_d,_v; + mov.b128 {%0, %1}, _d; + } +@@ -1946,8 +1946,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + { + .reg .b128 _d; + .reg .b128 _v; +- mov.b128 _d, {%0, %1}; +- mov.b128 _v, {%4, %5}; ++ mov.b128 _d, {%3, %4}; ++ mov.b128 _v, {%5, %6}; + atom.cas.sys.b128 _d,[%2],_d,_v; + mov.b128 {%0, %1}, _d; + } +-- +2.45.2 + From c78c8b2cf225f84df2a0b9b7ada85d18c547b5fc Mon Sep 17 00:00:00 2001 From: Daniel Juenger Date: Tue, 14 Apr 2026 18:23:54 -0700 Subject: [PATCH 2/8] Initial 128bit atomic support --- include/cuco/detail/__config | 34 ++ include/cuco/detail/bitwise_compare.cuh | 10 + .../open_addressing/open_addressing_impl.cuh | 6 +- .../open_addressing_ref_impl.cuh | 94 ++++-- include/cuco/detail/pair/helpers.cuh | 13 + include/cuco/detail/pair/pair.inl | 4 +- include/cuco/detail/static_map/static_map.inl | 6 +- include/cuco/static_map.cuh | 6 +- include/cuco/static_map_ref.cuh | 4 +- include/cuco/static_multimap.cuh | 6 +- tests/CMakeLists.txt | 2 + tests/static_map/large_type_test.cu | 291 ++++++++++++++++++ tests/static_set/large_key_type_test.cu | 135 ++++++++ 13 files changed, 575 insertions(+), 36 deletions(-) create mode 100644 tests/static_map/large_type_test.cu create mode 100644 tests/static_set/large_key_type_test.cu diff --git a/include/cuco/detail/__config b/include/cuco/detail/__config index c2cc9066a..1cf231a05 100644 --- a/include/cuco/detail/__config +++ b/include/cuco/detail/__config @@ -19,6 +19,8 @@ #include #include +#include + #if !defined(__CUDACC_VER_MAJOR__) || !defined(__CUDACC_VER_MINOR__) #error "NVCC version not found" #elif __CUDACC_VER_MAJOR__ < 12 @@ -52,6 +54,38 @@ #define CUCO_HAS_INT128 #endif +#if defined(CUCO_HAS_INT128) && (CUCO_CUDA_MINIMUM_ARCH >= 900) +#define CUCO_HAS_128BIT_ATOMICS +#endif + #if defined(CUDART_VERSION) && (CUDART_VERSION >= 12000) #define CUCO_HAS_CG_REDUCE_UPDATE_ASYNC #endif + +namespace cuco::detail { + +/// Maximum supported key size (in bytes) for open-addressing containers. +inline constexpr std::size_t max_key_size = +#if defined(CUCO_HAS_128BIT_ATOMICS) + 16; +#else + 8; +#endif + +/// Maximum supported payload/mapped type size (in bytes) for open-addressing containers. +inline constexpr std::size_t max_payload_size = max_key_size; + +/// Maximum supported slot size (in bytes) for open-addressing containers. +inline constexpr std::size_t max_slot_size = 2 * max_key_size; + +/// Checks if the given size is a valid mapped_type size for packed CAS operations. +inline constexpr bool is_valid_mapped_size(std::size_t n) +{ + return n == 4 or n == 8 +#if defined(CUCO_HAS_128BIT_ATOMICS) + or n == 16 +#endif + ; +} + +} // namespace cuco::detail diff --git a/include/cuco/detail/bitwise_compare.cuh b/include/cuco/detail/bitwise_compare.cuh index 1b9e80f4d..4c4afd411 100644 --- a/include/cuco/detail/bitwise_compare.cuh +++ b/include/cuco/detail/bitwise_compare.cuh @@ -63,6 +63,16 @@ struct bitwise_compare_impl<8> { } }; +template <> +struct bitwise_compare_impl<16> { + __host__ __device__ inline static bool compare(char const* lhs, char const* rhs) + { + return *reinterpret_cast(lhs) == *reinterpret_cast(rhs) and + *reinterpret_cast(lhs + 8) == + *reinterpret_cast(rhs + 8); + } +}; + /** * @brief Gives value to use as alignment for a type that is at least the * size of type, or 16, whichever is smaller. diff --git a/include/cuco/detail/open_addressing/open_addressing_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_impl.cuh index 55d423e97..830d60ff2 100644 --- a/include/cuco/detail/open_addressing/open_addressing_impl.cuh +++ b/include/cuco/detail/open_addressing/open_addressing_impl.cuh @@ -69,9 +69,11 @@ template class open_addressing_impl { - static_assert(sizeof(Key) <= 8, "Container does not support key types larger than 8 bytes."); + static_assert(sizeof(Key) <= cuco::detail::max_key_size, + "Key size exceeds the maximum supported size (8 bytes, or 16 with sm_90+)."); - static_assert(sizeof(Value) <= 16, "Container does not support slot types larger than 16 bytes."); + static_assert(sizeof(Value) <= cuco::detail::max_slot_size, + "Slot size exceeds the maximum supported size (16 bytes, or 32 with sm_90+)."); static_assert( cuco::is_bitwise_comparable_v, diff --git a/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh index b6a11b345..3d792c9b5 100644 --- a/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh +++ b/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh @@ -89,7 +89,8 @@ template class open_addressing_ref_impl { - static_assert(sizeof(Key) <= 8, "Container does not support key types larger than 8 bytes."); + static_assert(sizeof(Key) <= cuco::detail::max_key_size, + "Key size exceeds the maximum supported size (8 bytes, or 16 with sm_90+)."); static_assert( cuco::is_bitwise_comparable_v, @@ -523,9 +524,9 @@ class open_addressing_ref_impl { #if __CUDA_ARCH__ < 700 // Spinning to ensure that the write to the value part took place requires // independent thread scheduling introduced with the Volta architecture. - static_assert( - cuco::detail::is_packable(), - "insert_and_find is not supported for pair types larger than 8 bytes on pre-Volta GPUs."); + static_assert(sizeof(value_type) <= 8, + "insert_and_find is not supported for slot types larger than 8 bytes on " + "pre-Volta GPUs."); #endif auto const val = this->heterogeneous_value(value); @@ -544,25 +545,40 @@ class open_addressing_ref_impl { // If the key is already in the container, return false if (eq_res == detail::equal_result::EQUAL) { - if constexpr (has_payload) { - // wait to ensure that the write to the value part also took place + if constexpr (has_payload and sizeof(value_type) > 8) { +#if (__CUDA_ARCH__ >= 900) + if constexpr (not cuco::detail::is_packable()) { + this->wait_for_payload(slot_ptr->second, this->empty_value_sentinel()); + } +#else this->wait_for_payload(slot_ptr->second, this->empty_value_sentinel()); +#endif } return {iterator{slot_ptr}, false}; } if (eq_res == detail::equal_result::AVAILABLE) { switch (this->attempt_insert_stable(slot_ptr, bucket_slots[i], val)) { case insert_result::SUCCESS: { - if constexpr (has_payload) { - // wait to ensure that the write to the value part also took place + if constexpr (has_payload and sizeof(value_type) > 8) { +#if (__CUDA_ARCH__ >= 900) + if constexpr (not cuco::detail::is_packable()) { + this->wait_for_payload(slot_ptr->second, this->empty_value_sentinel()); + } +#else this->wait_for_payload(slot_ptr->second, this->empty_value_sentinel()); +#endif } return {iterator{slot_ptr}, true}; } case insert_result::DUPLICATE: { - if constexpr (has_payload) { - // wait to ensure that the write to the value part also took place + if constexpr (has_payload and sizeof(value_type) > 8) { +#if (__CUDA_ARCH__ >= 900) + if constexpr (not cuco::detail::is_packable()) { + this->wait_for_payload(slot_ptr->second, this->empty_value_sentinel()); + } +#else this->wait_for_payload(slot_ptr->second, this->empty_value_sentinel()); +#endif } return {iterator{slot_ptr}, false}; } @@ -598,9 +614,9 @@ class open_addressing_ref_impl { #if __CUDA_ARCH__ < 700 // Spinning to ensure that the write to the value part took place requires // independent thread scheduling introduced with the Volta architecture. - static_assert( - cuco::detail::is_packable(), - "insert_and_find is not supported for pair types larger than 8 bytes on pre-Volta GPUs."); + static_assert(sizeof(value_type) <= 8, + "insert_and_find is not supported for slot types larger than 8 bytes on " + "pre-Volta GPUs."); #endif auto const val = this->heterogeneous_value(value); @@ -632,9 +648,14 @@ class open_addressing_ref_impl { auto const src_lane = __ffs(group_finds_equal) - 1; auto const res = group.shfl(reinterpret_cast(slot_ptr), src_lane); if (group.thread_rank() == src_lane) { - if constexpr (has_payload) { - // wait to ensure that the write to the value part also took place + if constexpr (has_payload and sizeof(value_type) > 8) { +#if (__CUDA_ARCH__ >= 900) + if constexpr (not cuco::detail::is_packable()) { + this->wait_for_payload(slot_ptr->second, this->empty_value_sentinel()); + } +#else this->wait_for_payload(slot_ptr->second, this->empty_value_sentinel()); +#endif } } group.sync(); @@ -653,9 +674,14 @@ class open_addressing_ref_impl { switch (group.shfl(status, src_lane)) { case insert_result::SUCCESS: { if (group.thread_rank() == src_lane) { - if constexpr (has_payload) { - // wait to ensure that the write to the value part also took place + if constexpr (has_payload and sizeof(value_type) > 8) { +#if (__CUDA_ARCH__ >= 900) + if constexpr (not cuco::detail::is_packable()) { + this->wait_for_payload(slot_ptr->second, this->empty_value_sentinel()); + } +#else this->wait_for_payload(slot_ptr->second, this->empty_value_sentinel()); +#endif } } group.sync(); @@ -663,9 +689,14 @@ class open_addressing_ref_impl { } case insert_result::DUPLICATE: { if (group.thread_rank() == src_lane) { - if constexpr (has_payload) { - // wait to ensure that the write to the value part also took place + if constexpr (has_payload and sizeof(value_type) > 8) { +#if (__CUDA_ARCH__ >= 900) + if constexpr (not cuco::detail::is_packable()) { + this->wait_for_payload(slot_ptr->second, this->empty_value_sentinel()); + } +#else this->wait_for_payload(slot_ptr->second, this->empty_value_sentinel()); +#endif } } group.sync(); @@ -1729,8 +1760,7 @@ class open_addressing_ref_impl { value_type expected, Value desired) noexcept { - using packed_type = - cuda::std::conditional_t; + using packed_type = cuco::detail::packed_t; auto* slot_ptr = reinterpret_cast(address); auto* expected_ptr = reinterpret_cast(&expected); @@ -1865,7 +1895,16 @@ class open_addressing_ref_impl { { if constexpr (sizeof(value_type) <= 8) { return packed_cas(address, expected, desired); - } else { + } +#if (__CUDA_ARCH__ >= 900) + else if constexpr (cuco::detail::is_packable()) { + return packed_cas(address, expected, desired); + } +#endif + else { + static_assert( + has_payload, + "16-byte key types in key-only containers require sm_90+ for 128-bit atomic CAS."); #if (__CUDA_ARCH__ < 700) return cas_dependent_write(address, expected, desired); #else @@ -1898,7 +1937,16 @@ class open_addressing_ref_impl { { if constexpr (sizeof(value_type) <= 8) { return packed_cas(address, expected, desired); - } else { + } +#if (__CUDA_ARCH__ >= 900) + else if constexpr (cuco::detail::is_packable()) { + return packed_cas(address, expected, desired); + } +#endif + else { + static_assert( + has_payload, + "16-byte key types in key-only containers require sm_90+ for 128-bit atomic CAS."); return cas_dependent_write(address, expected, desired); } } diff --git a/include/cuco/detail/pair/helpers.cuh b/include/cuco/detail/pair/helpers.cuh index 063e1e783..c21cbdb86 100644 --- a/include/cuco/detail/pair/helpers.cuh +++ b/include/cuco/detail/pair/helpers.cuh @@ -15,6 +15,8 @@ #pragma once +#include + #include #include #include @@ -61,6 +63,17 @@ struct packed { using type = uint32_t; ///< Packed type as `uint32_t` if the size of the object is 4 }; +#ifdef CUCO_HAS_INT128 +/** + * @brief Denotes the packed type when the size of the object is 16. + */ +template <> +struct packed<16> { + using type = unsigned __int128; ///< Packed type as `unsigned __int128` if the size of the object + ///< is 16 +}; +#endif + template using packed_t = typename packed::type; diff --git a/include/cuco/detail/pair/pair.inl b/include/cuco/detail/pair/pair.inl index 359829db3..5a1715125 100644 --- a/include/cuco/detail/pair/pair.inl +++ b/include/cuco/detail/pair/pair.inl @@ -23,14 +23,14 @@ namespace cuco { template __host__ __device__ constexpr pair::pair(First const& f, Second const& s) - : first{f}, second{s} + : first(f), second(s) { } template template __host__ __device__ constexpr pair::pair(pair const& p) - : first{p.first}, second{p.second} + : first(p.first), second(p.second) { } diff --git a/include/cuco/detail/static_map/static_map.inl b/include/cuco/detail/static_map/static_map.inl index 9b143d8bb..1d6cbbd67 100644 --- a/include/cuco/detail/static_map/static_map.inl +++ b/include/cuco/detail/static_map/static_map.inl @@ -55,7 +55,7 @@ constexpr static_map>, class Storage = cuco::storage<1>> class static_map { - static_assert(sizeof(Key) <= 8, "Container does not support key types larger than 8 bytes."); + static_assert(sizeof(Key) <= cuco::detail::max_key_size, + "Key size exceeds the maximum supported size (8 bytes, or 16 with sm_90+)."); - static_assert(sizeof(T) <= 8, "Container does not support payload types larger than 8 bytes."); + static_assert(sizeof(T) <= cuco::detail::max_payload_size, + "Payload size exceeds the maximum supported size (8 bytes, or 16 with sm_90+)."); static_assert(cuco::is_bitwise_comparable_v, "Mapped type must have unique object representations or have been explicitly " diff --git a/include/cuco/static_map_ref.cuh b/include/cuco/static_map_ref.cuh index 24525c3bb..be23b6dd4 100644 --- a/include/cuco/static_map_ref.cuh +++ b/include/cuco/static_map_ref.cuh @@ -75,8 +75,8 @@ class static_map_ref using impl_type = detail:: open_addressing_ref_impl; - static_assert(sizeof(T) == 4 or sizeof(T) == 8, - "sizeof(mapped_type) must be either 4 bytes or 8 bytes."); + static_assert(cuco::detail::is_valid_mapped_size(sizeof(T)), + "sizeof(mapped_type) must be 4 or 8 bytes (or 16 with sm_90+)."); static_assert( cuco::is_bitwise_comparable_v, diff --git a/include/cuco/static_multimap.cuh b/include/cuco/static_multimap.cuh index aee2c24bd..957630a77 100644 --- a/include/cuco/static_multimap.cuh +++ b/include/cuco/static_multimap.cuh @@ -91,9 +91,11 @@ template >, class Storage = cuco::storage<2>> class static_multimap { - static_assert(sizeof(Key) <= 8, "Container does not support key types larger than 8 bytes."); + static_assert(sizeof(Key) <= cuco::detail::max_key_size, + "Key size exceeds the maximum supported size (8 bytes, or 16 with sm_90+)."); - static_assert(sizeof(T) <= 8, "Container does not support payload types larger than 8 bytes."); + static_assert(sizeof(T) <= cuco::detail::max_payload_size, + "Payload size exceeds the maximum supported size (8 bytes, or 16 with sm_90+)."); static_assert(cuco::is_bitwise_comparable_v, "Mapped type must have unique object representations or have been explicitly " diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 205b0100b..5806a21f2 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -64,6 +64,7 @@ ConfigureTest(STATIC_SET_TEST static_set/heterogeneous_lookup_test.cu static_set/insert_and_find_test.cu static_set/large_input_test.cu + static_set/large_key_type_test.cu static_set/retrieve_test.cu static_set/retrieve_all_test.cu static_set/retrieve_if_test.cu @@ -88,6 +89,7 @@ ConfigureTest(STATIC_MAP_TEST static_map/insert_or_assign_test.cu static_map/insert_or_apply_test.cu static_map/key_sentinel_test.cu + static_map/large_type_test.cu static_map/shared_memory_test.cu static_map/stream_test.cu static_map/rehash_test.cu diff --git a/tests/static_map/large_type_test.cu b/tests/static_map/large_type_test.cu new file mode 100644 index 000000000..be6fe605c --- /dev/null +++ b/tests/static_map/large_type_test.cu @@ -0,0 +1,291 @@ +/* + * Copyright (c) 2026, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include + +#include +#include +#include + +#include + +struct alignas(16) uint128_t { + cuda::std::uint64_t lo; + cuda::std::uint64_t hi; + + __host__ __device__ constexpr bool operator==(uint128_t const& o) const + { + return lo == o.lo and hi == o.hi; + } + __host__ __device__ constexpr bool operator!=(uint128_t const& o) const { return !(*this == o); } +}; + +CUCO_DECLARE_BITWISE_COMPARABLE(uint128_t) + +using size_type = int32_t; + +TEST_CASE("static_map 128-bit packed CAS", "") +{ + using Key = int64_t; + using Value = int64_t; + using probe = cuco::linear_probing<1, cuco::default_hash_function>; + + constexpr size_type num_keys{400}; + + auto map = cuco::static_map, + cuda::thread_scope_device, + cuda::std::equal_to, + probe, + cuco::cuda_allocator, + cuco::storage<2>>{ + num_keys, cuco::empty_key{-1}, cuco::empty_value{-1}}; + + auto keys_begin = cuda::counting_iterator(1); + auto pairs_begin = cuda::make_transform_iterator( + keys_begin, cuda::proclaim_return_type>([] __device__(Key const& x) { + return cuco::pair(x, static_cast(x)); + })); + + thrust::device_vector d_contained(num_keys); + + SECTION("insert + contains") + { + auto const inserted = map.insert(pairs_begin, pairs_begin + num_keys); + REQUIRE(inserted == num_keys); + REQUIRE(map.size() == num_keys); + + map.contains(keys_begin, keys_begin + num_keys, d_contained.begin()); + REQUIRE(cuco::test::all_of(d_contained.begin(), d_contained.end(), cuda::std::identity{})); + } + + SECTION("insert + find") + { + map.insert(pairs_begin, pairs_begin + num_keys); + + thrust::device_vector d_results(num_keys); + map.find(keys_begin, keys_begin + num_keys, d_results.begin()); + + auto zip_equal = cuda::proclaim_return_type( + [] __device__(auto const& p) { return cuda::std::get<0>(p) == cuda::std::get<1>(p); }); + auto zip = thrust::make_zip_iterator( + cuda::std::tuple{d_results.begin(), + cuda::make_transform_iterator( + keys_begin, cuda::proclaim_return_type([] __device__(Key const& x) { + return static_cast(x); + }))}); + REQUIRE(cuco::test::all_of(zip, zip + num_keys, zip_equal)); + } + + SECTION("insert + erase + re-insert") + { + auto erase_map = cuco::static_map, + cuda::thread_scope_device, + cuda::std::equal_to, + probe, + cuco::cuda_allocator, + cuco::storage<2>>{num_keys * 2, + cuco::empty_key{-1}, + cuco::empty_value{-1}, + cuco::erased_key{-2}}; + + erase_map.insert(pairs_begin, pairs_begin + num_keys); + REQUIRE(erase_map.size() == num_keys); + + erase_map.erase(keys_begin, keys_begin + num_keys); + REQUIRE(erase_map.size() == 0); + + erase_map.insert(pairs_begin, pairs_begin + num_keys); + REQUIRE(erase_map.size() == num_keys); + + erase_map.contains(keys_begin, keys_begin + num_keys, d_contained.begin()); + REQUIRE(cuco::test::all_of(d_contained.begin(), d_contained.end(), cuda::std::identity{})); + } + + SECTION("insert_and_find") + { + thrust::device_vector found1(num_keys); + thrust::device_vector inserted(num_keys); + + map.insert_and_find(pairs_begin, pairs_begin + num_keys, found1.begin(), inserted.begin()); + REQUIRE(cuco::test::all_of(inserted.begin(), inserted.end(), cuda::std::identity{})); + + thrust::device_vector found2(num_keys); + map.insert_and_find(pairs_begin, pairs_begin + num_keys, found2.begin(), inserted.begin()); + REQUIRE(cuco::test::none_of(inserted.begin(), inserted.end(), cuda::std::identity{})); + + REQUIRE( + cuco::test::equal(found1.begin(), found1.end(), found2.begin(), cuda::std::equal_to{})); + } +} + +#if defined(CUCO_HAS_128BIT_ATOMICS) + +TEST_CASE("static_map 128-bit key b2b CAS", "") +{ + using Key = uint128_t; + using Value = int64_t; + using probe = cuco::linear_probing<1, cuco::default_hash_function>; + + constexpr size_type num_keys{400}; + + Key const empty_key{~0ULL, ~0ULL}; + Key const erased_key{~0ULL - 1, ~0ULL}; + + auto map = cuco::static_map, + cuda::thread_scope_device, + cuda::std::equal_to, + probe, + cuco::cuda_allocator, + cuco::storage<2>>{num_keys * 2, + cuco::empty_key{empty_key}, + cuco::empty_value{-1}, + cuco::erased_key{erased_key}}; + + auto keys_begin = cuda::make_transform_iterator( + cuda::counting_iterator(1), + cuda::proclaim_return_type( + [] __device__(size_type i) -> Key { return Key{static_cast(i), 0}; })); + + auto pairs_begin = cuda::make_transform_iterator( + cuda::counting_iterator(1), + cuda::proclaim_return_type>([] __device__(size_type i) { + return cuco::pair{Key{static_cast(i), 0}, + static_cast(i)}; + })); + + thrust::device_vector d_contained(num_keys); + + SECTION("insert + contains") + { + auto const inserted = map.insert(pairs_begin, pairs_begin + num_keys); + REQUIRE(inserted == num_keys); + + map.contains(keys_begin, keys_begin + num_keys, d_contained.begin()); + REQUIRE(cuco::test::all_of(d_contained.begin(), d_contained.end(), cuda::std::identity{})); + } + + SECTION("insert + find") + { + map.insert(pairs_begin, pairs_begin + num_keys); + + thrust::device_vector d_results(num_keys); + map.find(keys_begin, keys_begin + num_keys, d_results.begin()); + + auto zip_equal = cuda::proclaim_return_type( + [] __device__(auto const& p) { return cuda::std::get<0>(p) == cuda::std::get<1>(p); }); + auto gold = + cuda::make_transform_iterator(cuda::counting_iterator(1), + cuda::proclaim_return_type([] __device__(size_type i) { + return static_cast(i); + })); + auto zip = thrust::make_zip_iterator(cuda::std::tuple{d_results.begin(), gold}); + REQUIRE(cuco::test::all_of(zip, zip + num_keys, zip_equal)); + } + + SECTION("insert + erase + re-insert") + { + map.insert(pairs_begin, pairs_begin + num_keys); + REQUIRE(map.size() == num_keys); + + map.erase(keys_begin, keys_begin + num_keys); + REQUIRE(map.size() == 0); + + map.contains(keys_begin, keys_begin + num_keys, d_contained.begin()); + REQUIRE(cuco::test::none_of(d_contained.begin(), d_contained.end(), cuda::std::identity{})); + + map.insert(pairs_begin, pairs_begin + num_keys); + REQUIRE(map.size() == num_keys); + + map.contains(keys_begin, keys_begin + num_keys, d_contained.begin()); + REQUIRE(cuco::test::all_of(d_contained.begin(), d_contained.end(), cuda::std::identity{})); + } +} + +TEST_CASE("static_map 128-bit key and value", "") +{ + using Key = uint128_t; + using Value = uint128_t; + using probe = cuco::linear_probing<1, cuco::default_hash_function>; + + constexpr size_type num_keys{400}; + + Key const empty_key{~0ULL, ~0ULL}; + Value const empty_value{~0ULL, ~0ULL}; + + auto map = cuco::static_map, + cuda::thread_scope_device, + cuda::std::equal_to, + probe, + cuco::cuda_allocator, + cuco::storage<2>>{ + num_keys, cuco::empty_key{empty_key}, cuco::empty_value{empty_value}}; + + auto keys_begin = cuda::make_transform_iterator( + cuda::counting_iterator(1), + cuda::proclaim_return_type( + [] __device__(size_type i) -> Key { return Key{static_cast(i), 0}; })); + + auto pairs_begin = cuda::make_transform_iterator( + cuda::counting_iterator(1), + cuda::proclaim_return_type>([] __device__(size_type i) { + return cuco::pair{Key{static_cast(i), 0}, + Value{static_cast(i * 10), 0}}; + })); + + thrust::device_vector d_contained(num_keys); + + SECTION("insert + contains") + { + auto const inserted = map.insert(pairs_begin, pairs_begin + num_keys); + REQUIRE(inserted == num_keys); + REQUIRE(map.size() == num_keys); + + map.contains(keys_begin, keys_begin + num_keys, d_contained.begin()); + REQUIRE(cuco::test::all_of(d_contained.begin(), d_contained.end(), cuda::std::identity{})); + } + + SECTION("insert + find") + { + map.insert(pairs_begin, pairs_begin + num_keys); + + thrust::device_vector d_results(num_keys); + map.find(keys_begin, keys_begin + num_keys, d_results.begin()); + + auto zip_equal = cuda::proclaim_return_type([] __device__(auto const& p) { + return static_cast(cuda::std::get<0>(p)) == static_cast(cuda::std::get<1>(p)); + }); + auto gold = cuda::make_transform_iterator( + cuda::counting_iterator(1), + cuda::proclaim_return_type([] __device__(size_type i) -> Value { + return Value{static_cast(i * 10), 0}; + })); + auto zip = thrust::make_zip_iterator(cuda::std::tuple{d_results.begin(), gold}); + REQUIRE(cuco::test::all_of(zip, zip + num_keys, zip_equal)); + } +} + +#endif // CUCO_HAS_128BIT_ATOMICS diff --git a/tests/static_set/large_key_type_test.cu b/tests/static_set/large_key_type_test.cu new file mode 100644 index 000000000..6212f0087 --- /dev/null +++ b/tests/static_set/large_key_type_test.cu @@ -0,0 +1,135 @@ +/* + * Copyright (c) 2026, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#if defined(CUCO_HAS_128BIT_ATOMICS) + +#include + +#include + +#include +#include +#include +#include + +#include + +struct alignas(16) uint128_t { + cuda::std::uint64_t lo; + cuda::std::uint64_t hi; + + __host__ __device__ constexpr bool operator==(uint128_t const& o) const + { + return lo == o.lo and hi == o.hi; + } + __host__ __device__ constexpr bool operator!=(uint128_t const& o) const { return !(*this == o); } +}; + +CUCO_DECLARE_BITWISE_COMPARABLE(uint128_t) + +TEST_CASE("static_set 128-bit key unique sequence", "") +{ + using Key = uint128_t; + + constexpr std::size_t num_keys{400}; + + Key const empty_sentinel{~0ULL, ~0ULL}; + + using probe = cuco::linear_probing<1, cuco::default_hash_function>; + + auto set = cuco::static_set{ + num_keys, cuco::empty_key{empty_sentinel}, {}, probe{}, {}, cuco::storage<2>{}}; + + auto keys_begin = cuda::make_transform_iterator( + cuda::counting_iterator(0), + cuda::proclaim_return_type( + [] __device__(std::size_t i) -> Key { return Key{static_cast(i), 0}; })); + + thrust::device_vector d_contained(num_keys); + + SECTION("Non-inserted keys should not be contained.") + { + REQUIRE(set.size() == 0); + + set.contains(keys_begin, keys_begin + num_keys, d_contained.begin()); + REQUIRE(cuco::test::none_of(d_contained.begin(), d_contained.end(), cuda::std::identity{})); + } + + SECTION("All inserted keys should be contained.") + { + auto const inserted = set.insert(keys_begin, keys_begin + num_keys); + REQUIRE(inserted == num_keys); + REQUIRE(set.size() == num_keys); + + set.contains(keys_begin, keys_begin + num_keys, d_contained.begin()); + REQUIRE(cuco::test::all_of(d_contained.begin(), d_contained.end(), cuda::std::identity{})); + } + + SECTION("All inserted keys should be correctly recovered during find") + { + set.insert(keys_begin, keys_begin + num_keys); + + thrust::device_vector d_results(num_keys); + set.find(keys_begin, keys_begin + num_keys, d_results.begin()); + + auto zip_equal = cuda::proclaim_return_type([] __device__(auto const& p) { + return static_cast(cuda::std::get<0>(p)) == static_cast(cuda::std::get<1>(p)); + }); + auto zip = thrust::make_zip_iterator(cuda::std::tuple{d_results.begin(), keys_begin}); + + REQUIRE(cuco::test::all_of(zip, zip + num_keys, zip_equal)); + } +} + +TEST_CASE("static_set 128-bit key insert_and_find", "") +{ + using Key = uint128_t; + + constexpr std::size_t num_keys{400}; + + Key const empty_sentinel{~0ULL, ~0ULL}; + + using probe = cuco::linear_probing<1, cuco::default_hash_function>; + + auto set = cuco::static_set{ + num_keys, cuco::empty_key{empty_sentinel}, {}, probe{}, {}, cuco::storage<2>{}}; + + auto keys_begin = cuda::make_transform_iterator( + cuda::counting_iterator(0), + cuda::proclaim_return_type( + [] __device__(std::size_t i) -> Key { return Key{static_cast(i), 0}; })); + + thrust::device_vector iters1(num_keys); + thrust::device_vector inserted(num_keys); + + // insert first time, fills inserted with true + set.insert_and_find(keys_begin, keys_begin + num_keys, iters1.begin(), inserted.begin()); + REQUIRE(cuco::test::all_of(inserted.begin(), inserted.end(), cuda::std::identity{})); + + // insert second time, fills inserted with false as keys already in set + thrust::device_vector iters2(num_keys); + set.insert_and_find(keys_begin, keys_begin + num_keys, iters2.begin(), inserted.begin()); + REQUIRE(cuco::test::none_of(inserted.begin(), inserted.end(), cuda::std::identity{})); + + // both iters1 and iters2 should be same, as keys will be referring to same slot + auto equal_fn = cuda::proclaim_return_type( + [] __device__(auto const& a, auto const& b) { return a == b; }); + REQUIRE(cuco::test::equal(iters1.begin(), iters1.end(), iters2.begin(), equal_fn)); +} + +#endif // CUCO_HAS_128BIT_ATOMICS From 640f775e1d891d5b695e203544ddcc349b0439a4 Mon Sep 17 00:00:00 2001 From: Daniel Juenger Date: Fri, 17 Apr 2026 00:49:29 +0000 Subject: [PATCH 3/8] Update tests --- tests/CMakeLists.txt | 2 - tests/static_map/contains_test.cu | 11 +- tests/static_map/duplicate_keys_test.cu | 11 +- tests/static_map/erase_test.cu | 11 +- tests/static_map/find_test.cu | 11 +- tests/static_map/for_each_test.cu | 11 +- tests/static_map/hash_test.cu | 12 +- tests/static_map/insert_and_find_test.cu | 11 +- tests/static_map/insert_or_apply_test.cu | 23 +- tests/static_map/insert_or_assign_test.cu | 11 +- tests/static_map/key_sentinel_test.cu | 12 +- tests/static_map/large_type_test.cu | 291 ------------------ tests/static_map/retrieve_if_test.cu | 8 +- tests/static_map/retrieve_test.cu | 11 +- tests/static_map/stream_test.cu | 8 +- tests/static_multimap/count_test.cu | 11 +- tests/static_multimap/find_test.cu | 11 +- tests/static_multimap/for_each_test.cu | 11 +- tests/static_multimap/insert_contains_test.cu | 11 +- tests/static_multimap/insert_if_test.cu | 11 +- tests/static_multimap/multiplicity_test.cu | 11 +- tests/static_multimap/retrieve_if_test.cu | 8 +- tests/static_multimap/retrieve_test.cu | 11 +- tests/static_multiset/contains_test.cu | 11 +- tests/static_multiset/count_test.cu | 11 +- tests/static_multiset/custom_count_test.cu | 11 +- tests/static_multiset/find_test.cu | 11 +- tests/static_multiset/for_each_test.cu | 11 +- tests/static_multiset/insert_test.cu | 11 +- tests/static_multiset/load_factor_test.cu | 11 +- tests/static_multiset/retrieve_if_test.cu | 13 +- tests/static_multiset/retrieve_test.cu | 11 +- tests/static_multiset/stream_test.cu | 8 +- tests/static_set/for_each_test.cu | 13 +- tests/static_set/insert_and_find_test.cu | 15 +- tests/static_set/large_key_type_test.cu | 135 -------- tests/static_set/retrieve_all_test.cu | 14 +- tests/static_set/retrieve_if_test.cu | 14 +- tests/static_set/retrieve_test.cu | 18 +- tests/static_set/stream_test.cu | 10 +- tests/static_set/unique_sequence_test.cu | 19 +- 41 files changed, 397 insertions(+), 479 deletions(-) delete mode 100644 tests/static_map/large_type_test.cu delete mode 100644 tests/static_set/large_key_type_test.cu diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 5806a21f2..205b0100b 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -64,7 +64,6 @@ ConfigureTest(STATIC_SET_TEST static_set/heterogeneous_lookup_test.cu static_set/insert_and_find_test.cu static_set/large_input_test.cu - static_set/large_key_type_test.cu static_set/retrieve_test.cu static_set/retrieve_all_test.cu static_set/retrieve_if_test.cu @@ -89,7 +88,6 @@ ConfigureTest(STATIC_MAP_TEST static_map/insert_or_assign_test.cu static_map/insert_or_apply_test.cu static_map/key_sentinel_test.cu - static_map/large_type_test.cu static_map/shared_memory_test.cu static_map/stream_test.cu static_map/rehash_test.cu diff --git a/tests/static_map/contains_test.cu b/tests/static_map/contains_test.cu index e53218933..2cbe5372b 100644 --- a/tests/static_map/contains_test.cu +++ b/tests/static_map/contains_test.cu @@ -16,6 +16,7 @@ #include +#include #include #include @@ -130,7 +131,15 @@ TEMPLATE_TEST_CASE_SIG( (int32_t, int32_t, cuco::test::probe_sequence::linear_probing, 1), (int32_t, int32_t, cuco::test::probe_sequence::linear_probing, 2), (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 1), - (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2)) + (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (__int128_t, __int128_t, cuco::test::probe_sequence::double_hashing, 1), + (__int128_t, __int128_t, cuco::test::probe_sequence::double_hashing, 2), + (__int128_t, __int128_t, cuco::test::probe_sequence::linear_probing, 1), + (__int128_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2) +#endif +) { constexpr size_type num_keys{400}; diff --git a/tests/static_map/duplicate_keys_test.cu b/tests/static_map/duplicate_keys_test.cu index 397af6a4e..7e5deed94 100644 --- a/tests/static_map/duplicate_keys_test.cu +++ b/tests/static_map/duplicate_keys_test.cu @@ -16,6 +16,7 @@ #include +#include #include #include @@ -52,7 +53,15 @@ TEMPLATE_TEST_CASE_SIG( (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 1), (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 1), (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 2), - (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2)) + (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (__int128_t, __int128_t, cuco::test::probe_sequence::double_hashing, 1), + (__int128_t, __int128_t, cuco::test::probe_sequence::double_hashing, 2), + (__int128_t, __int128_t, cuco::test::probe_sequence::linear_probing, 1), + (__int128_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2) +#endif +) { constexpr size_type num_keys{500'000}; diff --git a/tests/static_map/erase_test.cu b/tests/static_map/erase_test.cu index 518b4fe14..278093d16 100644 --- a/tests/static_map/erase_test.cu +++ b/tests/static_map/erase_test.cu @@ -16,6 +16,7 @@ #include +#include #include #include @@ -101,7 +102,15 @@ TEMPLATE_TEST_CASE_SIG( (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 1), (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 1), (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 2), - (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2)) + (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (__int128_t, __int128_t, cuco::test::probe_sequence::double_hashing, 1), + (__int128_t, __int128_t, cuco::test::probe_sequence::double_hashing, 2), + (__int128_t, __int128_t, cuco::test::probe_sequence::linear_probing, 1), + (__int128_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2) +#endif +) { constexpr size_type num_keys{1'000'000}; diff --git a/tests/static_map/find_test.cu b/tests/static_map/find_test.cu index d26c658aa..94d4b26fb 100644 --- a/tests/static_map/find_test.cu +++ b/tests/static_map/find_test.cu @@ -16,6 +16,7 @@ #include +#include #include #include @@ -152,7 +153,15 @@ TEMPLATE_TEST_CASE_SIG( (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 1), (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 1), (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 2), - (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2)) + (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (__int128_t, __int128_t, cuco::test::probe_sequence::double_hashing, 1), + (__int128_t, __int128_t, cuco::test::probe_sequence::double_hashing, 2), + (__int128_t, __int128_t, cuco::test::probe_sequence::linear_probing, 1), + (__int128_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2) +#endif +) { constexpr size_type num_keys{301}; diff --git a/tests/static_map/for_each_test.cu b/tests/static_map/for_each_test.cu index 3ce9d9f59..8832ffa84 100644 --- a/tests/static_map/for_each_test.cu +++ b/tests/static_map/for_each_test.cu @@ -16,6 +16,7 @@ #include +#include #include #include @@ -99,7 +100,15 @@ TEMPLATE_TEST_CASE_SIG( (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 1), (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 1), (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 2), - (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2)) + (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (__int128_t, __int128_t, cuco::test::probe_sequence::double_hashing, 1), + (__int128_t, __int128_t, cuco::test::probe_sequence::double_hashing, 2), + (__int128_t, __int128_t, cuco::test::probe_sequence::linear_probing, 1), + (__int128_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2) +#endif +) { constexpr size_type num_keys{100}; using probe = std::conditional_t< diff --git a/tests/static_map/hash_test.cu b/tests/static_map/hash_test.cu index 8acf90b9c..b4f529418 100644 --- a/tests/static_map/hash_test.cu +++ b/tests/static_map/hash_test.cu @@ -16,6 +16,7 @@ #include +#include #include #include @@ -62,7 +63,16 @@ void test_hash_function() REQUIRE(cuco::test::all_of(d_keys_exist.begin(), d_keys_exist.end(), cuda::std::identity{})); } -TEMPLATE_TEST_CASE_SIG("static_map hash tests", "", ((typename Key)), (int32_t), (int64_t)) +TEMPLATE_TEST_CASE_SIG("static_map hash tests", + "", + ((typename Key)), + (int32_t), + (int64_t) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (__int128_t) +#endif +) { test_hash_function>(); test_hash_function>(); diff --git a/tests/static_map/insert_and_find_test.cu b/tests/static_map/insert_and_find_test.cu index cc978ff5b..5b5dde17a 100644 --- a/tests/static_map/insert_and_find_test.cu +++ b/tests/static_map/insert_and_find_test.cu @@ -17,6 +17,7 @@ #include +#include #include #include @@ -51,7 +52,15 @@ TEMPLATE_TEST_CASE_SIG( (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 1), (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 1), (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 2), - (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2)) + (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (__int128_t, __int128_t, cuco::test::probe_sequence::double_hashing, 1), + (__int128_t, __int128_t, cuco::test::probe_sequence::double_hashing, 2), + (__int128_t, __int128_t, cuco::test::probe_sequence::linear_probing, 1), + (__int128_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2) +#endif +) { #if !defined(CUCO_HAS_INDEPENDENT_THREADS) if constexpr (cuco::detail::is_packable>()) diff --git a/tests/static_map/insert_or_apply_test.cu b/tests/static_map/insert_or_apply_test.cu index e14200d5a..b3b2a233d 100644 --- a/tests/static_map/insert_or_apply_test.cu +++ b/tests/static_map/insert_or_apply_test.cu @@ -16,6 +16,7 @@ #include +#include #include #include @@ -160,7 +161,15 @@ TEMPLATE_TEST_CASE_SIG( (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 1), (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 1), (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 2), - (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2)) + (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (__int128_t, __int128_t, cuco::test::probe_sequence::double_hashing, 1), + (__int128_t, __int128_t, cuco::test::probe_sequence::double_hashing, 2), + (__int128_t, __int128_t, cuco::test::probe_sequence::linear_probing, 1), + (__int128_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2) +#endif +) { constexpr size_type num_keys{10'000}; constexpr size_type num_unique_keys{100}; @@ -201,8 +210,16 @@ TEMPLATE_TEST_CASE_SIG( } } -TEMPLATE_TEST_CASE_SIG( - "static_map insert_or_apply all unique keys tests", "", ((typename Key)), (int32_t), (int64_t)) +TEMPLATE_TEST_CASE_SIG("static_map insert_or_apply all unique keys tests", + "", + ((typename Key)), + (int32_t), + (int64_t) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (__int128_t) +#endif +) { using Value = Key; diff --git a/tests/static_map/insert_or_assign_test.cu b/tests/static_map/insert_or_assign_test.cu index 273fb6e2f..8bd3e0bc0 100644 --- a/tests/static_map/insert_or_assign_test.cu +++ b/tests/static_map/insert_or_assign_test.cu @@ -16,6 +16,7 @@ #include +#include #include #include @@ -92,7 +93,15 @@ TEMPLATE_TEST_CASE_SIG( (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 1), (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 1), (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 2), - (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2)) + (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (__int128_t, __int128_t, cuco::test::probe_sequence::double_hashing, 1), + (__int128_t, __int128_t, cuco::test::probe_sequence::double_hashing, 2), + (__int128_t, __int128_t, cuco::test::probe_sequence::linear_probing, 1), + (__int128_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2) +#endif +) { constexpr size_type num_keys{400}; diff --git a/tests/static_map/key_sentinel_test.cu b/tests/static_map/key_sentinel_test.cu index b58145896..edfa499b6 100644 --- a/tests/static_map/key_sentinel_test.cu +++ b/tests/static_map/key_sentinel_test.cu @@ -16,6 +16,7 @@ #include +#include #include #include @@ -32,7 +33,16 @@ struct custom_equals { __device__ bool operator()(T lhs, T rhs) const { return A[lhs] == A[rhs]; } }; -TEMPLATE_TEST_CASE_SIG("static_map key sentinel tests", "", ((typename T), T), (int32_t), (int64_t)) +TEMPLATE_TEST_CASE_SIG("static_map key sentinel tests", + "", + ((typename T), T), + (int32_t), + (int64_t) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (__int128_t) +#endif +) { using Key = T; using Value = T; diff --git a/tests/static_map/large_type_test.cu b/tests/static_map/large_type_test.cu deleted file mode 100644 index be6fe605c..000000000 --- a/tests/static_map/large_type_test.cu +++ /dev/null @@ -1,291 +0,0 @@ -/* - * Copyright (c) 2026, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include - -#include -#include - -#include -#include -#include - -#include - -struct alignas(16) uint128_t { - cuda::std::uint64_t lo; - cuda::std::uint64_t hi; - - __host__ __device__ constexpr bool operator==(uint128_t const& o) const - { - return lo == o.lo and hi == o.hi; - } - __host__ __device__ constexpr bool operator!=(uint128_t const& o) const { return !(*this == o); } -}; - -CUCO_DECLARE_BITWISE_COMPARABLE(uint128_t) - -using size_type = int32_t; - -TEST_CASE("static_map 128-bit packed CAS", "") -{ - using Key = int64_t; - using Value = int64_t; - using probe = cuco::linear_probing<1, cuco::default_hash_function>; - - constexpr size_type num_keys{400}; - - auto map = cuco::static_map, - cuda::thread_scope_device, - cuda::std::equal_to, - probe, - cuco::cuda_allocator, - cuco::storage<2>>{ - num_keys, cuco::empty_key{-1}, cuco::empty_value{-1}}; - - auto keys_begin = cuda::counting_iterator(1); - auto pairs_begin = cuda::make_transform_iterator( - keys_begin, cuda::proclaim_return_type>([] __device__(Key const& x) { - return cuco::pair(x, static_cast(x)); - })); - - thrust::device_vector d_contained(num_keys); - - SECTION("insert + contains") - { - auto const inserted = map.insert(pairs_begin, pairs_begin + num_keys); - REQUIRE(inserted == num_keys); - REQUIRE(map.size() == num_keys); - - map.contains(keys_begin, keys_begin + num_keys, d_contained.begin()); - REQUIRE(cuco::test::all_of(d_contained.begin(), d_contained.end(), cuda::std::identity{})); - } - - SECTION("insert + find") - { - map.insert(pairs_begin, pairs_begin + num_keys); - - thrust::device_vector d_results(num_keys); - map.find(keys_begin, keys_begin + num_keys, d_results.begin()); - - auto zip_equal = cuda::proclaim_return_type( - [] __device__(auto const& p) { return cuda::std::get<0>(p) == cuda::std::get<1>(p); }); - auto zip = thrust::make_zip_iterator( - cuda::std::tuple{d_results.begin(), - cuda::make_transform_iterator( - keys_begin, cuda::proclaim_return_type([] __device__(Key const& x) { - return static_cast(x); - }))}); - REQUIRE(cuco::test::all_of(zip, zip + num_keys, zip_equal)); - } - - SECTION("insert + erase + re-insert") - { - auto erase_map = cuco::static_map, - cuda::thread_scope_device, - cuda::std::equal_to, - probe, - cuco::cuda_allocator, - cuco::storage<2>>{num_keys * 2, - cuco::empty_key{-1}, - cuco::empty_value{-1}, - cuco::erased_key{-2}}; - - erase_map.insert(pairs_begin, pairs_begin + num_keys); - REQUIRE(erase_map.size() == num_keys); - - erase_map.erase(keys_begin, keys_begin + num_keys); - REQUIRE(erase_map.size() == 0); - - erase_map.insert(pairs_begin, pairs_begin + num_keys); - REQUIRE(erase_map.size() == num_keys); - - erase_map.contains(keys_begin, keys_begin + num_keys, d_contained.begin()); - REQUIRE(cuco::test::all_of(d_contained.begin(), d_contained.end(), cuda::std::identity{})); - } - - SECTION("insert_and_find") - { - thrust::device_vector found1(num_keys); - thrust::device_vector inserted(num_keys); - - map.insert_and_find(pairs_begin, pairs_begin + num_keys, found1.begin(), inserted.begin()); - REQUIRE(cuco::test::all_of(inserted.begin(), inserted.end(), cuda::std::identity{})); - - thrust::device_vector found2(num_keys); - map.insert_and_find(pairs_begin, pairs_begin + num_keys, found2.begin(), inserted.begin()); - REQUIRE(cuco::test::none_of(inserted.begin(), inserted.end(), cuda::std::identity{})); - - REQUIRE( - cuco::test::equal(found1.begin(), found1.end(), found2.begin(), cuda::std::equal_to{})); - } -} - -#if defined(CUCO_HAS_128BIT_ATOMICS) - -TEST_CASE("static_map 128-bit key b2b CAS", "") -{ - using Key = uint128_t; - using Value = int64_t; - using probe = cuco::linear_probing<1, cuco::default_hash_function>; - - constexpr size_type num_keys{400}; - - Key const empty_key{~0ULL, ~0ULL}; - Key const erased_key{~0ULL - 1, ~0ULL}; - - auto map = cuco::static_map, - cuda::thread_scope_device, - cuda::std::equal_to, - probe, - cuco::cuda_allocator, - cuco::storage<2>>{num_keys * 2, - cuco::empty_key{empty_key}, - cuco::empty_value{-1}, - cuco::erased_key{erased_key}}; - - auto keys_begin = cuda::make_transform_iterator( - cuda::counting_iterator(1), - cuda::proclaim_return_type( - [] __device__(size_type i) -> Key { return Key{static_cast(i), 0}; })); - - auto pairs_begin = cuda::make_transform_iterator( - cuda::counting_iterator(1), - cuda::proclaim_return_type>([] __device__(size_type i) { - return cuco::pair{Key{static_cast(i), 0}, - static_cast(i)}; - })); - - thrust::device_vector d_contained(num_keys); - - SECTION("insert + contains") - { - auto const inserted = map.insert(pairs_begin, pairs_begin + num_keys); - REQUIRE(inserted == num_keys); - - map.contains(keys_begin, keys_begin + num_keys, d_contained.begin()); - REQUIRE(cuco::test::all_of(d_contained.begin(), d_contained.end(), cuda::std::identity{})); - } - - SECTION("insert + find") - { - map.insert(pairs_begin, pairs_begin + num_keys); - - thrust::device_vector d_results(num_keys); - map.find(keys_begin, keys_begin + num_keys, d_results.begin()); - - auto zip_equal = cuda::proclaim_return_type( - [] __device__(auto const& p) { return cuda::std::get<0>(p) == cuda::std::get<1>(p); }); - auto gold = - cuda::make_transform_iterator(cuda::counting_iterator(1), - cuda::proclaim_return_type([] __device__(size_type i) { - return static_cast(i); - })); - auto zip = thrust::make_zip_iterator(cuda::std::tuple{d_results.begin(), gold}); - REQUIRE(cuco::test::all_of(zip, zip + num_keys, zip_equal)); - } - - SECTION("insert + erase + re-insert") - { - map.insert(pairs_begin, pairs_begin + num_keys); - REQUIRE(map.size() == num_keys); - - map.erase(keys_begin, keys_begin + num_keys); - REQUIRE(map.size() == 0); - - map.contains(keys_begin, keys_begin + num_keys, d_contained.begin()); - REQUIRE(cuco::test::none_of(d_contained.begin(), d_contained.end(), cuda::std::identity{})); - - map.insert(pairs_begin, pairs_begin + num_keys); - REQUIRE(map.size() == num_keys); - - map.contains(keys_begin, keys_begin + num_keys, d_contained.begin()); - REQUIRE(cuco::test::all_of(d_contained.begin(), d_contained.end(), cuda::std::identity{})); - } -} - -TEST_CASE("static_map 128-bit key and value", "") -{ - using Key = uint128_t; - using Value = uint128_t; - using probe = cuco::linear_probing<1, cuco::default_hash_function>; - - constexpr size_type num_keys{400}; - - Key const empty_key{~0ULL, ~0ULL}; - Value const empty_value{~0ULL, ~0ULL}; - - auto map = cuco::static_map, - cuda::thread_scope_device, - cuda::std::equal_to, - probe, - cuco::cuda_allocator, - cuco::storage<2>>{ - num_keys, cuco::empty_key{empty_key}, cuco::empty_value{empty_value}}; - - auto keys_begin = cuda::make_transform_iterator( - cuda::counting_iterator(1), - cuda::proclaim_return_type( - [] __device__(size_type i) -> Key { return Key{static_cast(i), 0}; })); - - auto pairs_begin = cuda::make_transform_iterator( - cuda::counting_iterator(1), - cuda::proclaim_return_type>([] __device__(size_type i) { - return cuco::pair{Key{static_cast(i), 0}, - Value{static_cast(i * 10), 0}}; - })); - - thrust::device_vector d_contained(num_keys); - - SECTION("insert + contains") - { - auto const inserted = map.insert(pairs_begin, pairs_begin + num_keys); - REQUIRE(inserted == num_keys); - REQUIRE(map.size() == num_keys); - - map.contains(keys_begin, keys_begin + num_keys, d_contained.begin()); - REQUIRE(cuco::test::all_of(d_contained.begin(), d_contained.end(), cuda::std::identity{})); - } - - SECTION("insert + find") - { - map.insert(pairs_begin, pairs_begin + num_keys); - - thrust::device_vector d_results(num_keys); - map.find(keys_begin, keys_begin + num_keys, d_results.begin()); - - auto zip_equal = cuda::proclaim_return_type([] __device__(auto const& p) { - return static_cast(cuda::std::get<0>(p)) == static_cast(cuda::std::get<1>(p)); - }); - auto gold = cuda::make_transform_iterator( - cuda::counting_iterator(1), - cuda::proclaim_return_type([] __device__(size_type i) -> Value { - return Value{static_cast(i * 10), 0}; - })); - auto zip = thrust::make_zip_iterator(cuda::std::tuple{d_results.begin(), gold}); - REQUIRE(cuco::test::all_of(zip, zip + num_keys, zip_equal)); - } -} - -#endif // CUCO_HAS_128BIT_ATOMICS diff --git a/tests/static_map/retrieve_if_test.cu b/tests/static_map/retrieve_if_test.cu index c91827e57..fe4441822 100644 --- a/tests/static_map/retrieve_if_test.cu +++ b/tests/static_map/retrieve_if_test.cu @@ -16,6 +16,7 @@ #include +#include #include #include @@ -110,7 +111,12 @@ TEMPLATE_TEST_CASE_SIG("static_map retrieve_if", "", ((typename Key, typename T), Key, T), (int32_t, int32_t), - (int64_t, int64_t)) + (int64_t, int64_t) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (__int128_t, __int128_t) +#endif +) { constexpr size_type num_keys{400}; diff --git a/tests/static_map/retrieve_test.cu b/tests/static_map/retrieve_test.cu index 1f2a50ccb..6baae057d 100644 --- a/tests/static_map/retrieve_test.cu +++ b/tests/static_map/retrieve_test.cu @@ -16,6 +16,7 @@ #include +#include #include #include @@ -119,7 +120,15 @@ TEMPLATE_TEST_CASE_SIG( (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 1), (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 1), (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 2), - (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2)) + (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (__int128_t, __int128_t, cuco::test::probe_sequence::double_hashing, 1), + (__int128_t, __int128_t, cuco::test::probe_sequence::double_hashing, 2), + (__int128_t, __int128_t, cuco::test::probe_sequence::linear_probing, 1), + (__int128_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2) +#endif +) { constexpr size_type num_keys{1'000}; diff --git a/tests/static_map/stream_test.cu b/tests/static_map/stream_test.cu index 2261b33b9..6c1701d98 100644 --- a/tests/static_map/stream_test.cu +++ b/tests/static_map/stream_test.cu @@ -16,6 +16,7 @@ #include +#include #include #include @@ -34,7 +35,12 @@ TEMPLATE_TEST_CASE_SIG("static_map: unique sequence of keys on given stream", (int32_t, int32_t), (int32_t, int64_t), (int64_t, int32_t), - (int64_t, int64_t)) + (int64_t, int64_t) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (__int128_t, __int128_t) +#endif +) { cudaStream_t stream; CUCO_CUDA_TRY(cudaStreamCreate(&stream)); diff --git a/tests/static_multimap/count_test.cu b/tests/static_multimap/count_test.cu index aa389dc8d..9b105ae0f 100644 --- a/tests/static_multimap/count_test.cu +++ b/tests/static_multimap/count_test.cu @@ -16,6 +16,7 @@ #include +#include #include #include @@ -82,7 +83,15 @@ TEMPLATE_TEST_CASE_SIG( (int32_t, cuco::test::probe_sequence::linear_probing, 1), (int32_t, cuco::test::probe_sequence::linear_probing, 2), (int64_t, cuco::test::probe_sequence::linear_probing, 1), - (int64_t, cuco::test::probe_sequence::linear_probing, 2)) + (int64_t, cuco::test::probe_sequence::linear_probing, 2) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (__int128_t, cuco::test::probe_sequence::double_hashing, 1), + (__int128_t, cuco::test::probe_sequence::double_hashing, 2), + (__int128_t, cuco::test::probe_sequence::linear_probing, 1), + (__int128_t, cuco::test::probe_sequence::linear_probing, 2) +#endif +) { constexpr size_type num_keys{1'000}; diff --git a/tests/static_multimap/find_test.cu b/tests/static_multimap/find_test.cu index 747bd47fe..b7c40b715 100644 --- a/tests/static_multimap/find_test.cu +++ b/tests/static_multimap/find_test.cu @@ -16,6 +16,7 @@ #include +#include #include #include @@ -109,7 +110,15 @@ TEMPLATE_TEST_CASE_SIG( (int32_t, cuco::test::probe_sequence::linear_probing, 1), (int32_t, cuco::test::probe_sequence::linear_probing, 2), (int64_t, cuco::test::probe_sequence::linear_probing, 1), - (int64_t, cuco::test::probe_sequence::linear_probing, 2)) + (int64_t, cuco::test::probe_sequence::linear_probing, 2) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (__int128_t, cuco::test::probe_sequence::double_hashing, 1), + (__int128_t, cuco::test::probe_sequence::double_hashing, 2), + (__int128_t, cuco::test::probe_sequence::linear_probing, 1), + (__int128_t, cuco::test::probe_sequence::linear_probing, 2) +#endif +) { constexpr size_type num_keys{1'000}; diff --git a/tests/static_multimap/for_each_test.cu b/tests/static_multimap/for_each_test.cu index e6eddcc81..fa26f2c3d 100644 --- a/tests/static_multimap/for_each_test.cu +++ b/tests/static_multimap/for_each_test.cu @@ -16,6 +16,7 @@ #include +#include #include #include @@ -109,7 +110,15 @@ TEMPLATE_TEST_CASE_SIG( (int32_t, cuco::test::probe_sequence::linear_probing, 1), (int32_t, cuco::test::probe_sequence::linear_probing, 2), (int64_t, cuco::test::probe_sequence::linear_probing, 1), - (int64_t, cuco::test::probe_sequence::linear_probing, 2)) + (int64_t, cuco::test::probe_sequence::linear_probing, 2) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (__int128_t, cuco::test::probe_sequence::double_hashing, 1), + (__int128_t, cuco::test::probe_sequence::double_hashing, 2), + (__int128_t, cuco::test::probe_sequence::linear_probing, 1), + (__int128_t, cuco::test::probe_sequence::linear_probing, 2) +#endif +) { constexpr size_t num_unique_keys{400}; constexpr size_t key_multiplicity{5}; diff --git a/tests/static_multimap/insert_contains_test.cu b/tests/static_multimap/insert_contains_test.cu index a149e0f0b..4a3b5bf6e 100644 --- a/tests/static_multimap/insert_contains_test.cu +++ b/tests/static_multimap/insert_contains_test.cu @@ -16,6 +16,7 @@ #include +#include #include #include @@ -95,7 +96,15 @@ TEMPLATE_TEST_CASE_SIG( (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 1), (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 1), (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 2), - (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2)) + (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (__int128_t, __int128_t, cuco::test::probe_sequence::double_hashing, 1), + (__int128_t, __int128_t, cuco::test::probe_sequence::double_hashing, 2), + (__int128_t, __int128_t, cuco::test::probe_sequence::linear_probing, 1), + (__int128_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2) +#endif +) { constexpr std::size_t num_keys{4'000}; diff --git a/tests/static_multimap/insert_if_test.cu b/tests/static_multimap/insert_if_test.cu index 038dbee64..71b3ec854 100644 --- a/tests/static_multimap/insert_if_test.cu +++ b/tests/static_multimap/insert_if_test.cu @@ -16,6 +16,7 @@ #include +#include #include #include @@ -86,7 +87,15 @@ TEMPLATE_TEST_CASE_SIG( (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 1), (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 1), (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 2), - (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2)) + (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (__int128_t, __int128_t, cuco::test::probe_sequence::double_hashing, 1), + (__int128_t, __int128_t, cuco::test::probe_sequence::double_hashing, 2), + (__int128_t, __int128_t, cuco::test::probe_sequence::linear_probing, 1), + (__int128_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2) +#endif +) { constexpr std::size_t num_keys{1'000}; diff --git a/tests/static_multimap/multiplicity_test.cu b/tests/static_multimap/multiplicity_test.cu index 58104781b..e8677b300 100644 --- a/tests/static_multimap/multiplicity_test.cu +++ b/tests/static_multimap/multiplicity_test.cu @@ -16,6 +16,7 @@ #include +#include #include #include @@ -91,7 +92,15 @@ TEMPLATE_TEST_CASE_SIG( (int32_t, cuco::test::probe_sequence::linear_probing, 1), (int32_t, cuco::test::probe_sequence::linear_probing, 8), (int64_t, cuco::test::probe_sequence::linear_probing, 1), - (int64_t, cuco::test::probe_sequence::linear_probing, 8)) + (int64_t, cuco::test::probe_sequence::linear_probing, 8) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (__int128_t, cuco::test::probe_sequence::double_hashing, 1), + (__int128_t, cuco::test::probe_sequence::double_hashing, 8), + (__int128_t, cuco::test::probe_sequence::linear_probing, 1), + (__int128_t, cuco::test::probe_sequence::linear_probing, 8) +#endif +) { constexpr std::size_t num_items{400}; diff --git a/tests/static_multimap/retrieve_if_test.cu b/tests/static_multimap/retrieve_if_test.cu index a1918fbaf..97236d4cb 100644 --- a/tests/static_multimap/retrieve_if_test.cu +++ b/tests/static_multimap/retrieve_if_test.cu @@ -16,6 +16,7 @@ #include +#include #include #include @@ -110,7 +111,12 @@ TEMPLATE_TEST_CASE_SIG("static_multimap retrieve_if", "", ((typename Key, typename Value), Key, Value), (int32_t, int32_t), - (int64_t, int64_t)) + (int64_t, int64_t) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (__int128_t, __int128_t) +#endif +) { constexpr size_type num_keys{400}; diff --git a/tests/static_multimap/retrieve_test.cu b/tests/static_multimap/retrieve_test.cu index a5ff9085e..21e913a10 100644 --- a/tests/static_multimap/retrieve_test.cu +++ b/tests/static_multimap/retrieve_test.cu @@ -16,6 +16,7 @@ #include +#include #include #include @@ -108,7 +109,15 @@ TEMPLATE_TEST_CASE_SIG( (int32_t, cuco::test::probe_sequence::linear_probing, 1), (int32_t, cuco::test::probe_sequence::linear_probing, 4), (int64_t, cuco::test::probe_sequence::linear_probing, 1), - (int64_t, cuco::test::probe_sequence::linear_probing, 4)) + (int64_t, cuco::test::probe_sequence::linear_probing, 4) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (__int128_t, cuco::test::probe_sequence::double_hashing, 1), + (__int128_t, cuco::test::probe_sequence::double_hashing, 4), + (__int128_t, cuco::test::probe_sequence::linear_probing, 1), + (__int128_t, cuco::test::probe_sequence::linear_probing, 4) +#endif +) { constexpr std::size_t num_items{1'000}; diff --git a/tests/static_multiset/contains_test.cu b/tests/static_multiset/contains_test.cu index 82397fec2..1a854c9bd 100644 --- a/tests/static_multiset/contains_test.cu +++ b/tests/static_multiset/contains_test.cu @@ -16,6 +16,7 @@ #include +#include #include #include @@ -87,7 +88,15 @@ TEMPLATE_TEST_CASE_SIG( (int32_t, cuco::test::probe_sequence::linear_probing, 1), (int32_t, cuco::test::probe_sequence::linear_probing, 2), (int64_t, cuco::test::probe_sequence::linear_probing, 1), - (int64_t, cuco::test::probe_sequence::linear_probing, 2)) + (int64_t, cuco::test::probe_sequence::linear_probing, 2) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (__int128_t, cuco::test::probe_sequence::double_hashing, 1), + (__int128_t, cuco::test::probe_sequence::double_hashing, 2), + (__int128_t, cuco::test::probe_sequence::linear_probing, 1), + (__int128_t, cuco::test::probe_sequence::linear_probing, 2) +#endif +) { constexpr size_type num_keys{400}; diff --git a/tests/static_multiset/count_test.cu b/tests/static_multiset/count_test.cu index 797828e26..84d9f4aec 100644 --- a/tests/static_multiset/count_test.cu +++ b/tests/static_multiset/count_test.cu @@ -16,6 +16,7 @@ #include +#include #include #include @@ -189,7 +190,15 @@ TEMPLATE_TEST_CASE_SIG( (int32_t, cuco::test::probe_sequence::linear_probing, 1), (int32_t, cuco::test::probe_sequence::linear_probing, 2), (int64_t, cuco::test::probe_sequence::linear_probing, 1), - (int64_t, cuco::test::probe_sequence::linear_probing, 2)) + (int64_t, cuco::test::probe_sequence::linear_probing, 2) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (__int128_t, cuco::test::probe_sequence::double_hashing, 1), + (__int128_t, cuco::test::probe_sequence::double_hashing, 2), + (__int128_t, cuco::test::probe_sequence::linear_probing, 1), + (__int128_t, cuco::test::probe_sequence::linear_probing, 2) +#endif +) { constexpr size_type num_keys{666}; diff --git a/tests/static_multiset/custom_count_test.cu b/tests/static_multiset/custom_count_test.cu index 2cded9396..6ef2278b9 100644 --- a/tests/static_multiset/custom_count_test.cu +++ b/tests/static_multiset/custom_count_test.cu @@ -16,6 +16,7 @@ #include +#include #include #include @@ -129,7 +130,15 @@ TEMPLATE_TEST_CASE_SIG( (int32_t, cuco::test::probe_sequence::linear_probing, 1), (int32_t, cuco::test::probe_sequence::linear_probing, 2), (int64_t, cuco::test::probe_sequence::linear_probing, 1), - (int64_t, cuco::test::probe_sequence::linear_probing, 2)) + (int64_t, cuco::test::probe_sequence::linear_probing, 2) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (__int128_t, cuco::test::probe_sequence::double_hashing, 1), + (__int128_t, cuco::test::probe_sequence::double_hashing, 2), + (__int128_t, cuco::test::probe_sequence::linear_probing, 1), + (__int128_t, cuco::test::probe_sequence::linear_probing, 2) +#endif +) { constexpr size_type num_keys{555}; diff --git a/tests/static_multiset/find_test.cu b/tests/static_multiset/find_test.cu index ecb763017..b70919859 100644 --- a/tests/static_multiset/find_test.cu +++ b/tests/static_multiset/find_test.cu @@ -16,6 +16,7 @@ #include +#include #include #include @@ -100,7 +101,15 @@ TEMPLATE_TEST_CASE_SIG( (int32_t, cuco::test::probe_sequence::linear_probing, 1), (int32_t, cuco::test::probe_sequence::linear_probing, 2), (int64_t, cuco::test::probe_sequence::linear_probing, 1), - (int64_t, cuco::test::probe_sequence::linear_probing, 2)) + (int64_t, cuco::test::probe_sequence::linear_probing, 2) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (__int128_t, cuco::test::probe_sequence::double_hashing, 1), + (__int128_t, cuco::test::probe_sequence::double_hashing, 2), + (__int128_t, cuco::test::probe_sequence::linear_probing, 1), + (__int128_t, cuco::test::probe_sequence::linear_probing, 2) +#endif +) { constexpr size_type num_keys{1'000}; diff --git a/tests/static_multiset/for_each_test.cu b/tests/static_multiset/for_each_test.cu index 43b22eee9..5421df342 100644 --- a/tests/static_multiset/for_each_test.cu +++ b/tests/static_multiset/for_each_test.cu @@ -16,6 +16,7 @@ #include +#include #include #include @@ -102,7 +103,15 @@ TEMPLATE_TEST_CASE_SIG( (int32_t, cuco::test::probe_sequence::linear_probing, 1), (int32_t, cuco::test::probe_sequence::linear_probing, 2), (int64_t, cuco::test::probe_sequence::linear_probing, 1), - (int64_t, cuco::test::probe_sequence::linear_probing, 2)) + (int64_t, cuco::test::probe_sequence::linear_probing, 2) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (__int128_t, cuco::test::probe_sequence::double_hashing, 1), + (__int128_t, cuco::test::probe_sequence::double_hashing, 2), + (__int128_t, cuco::test::probe_sequence::linear_probing, 1), + (__int128_t, cuco::test::probe_sequence::linear_probing, 2) +#endif +) { constexpr size_t num_unique_keys{400}; constexpr size_t key_multiplicity{5}; diff --git a/tests/static_multiset/insert_test.cu b/tests/static_multiset/insert_test.cu index 0df7db4d2..60e09ae02 100644 --- a/tests/static_multiset/insert_test.cu +++ b/tests/static_multiset/insert_test.cu @@ -16,6 +16,7 @@ #include +#include #include #include @@ -85,7 +86,15 @@ TEMPLATE_TEST_CASE_SIG( (int32_t, cuco::test::probe_sequence::linear_probing, 1), (int32_t, cuco::test::probe_sequence::linear_probing, 2), (int64_t, cuco::test::probe_sequence::linear_probing, 1), - (int64_t, cuco::test::probe_sequence::linear_probing, 2)) + (int64_t, cuco::test::probe_sequence::linear_probing, 2) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (__int128_t, cuco::test::probe_sequence::double_hashing, 1), + (__int128_t, cuco::test::probe_sequence::double_hashing, 2), + (__int128_t, cuco::test::probe_sequence::linear_probing, 1), + (__int128_t, cuco::test::probe_sequence::linear_probing, 2) +#endif +) { constexpr size_type num_keys{400}; diff --git a/tests/static_multiset/load_factor_test.cu b/tests/static_multiset/load_factor_test.cu index 4bfbe8a14..154bbf06d 100644 --- a/tests/static_multiset/load_factor_test.cu +++ b/tests/static_multiset/load_factor_test.cu @@ -16,6 +16,7 @@ #include +#include #include #include @@ -33,7 +34,15 @@ TEMPLATE_TEST_CASE_SIG( (int32_t, cuco::test::probe_sequence::linear_probing, 1), (int32_t, cuco::test::probe_sequence::linear_probing, 2), (int64_t, cuco::test::probe_sequence::linear_probing, 1), - (int64_t, cuco::test::probe_sequence::linear_probing, 2)) + (int64_t, cuco::test::probe_sequence::linear_probing, 2) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (__int128_t, cuco::test::probe_sequence::double_hashing, 1), + (__int128_t, cuco::test::probe_sequence::double_hashing, 2), + (__int128_t, cuco::test::probe_sequence::linear_probing, 1), + (__int128_t, cuco::test::probe_sequence::linear_probing, 2) +#endif +) { constexpr size_type num_keys{10}; diff --git a/tests/static_multiset/retrieve_if_test.cu b/tests/static_multiset/retrieve_if_test.cu index 19defd557..d9ad964fa 100644 --- a/tests/static_multiset/retrieve_if_test.cu +++ b/tests/static_multiset/retrieve_if_test.cu @@ -16,6 +16,7 @@ #include +#include #include #include @@ -106,8 +107,16 @@ __global__ void test_retrieve_if_all_true_kernel( *atomic_counter); } -TEMPLATE_TEST_CASE_SIG( - "static_multiset retrieve_if", "", ((typename Key), Key), (int32_t), (int64_t)) +TEMPLATE_TEST_CASE_SIG("static_multiset retrieve_if", + "", + ((typename Key), Key), + (int32_t), + (int64_t) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (__int128_t) +#endif +) { constexpr size_type num_keys{400}; diff --git a/tests/static_multiset/retrieve_test.cu b/tests/static_multiset/retrieve_test.cu index acac841df..3986928f5 100644 --- a/tests/static_multiset/retrieve_test.cu +++ b/tests/static_multiset/retrieve_test.cu @@ -16,6 +16,7 @@ #include +#include #include #include @@ -140,7 +141,15 @@ TEMPLATE_TEST_CASE_SIG( (int32_t, cuco::test::probe_sequence::linear_probing, 1), (int32_t, cuco::test::probe_sequence::linear_probing, 2), (int64_t, cuco::test::probe_sequence::linear_probing, 1), - (int64_t, cuco::test::probe_sequence::linear_probing, 2)) + (int64_t, cuco::test::probe_sequence::linear_probing, 2) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (__int128_t, cuco::test::probe_sequence::double_hashing, 1), + (__int128_t, cuco::test::probe_sequence::double_hashing, 2), + (__int128_t, cuco::test::probe_sequence::linear_probing, 1), + (__int128_t, cuco::test::probe_sequence::linear_probing, 2) +#endif +) { constexpr std::size_t num_keys{400}; constexpr double desired_load_factor = 0.5; diff --git a/tests/static_multiset/stream_test.cu b/tests/static_multiset/stream_test.cu index 09ceb88f3..1264eb21d 100644 --- a/tests/static_multiset/stream_test.cu +++ b/tests/static_multiset/stream_test.cu @@ -16,6 +16,7 @@ #include +#include #include #include @@ -33,7 +34,12 @@ TEMPLATE_TEST_CASE_SIG("static_multiset: operations on different stream than con "", ((typename Key), Key), (int32_t), - (int64_t)) + (int64_t) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (__int128_t) +#endif +) { cudaStream_t constructor_stream; cudaStream_t operation_stream; diff --git a/tests/static_set/for_each_test.cu b/tests/static_set/for_each_test.cu index 7db5d0d5f..5374f8fac 100644 --- a/tests/static_set/for_each_test.cu +++ b/tests/static_set/for_each_test.cu @@ -16,6 +16,7 @@ #include +#include #include #include @@ -81,7 +82,15 @@ TEMPLATE_TEST_CASE_SIG( (int32_t, cuco::test::probe_sequence::linear_probing, 1), (int32_t, cuco::test::probe_sequence::linear_probing, 2), (int64_t, cuco::test::probe_sequence::linear_probing, 1), - (int64_t, cuco::test::probe_sequence::linear_probing, 2)) + (int64_t, cuco::test::probe_sequence::linear_probing, 2) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (__int128_t, cuco::test::probe_sequence::double_hashing, 1), + (__int128_t, cuco::test::probe_sequence::double_hashing, 2), + (__int128_t, cuco::test::probe_sequence::linear_probing, 1), + (__int128_t, cuco::test::probe_sequence::linear_probing, 2) +#endif +) { constexpr size_type num_keys{1'000}; using probe = std::conditional_t< @@ -97,6 +106,6 @@ TEMPLATE_TEST_CASE_SIG( cuco::cuda_allocator, cuco::storage<2>>; - auto set = set_t{num_keys, cuco::empty_key{-1}}; + auto set = set_t{num_keys, cuco::empty_key{static_cast(-1)}}; test_for_each(set, num_keys); } diff --git a/tests/static_set/insert_and_find_test.cu b/tests/static_set/insert_and_find_test.cu index f01ae8884..7eefdfb6c 100644 --- a/tests/static_set/insert_and_find_test.cu +++ b/tests/static_set/insert_and_find_test.cu @@ -16,6 +16,7 @@ #include +#include #include #include @@ -62,7 +63,15 @@ TEMPLATE_TEST_CASE_SIG( (int32_t, cuco::test::probe_sequence::linear_probing, 1), (int32_t, cuco::test::probe_sequence::linear_probing, 2), (int64_t, cuco::test::probe_sequence::linear_probing, 1), - (int64_t, cuco::test::probe_sequence::linear_probing, 2)) + (int64_t, cuco::test::probe_sequence::linear_probing, 2) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (__int128_t, cuco::test::probe_sequence::double_hashing, 1), + (__int128_t, cuco::test::probe_sequence::double_hashing, 2), + (__int128_t, cuco::test::probe_sequence::linear_probing, 1), + (__int128_t, cuco::test::probe_sequence::linear_probing, 2) +#endif +) { constexpr std::size_t num_keys{400}; @@ -70,8 +79,8 @@ TEMPLATE_TEST_CASE_SIG( cuco::linear_probing>, cuco::double_hashing>>; - auto set = - cuco::static_set{num_keys, cuco::empty_key{-1}, {}, probe{}, {}, cuco::storage<2>{}}; + auto set = cuco::static_set{ + num_keys, cuco::empty_key{static_cast(-1)}, {}, probe{}, {}, cuco::storage<2>{}}; test_insert_and_find(set, num_keys); } diff --git a/tests/static_set/large_key_type_test.cu b/tests/static_set/large_key_type_test.cu deleted file mode 100644 index 6212f0087..000000000 --- a/tests/static_set/large_key_type_test.cu +++ /dev/null @@ -1,135 +0,0 @@ -/* - * Copyright (c) 2026, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include - -#if defined(CUCO_HAS_128BIT_ATOMICS) - -#include - -#include - -#include -#include -#include -#include - -#include - -struct alignas(16) uint128_t { - cuda::std::uint64_t lo; - cuda::std::uint64_t hi; - - __host__ __device__ constexpr bool operator==(uint128_t const& o) const - { - return lo == o.lo and hi == o.hi; - } - __host__ __device__ constexpr bool operator!=(uint128_t const& o) const { return !(*this == o); } -}; - -CUCO_DECLARE_BITWISE_COMPARABLE(uint128_t) - -TEST_CASE("static_set 128-bit key unique sequence", "") -{ - using Key = uint128_t; - - constexpr std::size_t num_keys{400}; - - Key const empty_sentinel{~0ULL, ~0ULL}; - - using probe = cuco::linear_probing<1, cuco::default_hash_function>; - - auto set = cuco::static_set{ - num_keys, cuco::empty_key{empty_sentinel}, {}, probe{}, {}, cuco::storage<2>{}}; - - auto keys_begin = cuda::make_transform_iterator( - cuda::counting_iterator(0), - cuda::proclaim_return_type( - [] __device__(std::size_t i) -> Key { return Key{static_cast(i), 0}; })); - - thrust::device_vector d_contained(num_keys); - - SECTION("Non-inserted keys should not be contained.") - { - REQUIRE(set.size() == 0); - - set.contains(keys_begin, keys_begin + num_keys, d_contained.begin()); - REQUIRE(cuco::test::none_of(d_contained.begin(), d_contained.end(), cuda::std::identity{})); - } - - SECTION("All inserted keys should be contained.") - { - auto const inserted = set.insert(keys_begin, keys_begin + num_keys); - REQUIRE(inserted == num_keys); - REQUIRE(set.size() == num_keys); - - set.contains(keys_begin, keys_begin + num_keys, d_contained.begin()); - REQUIRE(cuco::test::all_of(d_contained.begin(), d_contained.end(), cuda::std::identity{})); - } - - SECTION("All inserted keys should be correctly recovered during find") - { - set.insert(keys_begin, keys_begin + num_keys); - - thrust::device_vector d_results(num_keys); - set.find(keys_begin, keys_begin + num_keys, d_results.begin()); - - auto zip_equal = cuda::proclaim_return_type([] __device__(auto const& p) { - return static_cast(cuda::std::get<0>(p)) == static_cast(cuda::std::get<1>(p)); - }); - auto zip = thrust::make_zip_iterator(cuda::std::tuple{d_results.begin(), keys_begin}); - - REQUIRE(cuco::test::all_of(zip, zip + num_keys, zip_equal)); - } -} - -TEST_CASE("static_set 128-bit key insert_and_find", "") -{ - using Key = uint128_t; - - constexpr std::size_t num_keys{400}; - - Key const empty_sentinel{~0ULL, ~0ULL}; - - using probe = cuco::linear_probing<1, cuco::default_hash_function>; - - auto set = cuco::static_set{ - num_keys, cuco::empty_key{empty_sentinel}, {}, probe{}, {}, cuco::storage<2>{}}; - - auto keys_begin = cuda::make_transform_iterator( - cuda::counting_iterator(0), - cuda::proclaim_return_type( - [] __device__(std::size_t i) -> Key { return Key{static_cast(i), 0}; })); - - thrust::device_vector iters1(num_keys); - thrust::device_vector inserted(num_keys); - - // insert first time, fills inserted with true - set.insert_and_find(keys_begin, keys_begin + num_keys, iters1.begin(), inserted.begin()); - REQUIRE(cuco::test::all_of(inserted.begin(), inserted.end(), cuda::std::identity{})); - - // insert second time, fills inserted with false as keys already in set - thrust::device_vector iters2(num_keys); - set.insert_and_find(keys_begin, keys_begin + num_keys, iters2.begin(), inserted.begin()); - REQUIRE(cuco::test::none_of(inserted.begin(), inserted.end(), cuda::std::identity{})); - - // both iters1 and iters2 should be same, as keys will be referring to same slot - auto equal_fn = cuda::proclaim_return_type( - [] __device__(auto const& a, auto const& b) { return a == b; }); - REQUIRE(cuco::test::equal(iters1.begin(), iters1.end(), iters2.begin(), equal_fn)); -} - -#endif // CUCO_HAS_128BIT_ATOMICS diff --git a/tests/static_set/retrieve_all_test.cu b/tests/static_set/retrieve_all_test.cu index ed187032f..50b4d68a5 100644 --- a/tests/static_set/retrieve_all_test.cu +++ b/tests/static_set/retrieve_all_test.cu @@ -16,6 +16,7 @@ #include +#include #include #include @@ -67,7 +68,15 @@ TEMPLATE_TEST_CASE_SIG( (int32_t, cuco::test::probe_sequence::linear_probing, 1), (int32_t, cuco::test::probe_sequence::linear_probing, 2), (int64_t, cuco::test::probe_sequence::linear_probing, 1), - (int64_t, cuco::test::probe_sequence::linear_probing, 2)) + (int64_t, cuco::test::probe_sequence::linear_probing, 2) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (__int128_t, cuco::test::probe_sequence::double_hashing, 1), + (__int128_t, cuco::test::probe_sequence::double_hashing, 2), + (__int128_t, cuco::test::probe_sequence::linear_probing, 1), + (__int128_t, cuco::test::probe_sequence::linear_probing, 2) +#endif +) { constexpr std::size_t num_keys{400}; constexpr double desired_load_factor = 1.; @@ -85,7 +94,8 @@ TEMPLATE_TEST_CASE_SIG( } }(); - auto set = cuco::static_set{num_keys, desired_load_factor, cuco::empty_key{-1}, {}, probe{}}; + auto set = cuco::static_set{ + num_keys, desired_load_factor, cuco::empty_key{static_cast(-1)}, {}, probe{}}; REQUIRE(set.capacity() == gold_capacity); diff --git a/tests/static_set/retrieve_if_test.cu b/tests/static_set/retrieve_if_test.cu index 129fe68af..b82969926 100644 --- a/tests/static_set/retrieve_if_test.cu +++ b/tests/static_set/retrieve_if_test.cu @@ -16,6 +16,7 @@ #include +#include #include #include @@ -106,13 +107,22 @@ __global__ void test_retrieve_if_all_true_kernel( *atomic_counter); } -TEMPLATE_TEST_CASE_SIG("static_set retrieve_if", "", ((typename Key), Key), (int32_t), (int64_t)) +TEMPLATE_TEST_CASE_SIG("static_set retrieve_if", + "", + ((typename Key), Key), + (int32_t), + (int64_t) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (__int128_t) +#endif +) { constexpr size_type num_keys{400}; using container_type = cuco::static_set; - container_type container{num_keys * 2, cuco::empty_key{-1}}; + container_type container{num_keys * 2, cuco::empty_key{static_cast(-1)}}; auto keys_begin = cuda::counting_iterator(1); auto keys_end = keys_begin + num_keys; diff --git a/tests/static_set/retrieve_test.cu b/tests/static_set/retrieve_test.cu index edbcb4c20..f5b7763dc 100644 --- a/tests/static_set/retrieve_test.cu +++ b/tests/static_set/retrieve_test.cu @@ -16,6 +16,7 @@ #include +#include #include #include @@ -81,7 +82,15 @@ TEMPLATE_TEST_CASE_SIG( (int32_t, cuco::test::probe_sequence::linear_probing, 1), (int32_t, cuco::test::probe_sequence::linear_probing, 2), (int64_t, cuco::test::probe_sequence::linear_probing, 1), - (int64_t, cuco::test::probe_sequence::linear_probing, 2)) + (int64_t, cuco::test::probe_sequence::linear_probing, 2) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (__int128_t, cuco::test::probe_sequence::double_hashing, 1), + (__int128_t, cuco::test::probe_sequence::double_hashing, 2), + (__int128_t, cuco::test::probe_sequence::linear_probing, 1), + (__int128_t, cuco::test::probe_sequence::linear_probing, 2) +#endif +) { constexpr std::size_t num_keys{400}; constexpr double desired_load_factor = 1.; @@ -90,8 +99,11 @@ TEMPLATE_TEST_CASE_SIG( cuco::linear_probing>, cuco::double_hashing>>; - auto set = cuco::static_set{ - num_keys, desired_load_factor, cuco::empty_key{key_sentinel}, {}, probe{}}; + auto set = cuco::static_set{num_keys, + desired_load_factor, + cuco::empty_key{static_cast(key_sentinel)}, + {}, + probe{}}; test_unique_sequence(set, num_keys); } diff --git a/tests/static_set/stream_test.cu b/tests/static_set/stream_test.cu index 47e96283f..f9277ed8f 100644 --- a/tests/static_set/stream_test.cu +++ b/tests/static_set/stream_test.cu @@ -16,6 +16,7 @@ #include +#include #include #include @@ -33,7 +34,12 @@ TEMPLATE_TEST_CASE_SIG("static_set: operations on different stream than construc "", ((typename Key), Key), (int32_t), - (int64_t)) + (int64_t) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (__int128_t) +#endif +) { cudaStream_t constructor_stream; cudaStream_t operation_stream; @@ -43,7 +49,7 @@ TEMPLATE_TEST_CASE_SIG("static_set: operations on different stream than construc { // Scope ensures set is destroyed before streams constexpr std::size_t num_keys{500'000}; auto set = cuco::static_set{num_keys * 2, - cuco::empty_key{-1}, + cuco::empty_key{static_cast(-1)}, {}, cuco::linear_probing<1, cuco::default_hash_function>{}, {}, diff --git a/tests/static_set/unique_sequence_test.cu b/tests/static_set/unique_sequence_test.cu index e43329b6b..2fd0b4761 100644 --- a/tests/static_set/unique_sequence_test.cu +++ b/tests/static_set/unique_sequence_test.cu @@ -16,6 +16,7 @@ #include +#include #include #include @@ -147,7 +148,15 @@ TEMPLATE_TEST_CASE_SIG( (int32_t, cuco::test::probe_sequence::linear_probing, 1), (int32_t, cuco::test::probe_sequence::linear_probing, 2), (int64_t, cuco::test::probe_sequence::linear_probing, 1), - (int64_t, cuco::test::probe_sequence::linear_probing, 2)) + (int64_t, cuco::test::probe_sequence::linear_probing, 2) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (__int128_t, cuco::test::probe_sequence::double_hashing, 1), + (__int128_t, cuco::test::probe_sequence::double_hashing, 2), + (__int128_t, cuco::test::probe_sequence::linear_probing, 1), + (__int128_t, cuco::test::probe_sequence::linear_probing, 2) +#endif +) { constexpr size_type num_keys{400}; using probe = std::conditional_t{SENTINEL}, {}, probe{}, {}, cuco::storage<2>{}}; + auto set = cuco::static_set{num_keys, + cuco::empty_key{static_cast(SENTINEL)}, + {}, + probe{}, + {}, + cuco::storage<2>{}}; REQUIRE(set.capacity() == gold_capacity); From fa595c146e25956e8272ed92bc1d4c4b609cf1a0 Mon Sep 17 00:00:00 2001 From: Daniel Juenger Date: Fri, 17 Apr 2026 14:52:47 -0700 Subject: [PATCH 4/8] Extend 128-bit atomics: helper dedup + bench types + skipped tests --- benchmarks/benchmark_defaults.hpp | 4 +- benchmarks/benchmark_utils.hpp | 2 + .../open_addressing_ref_impl.cuh | 94 +++++++------------ tests/static_map/heterogeneous_lookup_test.cu | 5 + tests/static_map/shared_memory_test.cu | 8 +- .../heterogeneous_lookup_test.cu | 3 + tests/static_multiset/large_input_test.cu | 13 ++- tests/static_set/heterogeneous_lookup_test.cu | 9 +- tests/static_set/large_input_test.cu | 13 ++- tests/static_set/shared_memory_test.cu | 13 ++- 10 files changed, 94 insertions(+), 70 deletions(-) diff --git a/benchmarks/benchmark_defaults.hpp b/benchmarks/benchmark_defaults.hpp index 2fc4439f4..bc4861ae2 100644 --- a/benchmarks/benchmark_defaults.hpp +++ b/benchmarks/benchmark_defaults.hpp @@ -25,8 +25,8 @@ namespace cuco::benchmark::defaults { -using KEY_TYPE_RANGE = nvbench::type_list; -using VALUE_TYPE_RANGE = nvbench::type_list; +using KEY_TYPE_RANGE = nvbench::type_list; +using VALUE_TYPE_RANGE = nvbench::type_list; using HASH_RANGE = nvbench::type_list, cuco::xxhash_32, cuco::xxhash_64, diff --git a/benchmarks/benchmark_utils.hpp b/benchmarks/benchmark_utils.hpp index a9bd690d7..6a4aae833 100644 --- a/benchmarks/benchmark_utils.hpp +++ b/benchmarks/benchmark_utils.hpp @@ -92,3 +92,5 @@ NVBENCH_DECLARE_TYPE_STRINGS(cuco::utility::distribution::uniform, NVBENCH_DECLARE_TYPE_STRINGS(cuco::utility::distribution::gaussian, "GAUSSIAN", "distribution::gaussian"); + +NVBENCH_DECLARE_TYPE_STRINGS(__int128_t, "I128", "__int128_t"); diff --git a/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh index 3d792c9b5..e19c62291 100644 --- a/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh +++ b/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh @@ -545,41 +545,17 @@ class open_addressing_ref_impl { // If the key is already in the container, return false if (eq_res == detail::equal_result::EQUAL) { - if constexpr (has_payload and sizeof(value_type) > 8) { -#if (__CUDA_ARCH__ >= 900) - if constexpr (not cuco::detail::is_packable()) { - this->wait_for_payload(slot_ptr->second, this->empty_value_sentinel()); - } -#else - this->wait_for_payload(slot_ptr->second, this->empty_value_sentinel()); -#endif - } + this->maybe_wait_for_payload(slot_ptr); return {iterator{slot_ptr}, false}; } if (eq_res == detail::equal_result::AVAILABLE) { switch (this->attempt_insert_stable(slot_ptr, bucket_slots[i], val)) { case insert_result::SUCCESS: { - if constexpr (has_payload and sizeof(value_type) > 8) { -#if (__CUDA_ARCH__ >= 900) - if constexpr (not cuco::detail::is_packable()) { - this->wait_for_payload(slot_ptr->second, this->empty_value_sentinel()); - } -#else - this->wait_for_payload(slot_ptr->second, this->empty_value_sentinel()); -#endif - } + this->maybe_wait_for_payload(slot_ptr); return {iterator{slot_ptr}, true}; } case insert_result::DUPLICATE: { - if constexpr (has_payload and sizeof(value_type) > 8) { -#if (__CUDA_ARCH__ >= 900) - if constexpr (not cuco::detail::is_packable()) { - this->wait_for_payload(slot_ptr->second, this->empty_value_sentinel()); - } -#else - this->wait_for_payload(slot_ptr->second, this->empty_value_sentinel()); -#endif - } + this->maybe_wait_for_payload(slot_ptr); return {iterator{slot_ptr}, false}; } default: continue; @@ -647,17 +623,7 @@ class open_addressing_ref_impl { if (group_finds_equal) { auto const src_lane = __ffs(group_finds_equal) - 1; auto const res = group.shfl(reinterpret_cast(slot_ptr), src_lane); - if (group.thread_rank() == src_lane) { - if constexpr (has_payload and sizeof(value_type) > 8) { -#if (__CUDA_ARCH__ >= 900) - if constexpr (not cuco::detail::is_packable()) { - this->wait_for_payload(slot_ptr->second, this->empty_value_sentinel()); - } -#else - this->wait_for_payload(slot_ptr->second, this->empty_value_sentinel()); -#endif - } - } + if (group.thread_rank() == src_lane) { this->maybe_wait_for_payload(slot_ptr); } group.sync(); return {iterator{reinterpret_cast(res)}, false}; } @@ -673,32 +639,12 @@ class open_addressing_ref_impl { switch (group.shfl(status, src_lane)) { case insert_result::SUCCESS: { - if (group.thread_rank() == src_lane) { - if constexpr (has_payload and sizeof(value_type) > 8) { -#if (__CUDA_ARCH__ >= 900) - if constexpr (not cuco::detail::is_packable()) { - this->wait_for_payload(slot_ptr->second, this->empty_value_sentinel()); - } -#else - this->wait_for_payload(slot_ptr->second, this->empty_value_sentinel()); -#endif - } - } + if (group.thread_rank() == src_lane) { this->maybe_wait_for_payload(slot_ptr); } group.sync(); return {iterator{reinterpret_cast(res)}, true}; } case insert_result::DUPLICATE: { - if (group.thread_rank() == src_lane) { - if constexpr (has_payload and sizeof(value_type) > 8) { -#if (__CUDA_ARCH__ >= 900) - if constexpr (not cuco::detail::is_packable()) { - this->wait_for_payload(slot_ptr->second, this->empty_value_sentinel()); - } -#else - this->wait_for_payload(slot_ptr->second, this->empty_value_sentinel()); -#endif - } - } + if (group.thread_rank() == src_lane) { this->maybe_wait_for_payload(slot_ptr); } group.sync(); return {iterator{reinterpret_cast(res)}, false}; } @@ -1973,6 +1919,34 @@ class open_addressing_ref_impl { } while (cuco::detail::bitwise_compare(current, sentinel)); } + /** + * @brief Conditionally spin-waits for the payload of a non-atomically inserted slot to become + * visible. + * + * For containers where the key and value are inserted by separate instructions + * (`cas_dependent_write` / `back_to_back_cas`), an observer thread may see the key before the + * payload. This helper spins until the payload is visible. For atomic single-CAS paths (slot + * size <= 8 bytes, or a packable slot on sm_90+ via `atom.cas.b128`), the payload is already + * visible and this is a no-op. + * + * @tparam SlotPtr Pointer-like type to a slot holding a `.second` payload member + * + * @param slot_ptr Pointer to the slot whose payload may need waiting on + */ + template + __device__ void maybe_wait_for_payload(SlotPtr slot_ptr) noexcept + { + if constexpr (has_payload and sizeof(value_type) > 8) { +#if (__CUDA_ARCH__ >= 900) + if constexpr (not cuco::detail::is_packable()) { + this->wait_for_payload(slot_ptr->second, this->empty_value_sentinel()); + } +#else + this->wait_for_payload(slot_ptr->second, this->empty_value_sentinel()); +#endif + } + } + // TODO: Clean up the sentinel handling since it's duplicated in ref and equal wrapper value_type empty_slot_sentinel_; ///< Sentinel value indicating an empty slot detail::equal_wrapper diff --git a/tests/static_map/heterogeneous_lookup_test.cu b/tests/static_map/heterogeneous_lookup_test.cu index b168d861d..3b8b0d023 100644 --- a/tests/static_map/heterogeneous_lookup_test.cu +++ b/tests/static_map/heterogeneous_lookup_test.cu @@ -16,6 +16,7 @@ #include +#include #include #include @@ -88,6 +89,10 @@ TEMPLATE_TEST_CASE_SIG("static_map heterogeneous lookup tests", (int64_t, 1), (int64_t, 2), #endif +#if defined(CUCO_HAS_128BIT_ATOMICS) + (__int128_t, 1), + (__int128_t, 2), +#endif (int32_t, 1), (int32_t, 2)) diff --git a/tests/static_map/shared_memory_test.cu b/tests/static_map/shared_memory_test.cu index cac1d08cd..15731319c 100644 --- a/tests/static_map/shared_memory_test.cu +++ b/tests/static_map/shared_memory_test.cu @@ -16,6 +16,7 @@ #include +#include #include #include @@ -72,7 +73,12 @@ TEMPLATE_TEST_CASE_SIG("static_map shared memory tests", (int32_t, int32_t), (int32_t, int64_t), (int64_t, int32_t), - (int64_t, int64_t)) + (int64_t, int64_t) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (__int128_t, __int128_t) +#endif +) { constexpr std::size_t number_of_maps = 1000; constexpr std::size_t elements_in_map = 500; diff --git a/tests/static_multimap/heterogeneous_lookup_test.cu b/tests/static_multimap/heterogeneous_lookup_test.cu index e99ef6384..97a382d96 100644 --- a/tests/static_multimap/heterogeneous_lookup_test.cu +++ b/tests/static_multimap/heterogeneous_lookup_test.cu @@ -78,6 +78,9 @@ struct custom_key_equal { } }; +// TODO: extend with __int128_t once the multimap can handle slots larger than +// 32 bytes (key_pair<__int128_t> is 32 bytes, so pair, V> +// exceeds the current slot-size budget). TEMPLATE_TEST_CASE( "static_multimap heterogeneous lookup tests", "", diff --git a/tests/static_multiset/large_input_test.cu b/tests/static_multiset/large_input_test.cu index 37e2dedcb..20cd43217 100644 --- a/tests/static_multiset/large_input_test.cu +++ b/tests/static_multiset/large_input_test.cu @@ -16,6 +16,7 @@ #include +#include #include #include @@ -56,9 +57,17 @@ TEMPLATE_TEST_CASE_SIG( "", ((typename Key, cuco::test::probe_sequence Probe, int CGSize), Key, Probe, CGSize), (int64_t, cuco::test::probe_sequence::double_hashing, 1), - (int64_t, cuco::test::probe_sequence::double_hashing, 2)) + (int64_t, cuco::test::probe_sequence::double_hashing, 2) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (__int128_t, cuco::test::probe_sequence::double_hashing, 1), + (__int128_t, cuco::test::probe_sequence::double_hashing, 2) +#endif +) { - constexpr std::size_t num_keys{1'200'000'000}; + // Reduce the key count for 16-byte keys to stay within GPU memory. + // 1.2B * 8B * 2 (capacity) = 19.2GB; 300M * 16B * 2 = 9.6GB. + constexpr std::size_t num_keys = (sizeof(Key) >= 16) ? 300'000'000 : 1'200'000'000; using extent_type = cuco::extent; using probe = cuco::double_hashing>; diff --git a/tests/static_set/heterogeneous_lookup_test.cu b/tests/static_set/heterogeneous_lookup_test.cu index 57a1b291a..10e471ffe 100644 --- a/tests/static_set/heterogeneous_lookup_test.cu +++ b/tests/static_set/heterogeneous_lookup_test.cu @@ -16,6 +16,7 @@ #include +#include #include #include @@ -84,7 +85,13 @@ TEMPLATE_TEST_CASE_SIG("static_set heterogeneous lookup tests", "", ((typename T, int CGSize), T, CGSize), (int32_t, 1), - (int32_t, 2)) + (int32_t, 2) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (__int128_t, 1), + (__int128_t, 2) +#endif +) { using Key = T; using InsertKey = key_pair; diff --git a/tests/static_set/large_input_test.cu b/tests/static_set/large_input_test.cu index 67daeafc6..b3096ee5f 100644 --- a/tests/static_set/large_input_test.cu +++ b/tests/static_set/large_input_test.cu @@ -16,6 +16,7 @@ #include +#include #include #include @@ -74,9 +75,17 @@ TEMPLATE_TEST_CASE_SIG( "", ((typename Key, cuco::test::probe_sequence Probe, int CGSize), Key, Probe, CGSize), (int64_t, cuco::test::probe_sequence::double_hashing, 1), - (int64_t, cuco::test::probe_sequence::double_hashing, 2)) + (int64_t, cuco::test::probe_sequence::double_hashing, 2) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (__int128_t, cuco::test::probe_sequence::double_hashing, 1), + (__int128_t, cuco::test::probe_sequence::double_hashing, 2) +#endif +) { - constexpr std::size_t num_keys{1'200'000'000}; + // Reduce the key count for 16-byte keys to stay within GPU memory. + // 1.2B * 8B * 2 (capacity) = 19.2GB; 300M * 16B * 2 = 9.6GB. + constexpr std::size_t num_keys = (sizeof(Key) >= 16) ? 300'000'000 : 1'200'000'000; using extent_type = cuco::extent; using probe = cuco::double_hashing>; diff --git a/tests/static_set/shared_memory_test.cu b/tests/static_set/shared_memory_test.cu index 374271e9a..08f805927 100644 --- a/tests/static_set/shared_memory_test.cu +++ b/tests/static_set/shared_memory_test.cu @@ -16,6 +16,7 @@ #include +#include #include #include @@ -63,8 +64,16 @@ __global__ void shared_memory_test_kernel(Ref* sets, } } -TEMPLATE_TEST_CASE_SIG( - "static_set shared memory tests", "", ((typename Key), Key), (int32_t), (int64_t)) +TEMPLATE_TEST_CASE_SIG("static_set shared memory tests", + "", + ((typename Key), Key), + (int32_t), + (int64_t) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (__int128_t) +#endif +) { constexpr std::size_t number_of_sets = 1000; constexpr std::size_t elements_in_set = 500; From 6a2a1d2a06311b16e56aeec8bfa452a745214074 Mon Sep 17 00:00:00 2001 From: Daniel Juenger Date: Fri, 17 Apr 2026 15:43:06 -0700 Subject: [PATCH 5/8] Address review comments --- benchmarks/benchmark_defaults.hpp | 14 +++++++++---- benchmarks/benchmark_utils.hpp | 3 +++ include/cuco/detail/__config | 3 +++ include/cuco/detail/bitwise_compare.cuh | 8 ++++--- .../open_addressing/open_addressing_impl.cuh | 6 ------ .../open_addressing_ref_impl.cuh | 21 ++++++++++++------- include/cuco/detail/pair/pair.inl | 4 ++-- include/cuco/detail/static_map/static_map.inl | 6 +++--- tests/static_map/insert_or_apply_test.cu | 5 ++--- tests/static_map/insert_or_assign_test.cu | 5 ++--- tests/static_map/retrieve_test.cu | 5 ++--- tests/static_multimap/insert_contains_test.cu | 5 ++--- tests/static_multimap/insert_if_test.cu | 5 ++--- 13 files changed, 49 insertions(+), 41 deletions(-) diff --git a/benchmarks/benchmark_defaults.hpp b/benchmarks/benchmark_defaults.hpp index bc4861ae2..ac7462fbc 100644 --- a/benchmarks/benchmark_defaults.hpp +++ b/benchmarks/benchmark_defaults.hpp @@ -16,6 +16,7 @@ #pragma once +#include #include #include @@ -25,12 +26,17 @@ namespace cuco::benchmark::defaults { +#if defined(CUCO_HAS_128BIT_ATOMICS) using KEY_TYPE_RANGE = nvbench::type_list; using VALUE_TYPE_RANGE = nvbench::type_list; -using HASH_RANGE = nvbench::type_list, - cuco::xxhash_32, - cuco::xxhash_64, - cuco::murmurhash3_32>; //, +#else +using KEY_TYPE_RANGE = nvbench::type_list; +using VALUE_TYPE_RANGE = nvbench::type_list; +#endif +using HASH_RANGE = nvbench::type_list, + cuco::xxhash_32, + cuco::xxhash_64, + cuco::murmurhash3_32>; //, // cuco::murmurhash3_x86_128, // cuco::murmurhash3_x64_128>; // TODO handle tuple-like hash value diff --git a/benchmarks/benchmark_utils.hpp b/benchmarks/benchmark_utils.hpp index 6a4aae833..e0acc0bb9 100644 --- a/benchmarks/benchmark_utils.hpp +++ b/benchmarks/benchmark_utils.hpp @@ -16,6 +16,7 @@ #pragma once +#include #include #include @@ -93,4 +94,6 @@ NVBENCH_DECLARE_TYPE_STRINGS(cuco::utility::distribution::gaussian, "GAUSSIAN", "distribution::gaussian"); +#if defined(CUCO_HAS_128BIT_ATOMICS) NVBENCH_DECLARE_TYPE_STRINGS(__int128_t, "I128", "__int128_t"); +#endif diff --git a/include/cuco/detail/__config b/include/cuco/detail/__config index 1cf231a05..83b8f146d 100644 --- a/include/cuco/detail/__config +++ b/include/cuco/detail/__config @@ -73,9 +73,12 @@ inline constexpr std::size_t max_key_size = #endif /// Maximum supported payload/mapped type size (in bytes) for open-addressing containers. +/// Tied to `max_key_size`: a slot stores at most a key plus an equally-sized payload. inline constexpr std::size_t max_payload_size = max_key_size; /// Maximum supported slot size (in bytes) for open-addressing containers. +/// Tied to `max_key_size`: a slot stores at most a key plus an equally-sized payload +/// (i.e., `sizeof(pair) <= 2 * max_key_size`). inline constexpr std::size_t max_slot_size = 2 * max_key_size; /// Checks if the given size is a valid mapped_type size for packed CAS operations. diff --git a/include/cuco/detail/bitwise_compare.cuh b/include/cuco/detail/bitwise_compare.cuh index 4c4afd411..8277a1079 100644 --- a/include/cuco/detail/bitwise_compare.cuh +++ b/include/cuco/detail/bitwise_compare.cuh @@ -16,6 +16,7 @@ #pragma once +#include #include #include @@ -63,15 +64,16 @@ struct bitwise_compare_impl<8> { } }; +#ifdef CUCO_HAS_INT128 template <> struct bitwise_compare_impl<16> { __host__ __device__ inline static bool compare(char const* lhs, char const* rhs) { - return *reinterpret_cast(lhs) == *reinterpret_cast(rhs) and - *reinterpret_cast(lhs + 8) == - *reinterpret_cast(rhs + 8); + return *reinterpret_cast(lhs) == + *reinterpret_cast(rhs); } }; +#endif /** * @brief Gives value to use as alignment for a type that is at least the diff --git a/include/cuco/detail/open_addressing/open_addressing_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_impl.cuh index 830d60ff2..c079ef7a6 100644 --- a/include/cuco/detail/open_addressing/open_addressing_impl.cuh +++ b/include/cuco/detail/open_addressing/open_addressing_impl.cuh @@ -69,12 +69,6 @@ template class open_addressing_impl { - static_assert(sizeof(Key) <= cuco::detail::max_key_size, - "Key size exceeds the maximum supported size (8 bytes, or 16 with sm_90+)."); - - static_assert(sizeof(Value) <= cuco::detail::max_slot_size, - "Slot size exceeds the maximum supported size (16 bytes, or 32 with sm_90+)."); - static_assert( cuco::is_bitwise_comparable_v, "Key type must have unique object representations or have been explicitly declared as safe for " diff --git a/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh index e19c62291..88fcd2fc0 100644 --- a/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh +++ b/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh @@ -92,6 +92,9 @@ class open_addressing_ref_impl { static_assert(sizeof(Key) <= cuco::detail::max_key_size, "Key size exceeds the maximum supported size (8 bytes, or 16 with sm_90+)."); + static_assert(sizeof(typename StorageRef::value_type) <= cuco::detail::max_slot_size, + "Slot size exceeds the maximum supported size (16 bytes, or 32 with sm_90+)."); + static_assert( cuco::is_bitwise_comparable_v, "Key type must have unique object representations or have been explicitly declared as safe for " @@ -1847,15 +1850,16 @@ class open_addressing_ref_impl { return packed_cas(address, expected, desired); } #endif - else { - static_assert( - has_payload, - "16-byte key types in key-only containers require sm_90+ for 128-bit atomic CAS."); + else if constexpr (has_payload) { #if (__CUDA_ARCH__ < 700) return cas_dependent_write(address, expected, desired); #else return back_to_back_cas(address, expected, desired); #endif + } else { + static_assert(cuco::dependent_false, + "No valid atomic CAS path: 16-byte key in a key-only container must be " + "packable (have unique object representations) and target sm_90+."); } } @@ -1889,11 +1893,12 @@ class open_addressing_ref_impl { return packed_cas(address, expected, desired); } #endif - else { - static_assert( - has_payload, - "16-byte key types in key-only containers require sm_90+ for 128-bit atomic CAS."); + else if constexpr (has_payload) { return cas_dependent_write(address, expected, desired); + } else { + static_assert(cuco::dependent_false, + "No valid atomic CAS path: 16-byte key in a key-only container must be " + "packable (have unique object representations) and target sm_90+."); } } diff --git a/include/cuco/detail/pair/pair.inl b/include/cuco/detail/pair/pair.inl index 5a1715125..359829db3 100644 --- a/include/cuco/detail/pair/pair.inl +++ b/include/cuco/detail/pair/pair.inl @@ -23,14 +23,14 @@ namespace cuco { template __host__ __device__ constexpr pair::pair(First const& f, Second const& s) - : first(f), second(s) + : first{f}, second{s} { } template template __host__ __device__ constexpr pair::pair(pair const& p) - : first(p.first), second(p.second) + : first{p.first}, second{p.second} { } diff --git a/include/cuco/detail/static_map/static_map.inl b/include/cuco/detail/static_map/static_map.inl index 1d6cbbd67..9b143d8bb 100644 --- a/include/cuco/detail/static_map/static_map.inl +++ b/include/cuco/detail/static_map/static_map.inl @@ -55,7 +55,7 @@ constexpr static_map Date: Tue, 21 Apr 2026 17:26:54 -0700 Subject: [PATCH 6/8] Unify static asserts in OA ref --- .../cuco/detail/open_addressing/open_addressing_impl.cuh | 5 ----- include/cuco/static_map.cuh | 6 ------ include/cuco/static_map_ref.cuh | 5 ----- include/cuco/static_multimap.cuh | 6 ------ include/cuco/static_multimap_ref.cuh | 5 ----- 5 files changed, 27 deletions(-) diff --git a/include/cuco/detail/open_addressing/open_addressing_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_impl.cuh index c079ef7a6..43ad9c9ef 100644 --- a/include/cuco/detail/open_addressing/open_addressing_impl.cuh +++ b/include/cuco/detail/open_addressing/open_addressing_impl.cuh @@ -69,11 +69,6 @@ template class open_addressing_impl { - static_assert( - cuco::is_bitwise_comparable_v, - "Key type must have unique object representations or have been explicitly declared as safe for " - "bitwise comparison via specialization of cuco::is_bitwise_comparable_v."); - static_assert(cuda::std::is_base_of_v, ProbingScheme>, "ProbingScheme must inherit from cuco::detail::probing_scheme_base"); diff --git a/include/cuco/static_map.cuh b/include/cuco/static_map.cuh index 45d85c9b1..46d679e76 100644 --- a/include/cuco/static_map.cuh +++ b/include/cuco/static_map.cuh @@ -95,12 +95,6 @@ template >, class Storage = cuco::storage<1>> class static_map { - static_assert(sizeof(Key) <= cuco::detail::max_key_size, - "Key size exceeds the maximum supported size (8 bytes, or 16 with sm_90+)."); - - static_assert(sizeof(T) <= cuco::detail::max_payload_size, - "Payload size exceeds the maximum supported size (8 bytes, or 16 with sm_90+)."); - static_assert(cuco::is_bitwise_comparable_v, "Mapped type must have unique object representations or have been explicitly " "declared as safe for bitwise comparison via specialization of " diff --git a/include/cuco/static_map_ref.cuh b/include/cuco/static_map_ref.cuh index be23b6dd4..187852ea6 100644 --- a/include/cuco/static_map_ref.cuh +++ b/include/cuco/static_map_ref.cuh @@ -78,11 +78,6 @@ class static_map_ref static_assert(cuco::detail::is_valid_mapped_size(sizeof(T)), "sizeof(mapped_type) must be 4 or 8 bytes (or 16 with sm_90+)."); - static_assert( - cuco::is_bitwise_comparable_v, - "Key type must have unique object representations or have been explicitly declared as safe for " - "bitwise comparison via specialization of cuco::is_bitwise_comparable_v."); - public: using key_type = Key; ///< Key type using mapped_type = T; ///< Mapped type diff --git a/include/cuco/static_multimap.cuh b/include/cuco/static_multimap.cuh index 957630a77..d47c8f0c7 100644 --- a/include/cuco/static_multimap.cuh +++ b/include/cuco/static_multimap.cuh @@ -91,12 +91,6 @@ template >, class Storage = cuco::storage<2>> class static_multimap { - static_assert(sizeof(Key) <= cuco::detail::max_key_size, - "Key size exceeds the maximum supported size (8 bytes, or 16 with sm_90+)."); - - static_assert(sizeof(T) <= cuco::detail::max_payload_size, - "Payload size exceeds the maximum supported size (8 bytes, or 16 with sm_90+)."); - static_assert(cuco::is_bitwise_comparable_v, "Mapped type must have unique object representations or have been explicitly " "declared as safe for bitwise comparison via specialization of " diff --git a/include/cuco/static_multimap_ref.cuh b/include/cuco/static_multimap_ref.cuh index 7dee8fb2b..b2b780795 100644 --- a/include/cuco/static_multimap_ref.cuh +++ b/include/cuco/static_multimap_ref.cuh @@ -75,11 +75,6 @@ class static_multimap_ref using impl_type = detail:: open_addressing_ref_impl; - static_assert( - cuco::is_bitwise_comparable_v, - "Key type must have unique object representations or have been explicitly declared as safe for " - "bitwise comparison via specialization of cuco::is_bitwise_comparable_v."); - public: using key_type = Key; ///< Key type using mapped_type = T; ///< Mapped type From d34ebbb73c84fa68830a84ee5bf3dd6068f16768 Mon Sep 17 00:00:00 2001 From: Daniel Juenger Date: Tue, 21 Apr 2026 17:43:00 -0700 Subject: [PATCH 7/8] More divers test combinations --- tests/static_map/contains_test.cu | 5 ++--- tests/static_map/duplicate_keys_test.cu | 5 ++--- tests/static_map/erase_test.cu | 5 ++--- tests/static_map/find_test.cu | 5 ++--- tests/static_map/for_each_test.cu | 5 ++--- tests/static_map/insert_and_find_test.cu | 5 ++--- 6 files changed, 12 insertions(+), 18 deletions(-) diff --git a/tests/static_map/contains_test.cu b/tests/static_map/contains_test.cu index 2cbe5372b..0b3604528 100644 --- a/tests/static_map/contains_test.cu +++ b/tests/static_map/contains_test.cu @@ -134,10 +134,9 @@ TEMPLATE_TEST_CASE_SIG( (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2) #if defined(CUCO_HAS_128BIT_ATOMICS) , - (__int128_t, __int128_t, cuco::test::probe_sequence::double_hashing, 1), (__int128_t, __int128_t, cuco::test::probe_sequence::double_hashing, 2), - (__int128_t, __int128_t, cuco::test::probe_sequence::linear_probing, 1), - (__int128_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2) + (__int128_t, int64_t, cuco::test::probe_sequence::double_hashing, 1), + (int32_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2) #endif ) { diff --git a/tests/static_map/duplicate_keys_test.cu b/tests/static_map/duplicate_keys_test.cu index 7e5deed94..9a3422f0b 100644 --- a/tests/static_map/duplicate_keys_test.cu +++ b/tests/static_map/duplicate_keys_test.cu @@ -56,10 +56,9 @@ TEMPLATE_TEST_CASE_SIG( (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2) #if defined(CUCO_HAS_128BIT_ATOMICS) , - (__int128_t, __int128_t, cuco::test::probe_sequence::double_hashing, 1), (__int128_t, __int128_t, cuco::test::probe_sequence::double_hashing, 2), - (__int128_t, __int128_t, cuco::test::probe_sequence::linear_probing, 1), - (__int128_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2) + (__int128_t, int64_t, cuco::test::probe_sequence::double_hashing, 1), + (int32_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2) #endif ) { diff --git a/tests/static_map/erase_test.cu b/tests/static_map/erase_test.cu index 278093d16..8bf09abe2 100644 --- a/tests/static_map/erase_test.cu +++ b/tests/static_map/erase_test.cu @@ -105,10 +105,9 @@ TEMPLATE_TEST_CASE_SIG( (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2) #if defined(CUCO_HAS_128BIT_ATOMICS) , - (__int128_t, __int128_t, cuco::test::probe_sequence::double_hashing, 1), (__int128_t, __int128_t, cuco::test::probe_sequence::double_hashing, 2), - (__int128_t, __int128_t, cuco::test::probe_sequence::linear_probing, 1), - (__int128_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2) + (__int128_t, int64_t, cuco::test::probe_sequence::double_hashing, 1), + (int32_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2) #endif ) { diff --git a/tests/static_map/find_test.cu b/tests/static_map/find_test.cu index 94d4b26fb..7e1289bb6 100644 --- a/tests/static_map/find_test.cu +++ b/tests/static_map/find_test.cu @@ -156,10 +156,9 @@ TEMPLATE_TEST_CASE_SIG( (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2) #if defined(CUCO_HAS_128BIT_ATOMICS) , - (__int128_t, __int128_t, cuco::test::probe_sequence::double_hashing, 1), (__int128_t, __int128_t, cuco::test::probe_sequence::double_hashing, 2), - (__int128_t, __int128_t, cuco::test::probe_sequence::linear_probing, 1), - (__int128_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2) + (__int128_t, int64_t, cuco::test::probe_sequence::double_hashing, 1), + (int32_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2) #endif ) { diff --git a/tests/static_map/for_each_test.cu b/tests/static_map/for_each_test.cu index 8832ffa84..591ad83b9 100644 --- a/tests/static_map/for_each_test.cu +++ b/tests/static_map/for_each_test.cu @@ -103,10 +103,9 @@ TEMPLATE_TEST_CASE_SIG( (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2) #if defined(CUCO_HAS_128BIT_ATOMICS) , - (__int128_t, __int128_t, cuco::test::probe_sequence::double_hashing, 1), (__int128_t, __int128_t, cuco::test::probe_sequence::double_hashing, 2), - (__int128_t, __int128_t, cuco::test::probe_sequence::linear_probing, 1), - (__int128_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2) + (__int128_t, int64_t, cuco::test::probe_sequence::double_hashing, 1), + (int32_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2) #endif ) { diff --git a/tests/static_map/insert_and_find_test.cu b/tests/static_map/insert_and_find_test.cu index 5b5dde17a..17665eb2b 100644 --- a/tests/static_map/insert_and_find_test.cu +++ b/tests/static_map/insert_and_find_test.cu @@ -55,10 +55,9 @@ TEMPLATE_TEST_CASE_SIG( (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2) #if defined(CUCO_HAS_128BIT_ATOMICS) , - (__int128_t, __int128_t, cuco::test::probe_sequence::double_hashing, 1), (__int128_t, __int128_t, cuco::test::probe_sequence::double_hashing, 2), - (__int128_t, __int128_t, cuco::test::probe_sequence::linear_probing, 1), - (__int128_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2) + (__int128_t, int64_t, cuco::test::probe_sequence::double_hashing, 1), + (int32_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2) #endif ) { From 92a8d3dc50aa937f0382f50212dc2d5898f1fc1d Mon Sep 17 00:00:00 2001 From: Daniel Juenger Date: Thu, 23 Apr 2026 14:31:51 -0700 Subject: [PATCH 8/8] Enable sm_90 code path in CI --- ci/matrix.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/matrix.yml b/ci/matrix.yml index 018638e1a..c24c10212 100644 --- a/ci/matrix.yml +++ b/ci/matrix.yml @@ -37,7 +37,7 @@ devcontainer_version: '26.04' pull_request: nvcc: - {cuda: *cuda_oldest, os: 'ubuntu22.04', cpu: 'amd64', compiler: {name: 'gcc', version: '11', exe: 'g++'}, gpu_build_archs: '70,80', std: [17], jobs: ['build', 'test']} - - {cuda: *cuda_newest, os: 'ubuntu24.04', cpu: 'amd64', compiler: {name: 'gcc', version: '14', exe: 'g++'}, gpu_build_archs: '80,90,100', std: [17], jobs: ['build', 'test']} + - {cuda: *cuda_newest, os: 'ubuntu24.04', cpu: 'amd64', compiler: {name: 'gcc', version: '14', exe: 'g++'}, gpu_build_archs: '90,100', std: [17], jobs: ['build', 'test']} - {cuda: *cuda_newest, os: 'ubuntu24.04', cpu: 'arm64', compiler: {name: 'gcc', version: '14', exe: 'g++'}, gpu_build_archs: '80,90,100', std: [17], jobs: ['build']} - {cuda: *cuda_oldest, os: 'ubuntu20.04', cpu: 'amd64', compiler: {name: 'llvm', version: '14', exe: 'clang++'}, gpu_build_archs: '70', std: [17], jobs: ['build']} - {cuda: *cuda_newest, os: 'ubuntu24.04', cpu: 'amd64', compiler: {name: 'llvm', version: '21', exe: 'clang++'}, gpu_build_archs: '100', std: [17], jobs: ['build']}