diff --git a/CMakeLists.txt b/CMakeLists.txt index 884df20f6..483ee5ba8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -16,6 +16,7 @@ cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR) set(rapids-cmake-version 26.04) +set(rapids-cmake-branch "release/${rapids-cmake-version}") if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/CUCO_RAPIDS.cmake) file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/release/${rapids-cmake-version}/RAPIDS.cmake @@ -74,7 +75,7 @@ rapids_find_package( ################################################################################################### # - find packages we depend on -------------------------------------------------------------------- -rapids_cpm_init() +rapids_cpm_init(OVERRIDE "${CMAKE_CURRENT_SOURCE_DIR}/cmake/cccl_override.json") include(cmake/thirdparty/get_cccl.cmake) diff --git a/benchmarks/benchmark_defaults.hpp b/benchmarks/benchmark_defaults.hpp index 2fc4439f4..ac7462fbc 100644 --- a/benchmarks/benchmark_defaults.hpp +++ b/benchmarks/benchmark_defaults.hpp @@ -16,6 +16,7 @@ #pragma once +#include #include #include @@ -25,12 +26,17 @@ namespace cuco::benchmark::defaults { +#if defined(CUCO_HAS_128BIT_ATOMICS) +using KEY_TYPE_RANGE = nvbench::type_list; +using VALUE_TYPE_RANGE = nvbench::type_list; +#else using KEY_TYPE_RANGE = nvbench::type_list; using VALUE_TYPE_RANGE = nvbench::type_list; -using HASH_RANGE = nvbench::type_list, - cuco::xxhash_32, - cuco::xxhash_64, - cuco::murmurhash3_32>; //, +#endif +using HASH_RANGE = nvbench::type_list, + cuco::xxhash_32, + cuco::xxhash_64, + cuco::murmurhash3_32>; //, // cuco::murmurhash3_x86_128, // cuco::murmurhash3_x64_128>; // TODO handle tuple-like hash value diff --git a/benchmarks/benchmark_utils.hpp b/benchmarks/benchmark_utils.hpp index a9bd690d7..e0acc0bb9 100644 --- a/benchmarks/benchmark_utils.hpp +++ b/benchmarks/benchmark_utils.hpp @@ -16,6 +16,7 @@ #pragma once +#include #include #include @@ -92,3 +93,7 @@ NVBENCH_DECLARE_TYPE_STRINGS(cuco::utility::distribution::uniform, NVBENCH_DECLARE_TYPE_STRINGS(cuco::utility::distribution::gaussian, "GAUSSIAN", "distribution::gaussian"); + +#if defined(CUCO_HAS_128BIT_ATOMICS) +NVBENCH_DECLARE_TYPE_STRINGS(__int128_t, "I128", "__int128_t"); +#endif diff --git a/ci/matrix.yml b/ci/matrix.yml index 018638e1a..c24c10212 100644 --- a/ci/matrix.yml +++ b/ci/matrix.yml @@ -37,7 +37,7 @@ devcontainer_version: '26.04' pull_request: nvcc: - {cuda: *cuda_oldest, os: 'ubuntu22.04', cpu: 'amd64', compiler: {name: 'gcc', version: '11', exe: 'g++'}, gpu_build_archs: '70,80', std: [17], jobs: ['build', 'test']} - - {cuda: *cuda_newest, os: 'ubuntu24.04', cpu: 'amd64', compiler: {name: 'gcc', version: '14', exe: 'g++'}, gpu_build_archs: '80,90,100', std: [17], jobs: ['build', 'test']} + - {cuda: *cuda_newest, os: 'ubuntu24.04', cpu: 'amd64', compiler: {name: 'gcc', version: '14', exe: 'g++'}, gpu_build_archs: '90,100', std: [17], jobs: ['build', 'test']} - {cuda: *cuda_newest, os: 'ubuntu24.04', cpu: 'arm64', compiler: {name: 'gcc', version: '14', exe: 'g++'}, gpu_build_archs: '80,90,100', std: [17], jobs: ['build']} - {cuda: *cuda_oldest, os: 'ubuntu20.04', cpu: 'amd64', compiler: {name: 'llvm', version: '14', exe: 'clang++'}, gpu_build_archs: '70', std: [17], jobs: ['build']} - {cuda: *cuda_newest, os: 'ubuntu24.04', cpu: 'amd64', compiler: {name: 'llvm', version: '21', exe: 'clang++'}, gpu_build_archs: '100', std: [17], jobs: ['build']} diff --git a/cmake/cccl_override.json b/cmake/cccl_override.json new file mode 100644 index 000000000..32dc723ca --- /dev/null +++ b/cmake/cccl_override.json @@ -0,0 +1,17 @@ +{ + "packages": { + "CCCL": { + "version": "3.3.0", + "git_url": "https://github.com/NVIDIA/cccl.git", + "git_tag": "09094af138841ef521de1adbbdd18ab8b3dad47b", + "git_shallow": false, + "patches": [ + { + "file": "${current_json_dir}/patches/cccl_fix_128bit_cas.patch", + "issue": "Fix 128-bit atomic CAS operand indices [https://github.com/NVIDIA/cccl/issues/8402]", + "fixed_in": "3.3.2" + } + ] + } + } +} diff --git a/cmake/patches/cccl_fix_128bit_cas.patch b/cmake/patches/cccl_fix_128bit_cas.patch new file mode 100644 index 000000000..fc17651e4 --- /dev/null +++ b/cmake/patches/cccl_fix_128bit_cas.patch @@ -0,0 +1,238 @@ +From 1898944000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Daniel Juenger +Date: Mon, 14 Apr 2026 00:00:00 +0000 +Subject: [PATCH] Fix codegen in 128bit atomic CAS (#8403) + +Fix wrong inline asm operand indices in all atom.cas.*.b128 variants. +See https://github.com/NVIDIA/cccl/issues/8402 +--- + .../cuda/std/__atomic/functions/cuda_ptx_generated.h | 80 ++++++++++---------- + 1 file changed, 40 insertions(+), 40 deletions(-) + +diff --git a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated.h b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated.h +index f3e30d53039..479815f4136 100644 +--- a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated.h ++++ b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated.h +@@ -1585,8 +1585,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + { + .reg .b128 _d; + .reg .b128 _v; +- mov.b128 _d, {%0, %1}; +- mov.b128 _v, {%4, %5}; ++ mov.b128 _d, {%3, %4}; ++ mov.b128 _v, {%5, %6}; + atom.cas.acquire.cta.b128 _d,[%2],_d,_v; + mov.b128 {%0, %1}, _d; + } +@@ -1604,8 +1604,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + { + .reg .b128 _d; + .reg .b128 _v; +- mov.b128 _d, {%0, %1}; +- mov.b128 _v, {%4, %5}; ++ mov.b128 _d, {%3, %4}; ++ mov.b128 _v, {%5, %6}; + atom.cas.acquire.cluster.b128 _d,[%2],_d,_v; + mov.b128 {%0, %1}, _d; + } +@@ -1623,8 +1623,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + { + .reg .b128 _d; + .reg .b128 _v; +- mov.b128 _d, {%0, %1}; +- mov.b128 _v, {%4, %5}; ++ mov.b128 _d, {%3, %4}; ++ mov.b128 _v, {%5, %6}; + atom.cas.acquire.gpu.b128 _d,[%2],_d,_v; + mov.b128 {%0, %1}, _d; + } +@@ -1642,8 +1642,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + { + .reg .b128 _d; + .reg .b128 _v; +- mov.b128 _d, {%0, %1}; +- mov.b128 _v, {%4, %5}; ++ mov.b128 _d, {%3, %4}; ++ mov.b128 _v, {%5, %6}; + atom.cas.acquire.sys.b128 _d,[%2],_d,_v; + mov.b128 {%0, %1}, _d; + } +@@ -1661,8 +1661,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + { + .reg .b128 _d; + .reg .b128 _v; +- mov.b128 _d, {%0, %1}; +- mov.b128 _v, {%4, %5}; ++ mov.b128 _d, {%3, %4}; ++ mov.b128 _v, {%5, %6}; + atom.cas.relaxed.cta.b128 _d,[%2],_d,_v; + mov.b128 {%0, %1}, _d; + } +@@ -1680,8 +1680,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + { + .reg .b128 _d; + .reg .b128 _v; +- mov.b128 _d, {%0, %1}; +- mov.b128 _v, {%4, %5}; ++ mov.b128 _d, {%3, %4}; ++ mov.b128 _v, {%5, %6}; + atom.cas.relaxed.cluster.b128 _d,[%2],_d,_v; + mov.b128 {%0, %1}, _d; + } +@@ -1699,8 +1699,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + { + .reg .b128 _d; + .reg .b128 _v; +- mov.b128 _d, {%0, %1}; +- mov.b128 _v, {%4, %5}; ++ mov.b128 _d, {%3, %4}; ++ mov.b128 _v, {%5, %6}; + atom.cas.relaxed.gpu.b128 _d,[%2],_d,_v; + mov.b128 {%0, %1}, _d; + } +@@ -1718,8 +1718,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + { + .reg .b128 _d; + .reg .b128 _v; +- mov.b128 _d, {%0, %1}; +- mov.b128 _v, {%4, %5}; ++ mov.b128 _d, {%3, %4}; ++ mov.b128 _v, {%5, %6}; + atom.cas.relaxed.sys.b128 _d,[%2],_d,_v; + mov.b128 {%0, %1}, _d; + } +@@ -1737,8 +1737,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + { + .reg .b128 _d; + .reg .b128 _v; +- mov.b128 _d, {%0, %1}; +- mov.b128 _v, {%4, %5}; ++ mov.b128 _d, {%3, %4}; ++ mov.b128 _v, {%5, %6}; + atom.cas.release.cta.b128 _d,[%2],_d,_v; + mov.b128 {%0, %1}, _d; + } +@@ -1756,8 +1756,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + { + .reg .b128 _d; + .reg .b128 _v; +- mov.b128 _d, {%0, %1}; +- mov.b128 _v, {%4, %5}; ++ mov.b128 _d, {%3, %4}; ++ mov.b128 _v, {%5, %6}; + atom.cas.release.cluster.b128 _d,[%2],_d,_v; + mov.b128 {%0, %1}, _d; + } +@@ -1775,8 +1775,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + { + .reg .b128 _d; + .reg .b128 _v; +- mov.b128 _d, {%0, %1}; +- mov.b128 _v, {%4, %5}; ++ mov.b128 _d, {%3, %4}; ++ mov.b128 _v, {%5, %6}; + atom.cas.release.gpu.b128 _d,[%2],_d,_v; + mov.b128 {%0, %1}, _d; + } +@@ -1794,8 +1794,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + { + .reg .b128 _d; + .reg .b128 _v; +- mov.b128 _d, {%0, %1}; +- mov.b128 _v, {%4, %5}; ++ mov.b128 _d, {%3, %4}; ++ mov.b128 _v, {%5, %6}; + atom.cas.release.sys.b128 _d,[%2],_d,_v; + mov.b128 {%0, %1}, _d; + } +@@ -1813,8 +1813,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + { + .reg .b128 _d; + .reg .b128 _v; +- mov.b128 _d, {%0, %1}; +- mov.b128 _v, {%4, %5}; ++ mov.b128 _d, {%3, %4}; ++ mov.b128 _v, {%5, %6}; + atom.cas.acq_rel.cta.b128 _d,[%2],_d,_v; + mov.b128 {%0, %1}, _d; + } +@@ -1832,8 +1832,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + { + .reg .b128 _d; + .reg .b128 _v; +- mov.b128 _d, {%0, %1}; +- mov.b128 _v, {%4, %5}; ++ mov.b128 _d, {%3, %4}; ++ mov.b128 _v, {%5, %6}; + atom.cas.acq_rel.cluster.b128 _d,[%2],_d,_v; + mov.b128 {%0, %1}, _d; + } +@@ -1851,8 +1851,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + { + .reg .b128 _d; + .reg .b128 _v; +- mov.b128 _d, {%0, %1}; +- mov.b128 _v, {%4, %5}; ++ mov.b128 _d, {%3, %4}; ++ mov.b128 _v, {%5, %6}; + atom.cas.acq_rel.gpu.b128 _d,[%2],_d,_v; + mov.b128 {%0, %1}, _d; + } +@@ -1870,8 +1870,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + { + .reg .b128 _d; + .reg .b128 _v; +- mov.b128 _d, {%0, %1}; +- mov.b128 _v, {%4, %5}; ++ mov.b128 _d, {%3, %4}; ++ mov.b128 _v, {%5, %6}; + atom.cas.acq_rel.sys.b128 _d,[%2],_d,_v; + mov.b128 {%0, %1}, _d; + } +@@ -1889,8 +1889,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + { + .reg .b128 _d; + .reg .b128 _v; +- mov.b128 _d, {%0, %1}; +- mov.b128 _v, {%4, %5}; ++ mov.b128 _d, {%3, %4}; ++ mov.b128 _v, {%5, %6}; + atom.cas.cta.b128 _d,[%2],_d,_v; + mov.b128 {%0, %1}, _d; + } +@@ -1908,8 +1908,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + { + .reg .b128 _d; + .reg .b128 _v; +- mov.b128 _d, {%0, %1}; +- mov.b128 _v, {%4, %5}; ++ mov.b128 _d, {%3, %4}; ++ mov.b128 _v, {%5, %6}; + atom.cas.cluster.b128 _d,[%2],_d,_v; + mov.b128 {%0, %1}, _d; + } +@@ -1927,8 +1927,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + { + .reg .b128 _d; + .reg .b128 _v; +- mov.b128 _d, {%0, %1}; +- mov.b128 _v, {%4, %5}; ++ mov.b128 _d, {%3, %4}; ++ mov.b128 _v, {%5, %6}; + atom.cas.gpu.b128 _d,[%2],_d,_v; + mov.b128 {%0, %1}, _d; + } +@@ -1946,8 +1946,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange( + { + .reg .b128 _d; + .reg .b128 _v; +- mov.b128 _d, {%0, %1}; +- mov.b128 _v, {%4, %5}; ++ mov.b128 _d, {%3, %4}; ++ mov.b128 _v, {%5, %6}; + atom.cas.sys.b128 _d,[%2],_d,_v; + mov.b128 {%0, %1}, _d; + } +-- +2.45.2 + diff --git a/include/cuco/detail/__config b/include/cuco/detail/__config index c2cc9066a..83b8f146d 100644 --- a/include/cuco/detail/__config +++ b/include/cuco/detail/__config @@ -19,6 +19,8 @@ #include #include +#include + #if !defined(__CUDACC_VER_MAJOR__) || !defined(__CUDACC_VER_MINOR__) #error "NVCC version not found" #elif __CUDACC_VER_MAJOR__ < 12 @@ -52,6 +54,41 @@ #define CUCO_HAS_INT128 #endif +#if defined(CUCO_HAS_INT128) && (CUCO_CUDA_MINIMUM_ARCH >= 900) +#define CUCO_HAS_128BIT_ATOMICS +#endif + #if defined(CUDART_VERSION) && (CUDART_VERSION >= 12000) #define CUCO_HAS_CG_REDUCE_UPDATE_ASYNC #endif + +namespace cuco::detail { + +/// Maximum supported key size (in bytes) for open-addressing containers. +inline constexpr std::size_t max_key_size = +#if defined(CUCO_HAS_128BIT_ATOMICS) + 16; +#else + 8; +#endif + +/// Maximum supported payload/mapped type size (in bytes) for open-addressing containers. +/// Tied to `max_key_size`: a slot stores at most a key plus an equally-sized payload. +inline constexpr std::size_t max_payload_size = max_key_size; + +/// Maximum supported slot size (in bytes) for open-addressing containers. +/// Tied to `max_key_size`: a slot stores at most a key plus an equally-sized payload +/// (i.e., `sizeof(pair) <= 2 * max_key_size`). +inline constexpr std::size_t max_slot_size = 2 * max_key_size; + +/// Checks if the given size is a valid mapped_type size for packed CAS operations. +inline constexpr bool is_valid_mapped_size(std::size_t n) +{ + return n == 4 or n == 8 +#if defined(CUCO_HAS_128BIT_ATOMICS) + or n == 16 +#endif + ; +} + +} // namespace cuco::detail diff --git a/include/cuco/detail/bitwise_compare.cuh b/include/cuco/detail/bitwise_compare.cuh index 1b9e80f4d..8277a1079 100644 --- a/include/cuco/detail/bitwise_compare.cuh +++ b/include/cuco/detail/bitwise_compare.cuh @@ -16,6 +16,7 @@ #pragma once +#include #include #include @@ -63,6 +64,17 @@ struct bitwise_compare_impl<8> { } }; +#ifdef CUCO_HAS_INT128 +template <> +struct bitwise_compare_impl<16> { + __host__ __device__ inline static bool compare(char const* lhs, char const* rhs) + { + return *reinterpret_cast(lhs) == + *reinterpret_cast(rhs); + } +}; +#endif + /** * @brief Gives value to use as alignment for a type that is at least the * size of type, or 16, whichever is smaller. diff --git a/include/cuco/detail/open_addressing/open_addressing_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_impl.cuh index 55d423e97..43ad9c9ef 100644 --- a/include/cuco/detail/open_addressing/open_addressing_impl.cuh +++ b/include/cuco/detail/open_addressing/open_addressing_impl.cuh @@ -69,15 +69,6 @@ template class open_addressing_impl { - static_assert(sizeof(Key) <= 8, "Container does not support key types larger than 8 bytes."); - - static_assert(sizeof(Value) <= 16, "Container does not support slot types larger than 16 bytes."); - - static_assert( - cuco::is_bitwise_comparable_v, - "Key type must have unique object representations or have been explicitly declared as safe for " - "bitwise comparison via specialization of cuco::is_bitwise_comparable_v."); - static_assert(cuda::std::is_base_of_v, ProbingScheme>, "ProbingScheme must inherit from cuco::detail::probing_scheme_base"); diff --git a/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh index be3612d13..c352b2a1d 100644 --- a/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh +++ b/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh @@ -90,7 +90,11 @@ template class open_addressing_ref_impl { - static_assert(sizeof(Key) <= 8, "Container does not support key types larger than 8 bytes."); + static_assert(sizeof(Key) <= cuco::detail::max_key_size, + "Key size exceeds the maximum supported size (8 bytes, or 16 with sm_90+)."); + + static_assert(sizeof(typename StorageRef::value_type) <= cuco::detail::max_slot_size, + "Slot size exceeds the maximum supported size (16 bytes, or 32 with sm_90+)."); static_assert( cuco::is_bitwise_comparable_v, @@ -524,9 +528,9 @@ class open_addressing_ref_impl { #if __CUDA_ARCH__ < 700 // Spinning to ensure that the write to the value part took place requires // independent thread scheduling introduced with the Volta architecture. - static_assert( - cuco::detail::is_packable(), - "insert_and_find is not supported for pair types larger than 8 bytes on pre-Volta GPUs."); + static_assert(sizeof(value_type) <= 8, + "insert_and_find is not supported for slot types larger than 8 bytes on " + "pre-Volta GPUs."); #endif auto const val = this->heterogeneous_value(value); @@ -545,26 +549,17 @@ class open_addressing_ref_impl { // If the key is already in the container, return false if (eq_res == detail::equal_result::EQUAL) { - if constexpr (has_payload) { - // wait to ensure that the write to the value part also took place - this->wait_for_payload(slot_ptr->second, this->empty_value_sentinel()); - } + this->maybe_wait_for_payload(slot_ptr); return {iterator{slot_ptr}, false}; } if (eq_res == detail::equal_result::AVAILABLE) { switch (this->attempt_insert_stable(slot_ptr, bucket_slots[i], val)) { case insert_result::SUCCESS: { - if constexpr (has_payload) { - // wait to ensure that the write to the value part also took place - this->wait_for_payload(slot_ptr->second, this->empty_value_sentinel()); - } + this->maybe_wait_for_payload(slot_ptr); return {iterator{slot_ptr}, true}; } case insert_result::DUPLICATE: { - if constexpr (has_payload) { - // wait to ensure that the write to the value part also took place - this->wait_for_payload(slot_ptr->second, this->empty_value_sentinel()); - } + this->maybe_wait_for_payload(slot_ptr); return {iterator{slot_ptr}, false}; } default: continue; @@ -599,9 +594,9 @@ class open_addressing_ref_impl { #if __CUDA_ARCH__ < 700 // Spinning to ensure that the write to the value part took place requires // independent thread scheduling introduced with the Volta architecture. - static_assert( - cuco::detail::is_packable(), - "insert_and_find is not supported for pair types larger than 8 bytes on pre-Volta GPUs."); + static_assert(sizeof(value_type) <= 8, + "insert_and_find is not supported for slot types larger than 8 bytes on " + "pre-Volta GPUs."); #endif auto const val = this->heterogeneous_value(value); @@ -632,12 +627,7 @@ class open_addressing_ref_impl { if (group_finds_equal) { auto const src_lane = __ffs(group_finds_equal) - 1; auto const res = group.shfl(reinterpret_cast(slot_ptr), src_lane); - if (group.thread_rank() == src_lane) { - if constexpr (has_payload) { - // wait to ensure that the write to the value part also took place - this->wait_for_payload(slot_ptr->second, this->empty_value_sentinel()); - } - } + if (group.thread_rank() == src_lane) { this->maybe_wait_for_payload(slot_ptr); } group.sync(); return {iterator{reinterpret_cast(res)}, false}; } @@ -653,22 +643,12 @@ class open_addressing_ref_impl { switch (group.shfl(status, src_lane)) { case insert_result::SUCCESS: { - if (group.thread_rank() == src_lane) { - if constexpr (has_payload) { - // wait to ensure that the write to the value part also took place - this->wait_for_payload(slot_ptr->second, this->empty_value_sentinel()); - } - } + if (group.thread_rank() == src_lane) { this->maybe_wait_for_payload(slot_ptr); } group.sync(); return {iterator{reinterpret_cast(res)}, true}; } case insert_result::DUPLICATE: { - if (group.thread_rank() == src_lane) { - if constexpr (has_payload) { - // wait to ensure that the write to the value part also took place - this->wait_for_payload(slot_ptr->second, this->empty_value_sentinel()); - } - } + if (group.thread_rank() == src_lane) { this->maybe_wait_for_payload(slot_ptr); } group.sync(); return {iterator{reinterpret_cast(res)}, false}; } @@ -1730,8 +1710,7 @@ class open_addressing_ref_impl { value_type expected, Value desired) noexcept { - using packed_type = - cuda::std::conditional_t; + using packed_type = cuco::detail::packed_t; auto* slot_ptr = reinterpret_cast(address); auto* expected_ptr = reinterpret_cast(&expected); @@ -1866,12 +1845,22 @@ class open_addressing_ref_impl { { if constexpr (sizeof(value_type) <= 8) { return packed_cas(address, expected, desired); - } else { + } +#if (__CUDA_ARCH__ >= 900) + else if constexpr (cuco::detail::is_packable()) { + return packed_cas(address, expected, desired); + } +#endif + else if constexpr (has_payload) { #if (__CUDA_ARCH__ < 700) return cas_dependent_write(address, expected, desired); #else return back_to_back_cas(address, expected, desired); #endif + } else { + static_assert(cuco::dependent_false, + "No valid atomic CAS path: 16-byte key in a key-only container must be " + "packable (have unique object representations) and target sm_90+."); } } @@ -1899,8 +1888,18 @@ class open_addressing_ref_impl { { if constexpr (sizeof(value_type) <= 8) { return packed_cas(address, expected, desired); - } else { + } +#if (__CUDA_ARCH__ >= 900) + else if constexpr (cuco::detail::is_packable()) { + return packed_cas(address, expected, desired); + } +#endif + else if constexpr (has_payload) { return cas_dependent_write(address, expected, desired); + } else { + static_assert(cuco::dependent_false, + "No valid atomic CAS path: 16-byte key in a key-only container must be " + "packable (have unique object representations) and target sm_90+."); } } @@ -1926,6 +1925,34 @@ class open_addressing_ref_impl { } while (cuco::detail::bitwise_compare(current, sentinel)); } + /** + * @brief Conditionally spin-waits for the payload of a non-atomically inserted slot to become + * visible. + * + * For containers where the key and value are inserted by separate instructions + * (`cas_dependent_write` / `back_to_back_cas`), an observer thread may see the key before the + * payload. This helper spins until the payload is visible. For atomic single-CAS paths (slot + * size <= 8 bytes, or a packable slot on sm_90+ via `atom.cas.b128`), the payload is already + * visible and this is a no-op. + * + * @tparam SlotPtr Pointer-like type to a slot holding a `.second` payload member + * + * @param slot_ptr Pointer to the slot whose payload may need waiting on + */ + template + __device__ void maybe_wait_for_payload(SlotPtr slot_ptr) noexcept + { + if constexpr (has_payload and sizeof(value_type) > 8) { +#if (__CUDA_ARCH__ >= 900) + if constexpr (not cuco::detail::is_packable()) { + this->wait_for_payload(slot_ptr->second, this->empty_value_sentinel()); + } +#else + this->wait_for_payload(slot_ptr->second, this->empty_value_sentinel()); +#endif + } + } + // TODO: Clean up the sentinel handling since it's duplicated in ref and equal wrapper value_type empty_slot_sentinel_; ///< Sentinel value indicating an empty slot detail::equal_wrapper diff --git a/include/cuco/detail/pair/helpers.cuh b/include/cuco/detail/pair/helpers.cuh index 063e1e783..c21cbdb86 100644 --- a/include/cuco/detail/pair/helpers.cuh +++ b/include/cuco/detail/pair/helpers.cuh @@ -15,6 +15,8 @@ #pragma once +#include + #include #include #include @@ -61,6 +63,17 @@ struct packed { using type = uint32_t; ///< Packed type as `uint32_t` if the size of the object is 4 }; +#ifdef CUCO_HAS_INT128 +/** + * @brief Denotes the packed type when the size of the object is 16. + */ +template <> +struct packed<16> { + using type = unsigned __int128; ///< Packed type as `unsigned __int128` if the size of the object + ///< is 16 +}; +#endif + template using packed_t = typename packed::type; diff --git a/include/cuco/static_map.cuh b/include/cuco/static_map.cuh index b70fa593a..46d679e76 100644 --- a/include/cuco/static_map.cuh +++ b/include/cuco/static_map.cuh @@ -95,10 +95,6 @@ template >, class Storage = cuco::storage<1>> class static_map { - static_assert(sizeof(Key) <= 8, "Container does not support key types larger than 8 bytes."); - - static_assert(sizeof(T) <= 8, "Container does not support payload types larger than 8 bytes."); - static_assert(cuco::is_bitwise_comparable_v, "Mapped type must have unique object representations or have been explicitly " "declared as safe for bitwise comparison via specialization of " diff --git a/include/cuco/static_map_ref.cuh b/include/cuco/static_map_ref.cuh index 24525c3bb..187852ea6 100644 --- a/include/cuco/static_map_ref.cuh +++ b/include/cuco/static_map_ref.cuh @@ -75,13 +75,8 @@ class static_map_ref using impl_type = detail:: open_addressing_ref_impl; - static_assert(sizeof(T) == 4 or sizeof(T) == 8, - "sizeof(mapped_type) must be either 4 bytes or 8 bytes."); - - static_assert( - cuco::is_bitwise_comparable_v, - "Key type must have unique object representations or have been explicitly declared as safe for " - "bitwise comparison via specialization of cuco::is_bitwise_comparable_v."); + static_assert(cuco::detail::is_valid_mapped_size(sizeof(T)), + "sizeof(mapped_type) must be 4 or 8 bytes (or 16 with sm_90+)."); public: using key_type = Key; ///< Key type diff --git a/include/cuco/static_multimap.cuh b/include/cuco/static_multimap.cuh index aee2c24bd..d47c8f0c7 100644 --- a/include/cuco/static_multimap.cuh +++ b/include/cuco/static_multimap.cuh @@ -91,10 +91,6 @@ template >, class Storage = cuco::storage<2>> class static_multimap { - static_assert(sizeof(Key) <= 8, "Container does not support key types larger than 8 bytes."); - - static_assert(sizeof(T) <= 8, "Container does not support payload types larger than 8 bytes."); - static_assert(cuco::is_bitwise_comparable_v, "Mapped type must have unique object representations or have been explicitly " "declared as safe for bitwise comparison via specialization of " diff --git a/include/cuco/static_multimap_ref.cuh b/include/cuco/static_multimap_ref.cuh index 7dee8fb2b..b2b780795 100644 --- a/include/cuco/static_multimap_ref.cuh +++ b/include/cuco/static_multimap_ref.cuh @@ -75,11 +75,6 @@ class static_multimap_ref using impl_type = detail:: open_addressing_ref_impl; - static_assert( - cuco::is_bitwise_comparable_v, - "Key type must have unique object representations or have been explicitly declared as safe for " - "bitwise comparison via specialization of cuco::is_bitwise_comparable_v."); - public: using key_type = Key; ///< Key type using mapped_type = T; ///< Mapped type diff --git a/tests/static_map/contains_test.cu b/tests/static_map/contains_test.cu index e53218933..0b3604528 100644 --- a/tests/static_map/contains_test.cu +++ b/tests/static_map/contains_test.cu @@ -16,6 +16,7 @@ #include +#include #include #include @@ -130,7 +131,14 @@ TEMPLATE_TEST_CASE_SIG( (int32_t, int32_t, cuco::test::probe_sequence::linear_probing, 1), (int32_t, int32_t, cuco::test::probe_sequence::linear_probing, 2), (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 1), - (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2)) + (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (__int128_t, __int128_t, cuco::test::probe_sequence::double_hashing, 2), + (__int128_t, int64_t, cuco::test::probe_sequence::double_hashing, 1), + (int32_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2) +#endif +) { constexpr size_type num_keys{400}; diff --git a/tests/static_map/duplicate_keys_test.cu b/tests/static_map/duplicate_keys_test.cu index 397af6a4e..9a3422f0b 100644 --- a/tests/static_map/duplicate_keys_test.cu +++ b/tests/static_map/duplicate_keys_test.cu @@ -16,6 +16,7 @@ #include +#include #include #include @@ -52,7 +53,14 @@ TEMPLATE_TEST_CASE_SIG( (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 1), (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 1), (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 2), - (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2)) + (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (__int128_t, __int128_t, cuco::test::probe_sequence::double_hashing, 2), + (__int128_t, int64_t, cuco::test::probe_sequence::double_hashing, 1), + (int32_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2) +#endif +) { constexpr size_type num_keys{500'000}; diff --git a/tests/static_map/erase_test.cu b/tests/static_map/erase_test.cu index 518b4fe14..8bf09abe2 100644 --- a/tests/static_map/erase_test.cu +++ b/tests/static_map/erase_test.cu @@ -16,6 +16,7 @@ #include +#include #include #include @@ -101,7 +102,14 @@ TEMPLATE_TEST_CASE_SIG( (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 1), (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 1), (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 2), - (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2)) + (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (__int128_t, __int128_t, cuco::test::probe_sequence::double_hashing, 2), + (__int128_t, int64_t, cuco::test::probe_sequence::double_hashing, 1), + (int32_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2) +#endif +) { constexpr size_type num_keys{1'000'000}; diff --git a/tests/static_map/find_test.cu b/tests/static_map/find_test.cu index d26c658aa..7e1289bb6 100644 --- a/tests/static_map/find_test.cu +++ b/tests/static_map/find_test.cu @@ -16,6 +16,7 @@ #include +#include #include #include @@ -152,7 +153,14 @@ TEMPLATE_TEST_CASE_SIG( (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 1), (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 1), (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 2), - (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2)) + (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (__int128_t, __int128_t, cuco::test::probe_sequence::double_hashing, 2), + (__int128_t, int64_t, cuco::test::probe_sequence::double_hashing, 1), + (int32_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2) +#endif +) { constexpr size_type num_keys{301}; diff --git a/tests/static_map/for_each_test.cu b/tests/static_map/for_each_test.cu index 3ce9d9f59..591ad83b9 100644 --- a/tests/static_map/for_each_test.cu +++ b/tests/static_map/for_each_test.cu @@ -16,6 +16,7 @@ #include +#include #include #include @@ -99,7 +100,14 @@ TEMPLATE_TEST_CASE_SIG( (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 1), (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 1), (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 2), - (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2)) + (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (__int128_t, __int128_t, cuco::test::probe_sequence::double_hashing, 2), + (__int128_t, int64_t, cuco::test::probe_sequence::double_hashing, 1), + (int32_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2) +#endif +) { constexpr size_type num_keys{100}; using probe = std::conditional_t< diff --git a/tests/static_map/hash_test.cu b/tests/static_map/hash_test.cu index 8acf90b9c..b4f529418 100644 --- a/tests/static_map/hash_test.cu +++ b/tests/static_map/hash_test.cu @@ -16,6 +16,7 @@ #include +#include #include #include @@ -62,7 +63,16 @@ void test_hash_function() REQUIRE(cuco::test::all_of(d_keys_exist.begin(), d_keys_exist.end(), cuda::std::identity{})); } -TEMPLATE_TEST_CASE_SIG("static_map hash tests", "", ((typename Key)), (int32_t), (int64_t)) +TEMPLATE_TEST_CASE_SIG("static_map hash tests", + "", + ((typename Key)), + (int32_t), + (int64_t) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (__int128_t) +#endif +) { test_hash_function>(); test_hash_function>(); diff --git a/tests/static_map/heterogeneous_lookup_test.cu b/tests/static_map/heterogeneous_lookup_test.cu index b168d861d..3b8b0d023 100644 --- a/tests/static_map/heterogeneous_lookup_test.cu +++ b/tests/static_map/heterogeneous_lookup_test.cu @@ -16,6 +16,7 @@ #include +#include #include #include @@ -88,6 +89,10 @@ TEMPLATE_TEST_CASE_SIG("static_map heterogeneous lookup tests", (int64_t, 1), (int64_t, 2), #endif +#if defined(CUCO_HAS_128BIT_ATOMICS) + (__int128_t, 1), + (__int128_t, 2), +#endif (int32_t, 1), (int32_t, 2)) diff --git a/tests/static_map/insert_and_find_test.cu b/tests/static_map/insert_and_find_test.cu index cc978ff5b..17665eb2b 100644 --- a/tests/static_map/insert_and_find_test.cu +++ b/tests/static_map/insert_and_find_test.cu @@ -17,6 +17,7 @@ #include +#include #include #include @@ -51,7 +52,14 @@ TEMPLATE_TEST_CASE_SIG( (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 1), (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 1), (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 2), - (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2)) + (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (__int128_t, __int128_t, cuco::test::probe_sequence::double_hashing, 2), + (__int128_t, int64_t, cuco::test::probe_sequence::double_hashing, 1), + (int32_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2) +#endif +) { #if !defined(CUCO_HAS_INDEPENDENT_THREADS) if constexpr (cuco::detail::is_packable>()) diff --git a/tests/static_map/insert_or_apply_test.cu b/tests/static_map/insert_or_apply_test.cu index e14200d5a..0a4d07ea3 100644 --- a/tests/static_map/insert_or_apply_test.cu +++ b/tests/static_map/insert_or_apply_test.cu @@ -16,6 +16,7 @@ #include +#include #include #include @@ -160,7 +161,14 @@ TEMPLATE_TEST_CASE_SIG( (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 1), (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 1), (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 2), - (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2)) + (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (__int128_t, __int128_t, cuco::test::probe_sequence::double_hashing, 2), + (__int128_t, int64_t, cuco::test::probe_sequence::double_hashing, 1), + (int32_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2) +#endif +) { constexpr size_type num_keys{10'000}; constexpr size_type num_unique_keys{100}; @@ -201,8 +209,16 @@ TEMPLATE_TEST_CASE_SIG( } } -TEMPLATE_TEST_CASE_SIG( - "static_map insert_or_apply all unique keys tests", "", ((typename Key)), (int32_t), (int64_t)) +TEMPLATE_TEST_CASE_SIG("static_map insert_or_apply all unique keys tests", + "", + ((typename Key)), + (int32_t), + (int64_t) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (__int128_t) +#endif +) { using Value = Key; diff --git a/tests/static_map/insert_or_assign_test.cu b/tests/static_map/insert_or_assign_test.cu index 273fb6e2f..9d1f74491 100644 --- a/tests/static_map/insert_or_assign_test.cu +++ b/tests/static_map/insert_or_assign_test.cu @@ -16,6 +16,7 @@ #include +#include #include #include @@ -92,7 +93,14 @@ TEMPLATE_TEST_CASE_SIG( (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 1), (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 1), (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 2), - (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2)) + (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (__int128_t, __int128_t, cuco::test::probe_sequence::double_hashing, 2), + (__int128_t, int64_t, cuco::test::probe_sequence::double_hashing, 1), + (int32_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2) +#endif +) { constexpr size_type num_keys{400}; diff --git a/tests/static_map/key_sentinel_test.cu b/tests/static_map/key_sentinel_test.cu index b58145896..edfa499b6 100644 --- a/tests/static_map/key_sentinel_test.cu +++ b/tests/static_map/key_sentinel_test.cu @@ -16,6 +16,7 @@ #include +#include #include #include @@ -32,7 +33,16 @@ struct custom_equals { __device__ bool operator()(T lhs, T rhs) const { return A[lhs] == A[rhs]; } }; -TEMPLATE_TEST_CASE_SIG("static_map key sentinel tests", "", ((typename T), T), (int32_t), (int64_t)) +TEMPLATE_TEST_CASE_SIG("static_map key sentinel tests", + "", + ((typename T), T), + (int32_t), + (int64_t) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (__int128_t) +#endif +) { using Key = T; using Value = T; diff --git a/tests/static_map/retrieve_if_test.cu b/tests/static_map/retrieve_if_test.cu index c91827e57..fe4441822 100644 --- a/tests/static_map/retrieve_if_test.cu +++ b/tests/static_map/retrieve_if_test.cu @@ -16,6 +16,7 @@ #include +#include #include #include @@ -110,7 +111,12 @@ TEMPLATE_TEST_CASE_SIG("static_map retrieve_if", "", ((typename Key, typename T), Key, T), (int32_t, int32_t), - (int64_t, int64_t)) + (int64_t, int64_t) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (__int128_t, __int128_t) +#endif +) { constexpr size_type num_keys{400}; diff --git a/tests/static_map/retrieve_test.cu b/tests/static_map/retrieve_test.cu index 1f2a50ccb..8a4cd2acb 100644 --- a/tests/static_map/retrieve_test.cu +++ b/tests/static_map/retrieve_test.cu @@ -16,6 +16,7 @@ #include +#include #include #include @@ -119,7 +120,14 @@ TEMPLATE_TEST_CASE_SIG( (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 1), (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 1), (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 2), - (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2)) + (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (__int128_t, __int128_t, cuco::test::probe_sequence::double_hashing, 2), + (__int128_t, int64_t, cuco::test::probe_sequence::double_hashing, 1), + (int32_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2) +#endif +) { constexpr size_type num_keys{1'000}; diff --git a/tests/static_map/shared_memory_test.cu b/tests/static_map/shared_memory_test.cu index cac1d08cd..15731319c 100644 --- a/tests/static_map/shared_memory_test.cu +++ b/tests/static_map/shared_memory_test.cu @@ -16,6 +16,7 @@ #include +#include #include #include @@ -72,7 +73,12 @@ TEMPLATE_TEST_CASE_SIG("static_map shared memory tests", (int32_t, int32_t), (int32_t, int64_t), (int64_t, int32_t), - (int64_t, int64_t)) + (int64_t, int64_t) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (__int128_t, __int128_t) +#endif +) { constexpr std::size_t number_of_maps = 1000; constexpr std::size_t elements_in_map = 500; diff --git a/tests/static_map/stream_test.cu b/tests/static_map/stream_test.cu index 2261b33b9..6c1701d98 100644 --- a/tests/static_map/stream_test.cu +++ b/tests/static_map/stream_test.cu @@ -16,6 +16,7 @@ #include +#include #include #include @@ -34,7 +35,12 @@ TEMPLATE_TEST_CASE_SIG("static_map: unique sequence of keys on given stream", (int32_t, int32_t), (int32_t, int64_t), (int64_t, int32_t), - (int64_t, int64_t)) + (int64_t, int64_t) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (__int128_t, __int128_t) +#endif +) { cudaStream_t stream; CUCO_CUDA_TRY(cudaStreamCreate(&stream)); diff --git a/tests/static_multimap/count_test.cu b/tests/static_multimap/count_test.cu index aa389dc8d..9b105ae0f 100644 --- a/tests/static_multimap/count_test.cu +++ b/tests/static_multimap/count_test.cu @@ -16,6 +16,7 @@ #include +#include #include #include @@ -82,7 +83,15 @@ TEMPLATE_TEST_CASE_SIG( (int32_t, cuco::test::probe_sequence::linear_probing, 1), (int32_t, cuco::test::probe_sequence::linear_probing, 2), (int64_t, cuco::test::probe_sequence::linear_probing, 1), - (int64_t, cuco::test::probe_sequence::linear_probing, 2)) + (int64_t, cuco::test::probe_sequence::linear_probing, 2) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (__int128_t, cuco::test::probe_sequence::double_hashing, 1), + (__int128_t, cuco::test::probe_sequence::double_hashing, 2), + (__int128_t, cuco::test::probe_sequence::linear_probing, 1), + (__int128_t, cuco::test::probe_sequence::linear_probing, 2) +#endif +) { constexpr size_type num_keys{1'000}; diff --git a/tests/static_multimap/find_test.cu b/tests/static_multimap/find_test.cu index 747bd47fe..b7c40b715 100644 --- a/tests/static_multimap/find_test.cu +++ b/tests/static_multimap/find_test.cu @@ -16,6 +16,7 @@ #include +#include #include #include @@ -109,7 +110,15 @@ TEMPLATE_TEST_CASE_SIG( (int32_t, cuco::test::probe_sequence::linear_probing, 1), (int32_t, cuco::test::probe_sequence::linear_probing, 2), (int64_t, cuco::test::probe_sequence::linear_probing, 1), - (int64_t, cuco::test::probe_sequence::linear_probing, 2)) + (int64_t, cuco::test::probe_sequence::linear_probing, 2) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (__int128_t, cuco::test::probe_sequence::double_hashing, 1), + (__int128_t, cuco::test::probe_sequence::double_hashing, 2), + (__int128_t, cuco::test::probe_sequence::linear_probing, 1), + (__int128_t, cuco::test::probe_sequence::linear_probing, 2) +#endif +) { constexpr size_type num_keys{1'000}; diff --git a/tests/static_multimap/for_each_test.cu b/tests/static_multimap/for_each_test.cu index e6eddcc81..fa26f2c3d 100644 --- a/tests/static_multimap/for_each_test.cu +++ b/tests/static_multimap/for_each_test.cu @@ -16,6 +16,7 @@ #include +#include #include #include @@ -109,7 +110,15 @@ TEMPLATE_TEST_CASE_SIG( (int32_t, cuco::test::probe_sequence::linear_probing, 1), (int32_t, cuco::test::probe_sequence::linear_probing, 2), (int64_t, cuco::test::probe_sequence::linear_probing, 1), - (int64_t, cuco::test::probe_sequence::linear_probing, 2)) + (int64_t, cuco::test::probe_sequence::linear_probing, 2) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (__int128_t, cuco::test::probe_sequence::double_hashing, 1), + (__int128_t, cuco::test::probe_sequence::double_hashing, 2), + (__int128_t, cuco::test::probe_sequence::linear_probing, 1), + (__int128_t, cuco::test::probe_sequence::linear_probing, 2) +#endif +) { constexpr size_t num_unique_keys{400}; constexpr size_t key_multiplicity{5}; diff --git a/tests/static_multimap/heterogeneous_lookup_test.cu b/tests/static_multimap/heterogeneous_lookup_test.cu index e99ef6384..97a382d96 100644 --- a/tests/static_multimap/heterogeneous_lookup_test.cu +++ b/tests/static_multimap/heterogeneous_lookup_test.cu @@ -78,6 +78,9 @@ struct custom_key_equal { } }; +// TODO: extend with __int128_t once the multimap can handle slots larger than +// 32 bytes (key_pair<__int128_t> is 32 bytes, so pair, V> +// exceeds the current slot-size budget). TEMPLATE_TEST_CASE( "static_multimap heterogeneous lookup tests", "", diff --git a/tests/static_multimap/insert_contains_test.cu b/tests/static_multimap/insert_contains_test.cu index a149e0f0b..ecd202dc3 100644 --- a/tests/static_multimap/insert_contains_test.cu +++ b/tests/static_multimap/insert_contains_test.cu @@ -16,6 +16,7 @@ #include +#include #include #include @@ -95,7 +96,14 @@ TEMPLATE_TEST_CASE_SIG( (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 1), (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 1), (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 2), - (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2)) + (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (__int128_t, __int128_t, cuco::test::probe_sequence::double_hashing, 2), + (__int128_t, int64_t, cuco::test::probe_sequence::double_hashing, 1), + (int32_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2) +#endif +) { constexpr std::size_t num_keys{4'000}; diff --git a/tests/static_multimap/insert_if_test.cu b/tests/static_multimap/insert_if_test.cu index 038dbee64..53aee9a0f 100644 --- a/tests/static_multimap/insert_if_test.cu +++ b/tests/static_multimap/insert_if_test.cu @@ -16,6 +16,7 @@ #include +#include #include #include @@ -86,7 +87,14 @@ TEMPLATE_TEST_CASE_SIG( (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 1), (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 1), (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 2), - (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2)) + (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (__int128_t, __int128_t, cuco::test::probe_sequence::double_hashing, 2), + (__int128_t, int64_t, cuco::test::probe_sequence::double_hashing, 1), + (int32_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2) +#endif +) { constexpr std::size_t num_keys{1'000}; diff --git a/tests/static_multimap/multiplicity_test.cu b/tests/static_multimap/multiplicity_test.cu index 58104781b..e8677b300 100644 --- a/tests/static_multimap/multiplicity_test.cu +++ b/tests/static_multimap/multiplicity_test.cu @@ -16,6 +16,7 @@ #include +#include #include #include @@ -91,7 +92,15 @@ TEMPLATE_TEST_CASE_SIG( (int32_t, cuco::test::probe_sequence::linear_probing, 1), (int32_t, cuco::test::probe_sequence::linear_probing, 8), (int64_t, cuco::test::probe_sequence::linear_probing, 1), - (int64_t, cuco::test::probe_sequence::linear_probing, 8)) + (int64_t, cuco::test::probe_sequence::linear_probing, 8) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (__int128_t, cuco::test::probe_sequence::double_hashing, 1), + (__int128_t, cuco::test::probe_sequence::double_hashing, 8), + (__int128_t, cuco::test::probe_sequence::linear_probing, 1), + (__int128_t, cuco::test::probe_sequence::linear_probing, 8) +#endif +) { constexpr std::size_t num_items{400}; diff --git a/tests/static_multimap/retrieve_if_test.cu b/tests/static_multimap/retrieve_if_test.cu index a1918fbaf..97236d4cb 100644 --- a/tests/static_multimap/retrieve_if_test.cu +++ b/tests/static_multimap/retrieve_if_test.cu @@ -16,6 +16,7 @@ #include +#include #include #include @@ -110,7 +111,12 @@ TEMPLATE_TEST_CASE_SIG("static_multimap retrieve_if", "", ((typename Key, typename Value), Key, Value), (int32_t, int32_t), - (int64_t, int64_t)) + (int64_t, int64_t) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (__int128_t, __int128_t) +#endif +) { constexpr size_type num_keys{400}; diff --git a/tests/static_multimap/retrieve_test.cu b/tests/static_multimap/retrieve_test.cu index a5ff9085e..21e913a10 100644 --- a/tests/static_multimap/retrieve_test.cu +++ b/tests/static_multimap/retrieve_test.cu @@ -16,6 +16,7 @@ #include +#include #include #include @@ -108,7 +109,15 @@ TEMPLATE_TEST_CASE_SIG( (int32_t, cuco::test::probe_sequence::linear_probing, 1), (int32_t, cuco::test::probe_sequence::linear_probing, 4), (int64_t, cuco::test::probe_sequence::linear_probing, 1), - (int64_t, cuco::test::probe_sequence::linear_probing, 4)) + (int64_t, cuco::test::probe_sequence::linear_probing, 4) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (__int128_t, cuco::test::probe_sequence::double_hashing, 1), + (__int128_t, cuco::test::probe_sequence::double_hashing, 4), + (__int128_t, cuco::test::probe_sequence::linear_probing, 1), + (__int128_t, cuco::test::probe_sequence::linear_probing, 4) +#endif +) { constexpr std::size_t num_items{1'000}; diff --git a/tests/static_multiset/contains_test.cu b/tests/static_multiset/contains_test.cu index 82397fec2..1a854c9bd 100644 --- a/tests/static_multiset/contains_test.cu +++ b/tests/static_multiset/contains_test.cu @@ -16,6 +16,7 @@ #include +#include #include #include @@ -87,7 +88,15 @@ TEMPLATE_TEST_CASE_SIG( (int32_t, cuco::test::probe_sequence::linear_probing, 1), (int32_t, cuco::test::probe_sequence::linear_probing, 2), (int64_t, cuco::test::probe_sequence::linear_probing, 1), - (int64_t, cuco::test::probe_sequence::linear_probing, 2)) + (int64_t, cuco::test::probe_sequence::linear_probing, 2) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (__int128_t, cuco::test::probe_sequence::double_hashing, 1), + (__int128_t, cuco::test::probe_sequence::double_hashing, 2), + (__int128_t, cuco::test::probe_sequence::linear_probing, 1), + (__int128_t, cuco::test::probe_sequence::linear_probing, 2) +#endif +) { constexpr size_type num_keys{400}; diff --git a/tests/static_multiset/count_test.cu b/tests/static_multiset/count_test.cu index 797828e26..84d9f4aec 100644 --- a/tests/static_multiset/count_test.cu +++ b/tests/static_multiset/count_test.cu @@ -16,6 +16,7 @@ #include +#include #include #include @@ -189,7 +190,15 @@ TEMPLATE_TEST_CASE_SIG( (int32_t, cuco::test::probe_sequence::linear_probing, 1), (int32_t, cuco::test::probe_sequence::linear_probing, 2), (int64_t, cuco::test::probe_sequence::linear_probing, 1), - (int64_t, cuco::test::probe_sequence::linear_probing, 2)) + (int64_t, cuco::test::probe_sequence::linear_probing, 2) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (__int128_t, cuco::test::probe_sequence::double_hashing, 1), + (__int128_t, cuco::test::probe_sequence::double_hashing, 2), + (__int128_t, cuco::test::probe_sequence::linear_probing, 1), + (__int128_t, cuco::test::probe_sequence::linear_probing, 2) +#endif +) { constexpr size_type num_keys{666}; diff --git a/tests/static_multiset/custom_count_test.cu b/tests/static_multiset/custom_count_test.cu index 2cded9396..6ef2278b9 100644 --- a/tests/static_multiset/custom_count_test.cu +++ b/tests/static_multiset/custom_count_test.cu @@ -16,6 +16,7 @@ #include +#include #include #include @@ -129,7 +130,15 @@ TEMPLATE_TEST_CASE_SIG( (int32_t, cuco::test::probe_sequence::linear_probing, 1), (int32_t, cuco::test::probe_sequence::linear_probing, 2), (int64_t, cuco::test::probe_sequence::linear_probing, 1), - (int64_t, cuco::test::probe_sequence::linear_probing, 2)) + (int64_t, cuco::test::probe_sequence::linear_probing, 2) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (__int128_t, cuco::test::probe_sequence::double_hashing, 1), + (__int128_t, cuco::test::probe_sequence::double_hashing, 2), + (__int128_t, cuco::test::probe_sequence::linear_probing, 1), + (__int128_t, cuco::test::probe_sequence::linear_probing, 2) +#endif +) { constexpr size_type num_keys{555}; diff --git a/tests/static_multiset/find_test.cu b/tests/static_multiset/find_test.cu index ecb763017..b70919859 100644 --- a/tests/static_multiset/find_test.cu +++ b/tests/static_multiset/find_test.cu @@ -16,6 +16,7 @@ #include +#include #include #include @@ -100,7 +101,15 @@ TEMPLATE_TEST_CASE_SIG( (int32_t, cuco::test::probe_sequence::linear_probing, 1), (int32_t, cuco::test::probe_sequence::linear_probing, 2), (int64_t, cuco::test::probe_sequence::linear_probing, 1), - (int64_t, cuco::test::probe_sequence::linear_probing, 2)) + (int64_t, cuco::test::probe_sequence::linear_probing, 2) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (__int128_t, cuco::test::probe_sequence::double_hashing, 1), + (__int128_t, cuco::test::probe_sequence::double_hashing, 2), + (__int128_t, cuco::test::probe_sequence::linear_probing, 1), + (__int128_t, cuco::test::probe_sequence::linear_probing, 2) +#endif +) { constexpr size_type num_keys{1'000}; diff --git a/tests/static_multiset/for_each_test.cu b/tests/static_multiset/for_each_test.cu index 43b22eee9..5421df342 100644 --- a/tests/static_multiset/for_each_test.cu +++ b/tests/static_multiset/for_each_test.cu @@ -16,6 +16,7 @@ #include +#include #include #include @@ -102,7 +103,15 @@ TEMPLATE_TEST_CASE_SIG( (int32_t, cuco::test::probe_sequence::linear_probing, 1), (int32_t, cuco::test::probe_sequence::linear_probing, 2), (int64_t, cuco::test::probe_sequence::linear_probing, 1), - (int64_t, cuco::test::probe_sequence::linear_probing, 2)) + (int64_t, cuco::test::probe_sequence::linear_probing, 2) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (__int128_t, cuco::test::probe_sequence::double_hashing, 1), + (__int128_t, cuco::test::probe_sequence::double_hashing, 2), + (__int128_t, cuco::test::probe_sequence::linear_probing, 1), + (__int128_t, cuco::test::probe_sequence::linear_probing, 2) +#endif +) { constexpr size_t num_unique_keys{400}; constexpr size_t key_multiplicity{5}; diff --git a/tests/static_multiset/insert_test.cu b/tests/static_multiset/insert_test.cu index 33cd966a0..114d4b4e2 100644 --- a/tests/static_multiset/insert_test.cu +++ b/tests/static_multiset/insert_test.cu @@ -16,6 +16,7 @@ #include +#include #include #include @@ -85,7 +86,15 @@ TEMPLATE_TEST_CASE_SIG( (int32_t, cuco::test::probe_sequence::linear_probing, 1), (int32_t, cuco::test::probe_sequence::linear_probing, 2), (int64_t, cuco::test::probe_sequence::linear_probing, 1), - (int64_t, cuco::test::probe_sequence::linear_probing, 2)) + (int64_t, cuco::test::probe_sequence::linear_probing, 2) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (__int128_t, cuco::test::probe_sequence::double_hashing, 1), + (__int128_t, cuco::test::probe_sequence::double_hashing, 2), + (__int128_t, cuco::test::probe_sequence::linear_probing, 1), + (__int128_t, cuco::test::probe_sequence::linear_probing, 2) +#endif +) { constexpr size_type num_keys{400}; diff --git a/tests/static_multiset/large_input_test.cu b/tests/static_multiset/large_input_test.cu index 37e2dedcb..20cd43217 100644 --- a/tests/static_multiset/large_input_test.cu +++ b/tests/static_multiset/large_input_test.cu @@ -16,6 +16,7 @@ #include +#include #include #include @@ -56,9 +57,17 @@ TEMPLATE_TEST_CASE_SIG( "", ((typename Key, cuco::test::probe_sequence Probe, int CGSize), Key, Probe, CGSize), (int64_t, cuco::test::probe_sequence::double_hashing, 1), - (int64_t, cuco::test::probe_sequence::double_hashing, 2)) + (int64_t, cuco::test::probe_sequence::double_hashing, 2) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (__int128_t, cuco::test::probe_sequence::double_hashing, 1), + (__int128_t, cuco::test::probe_sequence::double_hashing, 2) +#endif +) { - constexpr std::size_t num_keys{1'200'000'000}; + // Reduce the key count for 16-byte keys to stay within GPU memory. + // 1.2B * 8B * 2 (capacity) = 19.2GB; 300M * 16B * 2 = 9.6GB. + constexpr std::size_t num_keys = (sizeof(Key) >= 16) ? 300'000'000 : 1'200'000'000; using extent_type = cuco::extent; using probe = cuco::double_hashing>; diff --git a/tests/static_multiset/load_factor_test.cu b/tests/static_multiset/load_factor_test.cu index 4bfbe8a14..154bbf06d 100644 --- a/tests/static_multiset/load_factor_test.cu +++ b/tests/static_multiset/load_factor_test.cu @@ -16,6 +16,7 @@ #include +#include #include #include @@ -33,7 +34,15 @@ TEMPLATE_TEST_CASE_SIG( (int32_t, cuco::test::probe_sequence::linear_probing, 1), (int32_t, cuco::test::probe_sequence::linear_probing, 2), (int64_t, cuco::test::probe_sequence::linear_probing, 1), - (int64_t, cuco::test::probe_sequence::linear_probing, 2)) + (int64_t, cuco::test::probe_sequence::linear_probing, 2) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (__int128_t, cuco::test::probe_sequence::double_hashing, 1), + (__int128_t, cuco::test::probe_sequence::double_hashing, 2), + (__int128_t, cuco::test::probe_sequence::linear_probing, 1), + (__int128_t, cuco::test::probe_sequence::linear_probing, 2) +#endif +) { constexpr size_type num_keys{10}; diff --git a/tests/static_multiset/retrieve_if_test.cu b/tests/static_multiset/retrieve_if_test.cu index 19defd557..d9ad964fa 100644 --- a/tests/static_multiset/retrieve_if_test.cu +++ b/tests/static_multiset/retrieve_if_test.cu @@ -16,6 +16,7 @@ #include +#include #include #include @@ -106,8 +107,16 @@ __global__ void test_retrieve_if_all_true_kernel( *atomic_counter); } -TEMPLATE_TEST_CASE_SIG( - "static_multiset retrieve_if", "", ((typename Key), Key), (int32_t), (int64_t)) +TEMPLATE_TEST_CASE_SIG("static_multiset retrieve_if", + "", + ((typename Key), Key), + (int32_t), + (int64_t) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (__int128_t) +#endif +) { constexpr size_type num_keys{400}; diff --git a/tests/static_multiset/retrieve_test.cu b/tests/static_multiset/retrieve_test.cu index acac841df..3986928f5 100644 --- a/tests/static_multiset/retrieve_test.cu +++ b/tests/static_multiset/retrieve_test.cu @@ -16,6 +16,7 @@ #include +#include #include #include @@ -140,7 +141,15 @@ TEMPLATE_TEST_CASE_SIG( (int32_t, cuco::test::probe_sequence::linear_probing, 1), (int32_t, cuco::test::probe_sequence::linear_probing, 2), (int64_t, cuco::test::probe_sequence::linear_probing, 1), - (int64_t, cuco::test::probe_sequence::linear_probing, 2)) + (int64_t, cuco::test::probe_sequence::linear_probing, 2) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (__int128_t, cuco::test::probe_sequence::double_hashing, 1), + (__int128_t, cuco::test::probe_sequence::double_hashing, 2), + (__int128_t, cuco::test::probe_sequence::linear_probing, 1), + (__int128_t, cuco::test::probe_sequence::linear_probing, 2) +#endif +) { constexpr std::size_t num_keys{400}; constexpr double desired_load_factor = 0.5; diff --git a/tests/static_multiset/stream_test.cu b/tests/static_multiset/stream_test.cu index 09ceb88f3..1264eb21d 100644 --- a/tests/static_multiset/stream_test.cu +++ b/tests/static_multiset/stream_test.cu @@ -16,6 +16,7 @@ #include +#include #include #include @@ -33,7 +34,12 @@ TEMPLATE_TEST_CASE_SIG("static_multiset: operations on different stream than con "", ((typename Key), Key), (int32_t), - (int64_t)) + (int64_t) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (__int128_t) +#endif +) { cudaStream_t constructor_stream; cudaStream_t operation_stream; diff --git a/tests/static_set/for_each_test.cu b/tests/static_set/for_each_test.cu index 7db5d0d5f..5374f8fac 100644 --- a/tests/static_set/for_each_test.cu +++ b/tests/static_set/for_each_test.cu @@ -16,6 +16,7 @@ #include +#include #include #include @@ -81,7 +82,15 @@ TEMPLATE_TEST_CASE_SIG( (int32_t, cuco::test::probe_sequence::linear_probing, 1), (int32_t, cuco::test::probe_sequence::linear_probing, 2), (int64_t, cuco::test::probe_sequence::linear_probing, 1), - (int64_t, cuco::test::probe_sequence::linear_probing, 2)) + (int64_t, cuco::test::probe_sequence::linear_probing, 2) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (__int128_t, cuco::test::probe_sequence::double_hashing, 1), + (__int128_t, cuco::test::probe_sequence::double_hashing, 2), + (__int128_t, cuco::test::probe_sequence::linear_probing, 1), + (__int128_t, cuco::test::probe_sequence::linear_probing, 2) +#endif +) { constexpr size_type num_keys{1'000}; using probe = std::conditional_t< @@ -97,6 +106,6 @@ TEMPLATE_TEST_CASE_SIG( cuco::cuda_allocator, cuco::storage<2>>; - auto set = set_t{num_keys, cuco::empty_key{-1}}; + auto set = set_t{num_keys, cuco::empty_key{static_cast(-1)}}; test_for_each(set, num_keys); } diff --git a/tests/static_set/heterogeneous_lookup_test.cu b/tests/static_set/heterogeneous_lookup_test.cu index 57a1b291a..10e471ffe 100644 --- a/tests/static_set/heterogeneous_lookup_test.cu +++ b/tests/static_set/heterogeneous_lookup_test.cu @@ -16,6 +16,7 @@ #include +#include #include #include @@ -84,7 +85,13 @@ TEMPLATE_TEST_CASE_SIG("static_set heterogeneous lookup tests", "", ((typename T, int CGSize), T, CGSize), (int32_t, 1), - (int32_t, 2)) + (int32_t, 2) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (__int128_t, 1), + (__int128_t, 2) +#endif +) { using Key = T; using InsertKey = key_pair; diff --git a/tests/static_set/insert_and_find_test.cu b/tests/static_set/insert_and_find_test.cu index f01ae8884..7eefdfb6c 100644 --- a/tests/static_set/insert_and_find_test.cu +++ b/tests/static_set/insert_and_find_test.cu @@ -16,6 +16,7 @@ #include +#include #include #include @@ -62,7 +63,15 @@ TEMPLATE_TEST_CASE_SIG( (int32_t, cuco::test::probe_sequence::linear_probing, 1), (int32_t, cuco::test::probe_sequence::linear_probing, 2), (int64_t, cuco::test::probe_sequence::linear_probing, 1), - (int64_t, cuco::test::probe_sequence::linear_probing, 2)) + (int64_t, cuco::test::probe_sequence::linear_probing, 2) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (__int128_t, cuco::test::probe_sequence::double_hashing, 1), + (__int128_t, cuco::test::probe_sequence::double_hashing, 2), + (__int128_t, cuco::test::probe_sequence::linear_probing, 1), + (__int128_t, cuco::test::probe_sequence::linear_probing, 2) +#endif +) { constexpr std::size_t num_keys{400}; @@ -70,8 +79,8 @@ TEMPLATE_TEST_CASE_SIG( cuco::linear_probing>, cuco::double_hashing>>; - auto set = - cuco::static_set{num_keys, cuco::empty_key{-1}, {}, probe{}, {}, cuco::storage<2>{}}; + auto set = cuco::static_set{ + num_keys, cuco::empty_key{static_cast(-1)}, {}, probe{}, {}, cuco::storage<2>{}}; test_insert_and_find(set, num_keys); } diff --git a/tests/static_set/large_input_test.cu b/tests/static_set/large_input_test.cu index 67daeafc6..b3096ee5f 100644 --- a/tests/static_set/large_input_test.cu +++ b/tests/static_set/large_input_test.cu @@ -16,6 +16,7 @@ #include +#include #include #include @@ -74,9 +75,17 @@ TEMPLATE_TEST_CASE_SIG( "", ((typename Key, cuco::test::probe_sequence Probe, int CGSize), Key, Probe, CGSize), (int64_t, cuco::test::probe_sequence::double_hashing, 1), - (int64_t, cuco::test::probe_sequence::double_hashing, 2)) + (int64_t, cuco::test::probe_sequence::double_hashing, 2) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (__int128_t, cuco::test::probe_sequence::double_hashing, 1), + (__int128_t, cuco::test::probe_sequence::double_hashing, 2) +#endif +) { - constexpr std::size_t num_keys{1'200'000'000}; + // Reduce the key count for 16-byte keys to stay within GPU memory. + // 1.2B * 8B * 2 (capacity) = 19.2GB; 300M * 16B * 2 = 9.6GB. + constexpr std::size_t num_keys = (sizeof(Key) >= 16) ? 300'000'000 : 1'200'000'000; using extent_type = cuco::extent; using probe = cuco::double_hashing>; diff --git a/tests/static_set/retrieve_all_test.cu b/tests/static_set/retrieve_all_test.cu index 0bb554727..ed2e70abb 100644 --- a/tests/static_set/retrieve_all_test.cu +++ b/tests/static_set/retrieve_all_test.cu @@ -16,6 +16,7 @@ #include +#include #include #include @@ -67,7 +68,15 @@ TEMPLATE_TEST_CASE_SIG( (int32_t, cuco::test::probe_sequence::linear_probing, 1), (int32_t, cuco::test::probe_sequence::linear_probing, 2), (int64_t, cuco::test::probe_sequence::linear_probing, 1), - (int64_t, cuco::test::probe_sequence::linear_probing, 2)) + (int64_t, cuco::test::probe_sequence::linear_probing, 2) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (__int128_t, cuco::test::probe_sequence::double_hashing, 1), + (__int128_t, cuco::test::probe_sequence::double_hashing, 2), + (__int128_t, cuco::test::probe_sequence::linear_probing, 1), + (__int128_t, cuco::test::probe_sequence::linear_probing, 2) +#endif +) { constexpr std::size_t num_keys{400}; constexpr double desired_load_factor = 1.; @@ -85,7 +94,8 @@ TEMPLATE_TEST_CASE_SIG( } }(); - auto set = cuco::static_set{num_keys, desired_load_factor, cuco::empty_key{-1}, {}, probe{}}; + auto set = cuco::static_set{ + num_keys, desired_load_factor, cuco::empty_key{static_cast(-1)}, {}, probe{}}; REQUIRE(set.capacity() == gold_capacity); diff --git a/tests/static_set/retrieve_if_test.cu b/tests/static_set/retrieve_if_test.cu index 129fe68af..b82969926 100644 --- a/tests/static_set/retrieve_if_test.cu +++ b/tests/static_set/retrieve_if_test.cu @@ -16,6 +16,7 @@ #include +#include #include #include @@ -106,13 +107,22 @@ __global__ void test_retrieve_if_all_true_kernel( *atomic_counter); } -TEMPLATE_TEST_CASE_SIG("static_set retrieve_if", "", ((typename Key), Key), (int32_t), (int64_t)) +TEMPLATE_TEST_CASE_SIG("static_set retrieve_if", + "", + ((typename Key), Key), + (int32_t), + (int64_t) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (__int128_t) +#endif +) { constexpr size_type num_keys{400}; using container_type = cuco::static_set; - container_type container{num_keys * 2, cuco::empty_key{-1}}; + container_type container{num_keys * 2, cuco::empty_key{static_cast(-1)}}; auto keys_begin = cuda::counting_iterator(1); auto keys_end = keys_begin + num_keys; diff --git a/tests/static_set/retrieve_test.cu b/tests/static_set/retrieve_test.cu index edbcb4c20..f5b7763dc 100644 --- a/tests/static_set/retrieve_test.cu +++ b/tests/static_set/retrieve_test.cu @@ -16,6 +16,7 @@ #include +#include #include #include @@ -81,7 +82,15 @@ TEMPLATE_TEST_CASE_SIG( (int32_t, cuco::test::probe_sequence::linear_probing, 1), (int32_t, cuco::test::probe_sequence::linear_probing, 2), (int64_t, cuco::test::probe_sequence::linear_probing, 1), - (int64_t, cuco::test::probe_sequence::linear_probing, 2)) + (int64_t, cuco::test::probe_sequence::linear_probing, 2) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (__int128_t, cuco::test::probe_sequence::double_hashing, 1), + (__int128_t, cuco::test::probe_sequence::double_hashing, 2), + (__int128_t, cuco::test::probe_sequence::linear_probing, 1), + (__int128_t, cuco::test::probe_sequence::linear_probing, 2) +#endif +) { constexpr std::size_t num_keys{400}; constexpr double desired_load_factor = 1.; @@ -90,8 +99,11 @@ TEMPLATE_TEST_CASE_SIG( cuco::linear_probing>, cuco::double_hashing>>; - auto set = cuco::static_set{ - num_keys, desired_load_factor, cuco::empty_key{key_sentinel}, {}, probe{}}; + auto set = cuco::static_set{num_keys, + desired_load_factor, + cuco::empty_key{static_cast(key_sentinel)}, + {}, + probe{}}; test_unique_sequence(set, num_keys); } diff --git a/tests/static_set/shared_memory_test.cu b/tests/static_set/shared_memory_test.cu index 374271e9a..08f805927 100644 --- a/tests/static_set/shared_memory_test.cu +++ b/tests/static_set/shared_memory_test.cu @@ -16,6 +16,7 @@ #include +#include #include #include @@ -63,8 +64,16 @@ __global__ void shared_memory_test_kernel(Ref* sets, } } -TEMPLATE_TEST_CASE_SIG( - "static_set shared memory tests", "", ((typename Key), Key), (int32_t), (int64_t)) +TEMPLATE_TEST_CASE_SIG("static_set shared memory tests", + "", + ((typename Key), Key), + (int32_t), + (int64_t) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (__int128_t) +#endif +) { constexpr std::size_t number_of_sets = 1000; constexpr std::size_t elements_in_set = 500; diff --git a/tests/static_set/stream_test.cu b/tests/static_set/stream_test.cu index 47e96283f..f9277ed8f 100644 --- a/tests/static_set/stream_test.cu +++ b/tests/static_set/stream_test.cu @@ -16,6 +16,7 @@ #include +#include #include #include @@ -33,7 +34,12 @@ TEMPLATE_TEST_CASE_SIG("static_set: operations on different stream than construc "", ((typename Key), Key), (int32_t), - (int64_t)) + (int64_t) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (__int128_t) +#endif +) { cudaStream_t constructor_stream; cudaStream_t operation_stream; @@ -43,7 +49,7 @@ TEMPLATE_TEST_CASE_SIG("static_set: operations on different stream than construc { // Scope ensures set is destroyed before streams constexpr std::size_t num_keys{500'000}; auto set = cuco::static_set{num_keys * 2, - cuco::empty_key{-1}, + cuco::empty_key{static_cast(-1)}, {}, cuco::linear_probing<1, cuco::default_hash_function>{}, {}, diff --git a/tests/static_set/unique_sequence_test.cu b/tests/static_set/unique_sequence_test.cu index 0da6539d5..2db91c2b6 100644 --- a/tests/static_set/unique_sequence_test.cu +++ b/tests/static_set/unique_sequence_test.cu @@ -16,6 +16,7 @@ #include +#include #include #include @@ -147,7 +148,15 @@ TEMPLATE_TEST_CASE_SIG( (int32_t, cuco::test::probe_sequence::linear_probing, 1), (int32_t, cuco::test::probe_sequence::linear_probing, 2), (int64_t, cuco::test::probe_sequence::linear_probing, 1), - (int64_t, cuco::test::probe_sequence::linear_probing, 2)) + (int64_t, cuco::test::probe_sequence::linear_probing, 2) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (__int128_t, cuco::test::probe_sequence::double_hashing, 1), + (__int128_t, cuco::test::probe_sequence::double_hashing, 2), + (__int128_t, cuco::test::probe_sequence::linear_probing, 1), + (__int128_t, cuco::test::probe_sequence::linear_probing, 2) +#endif +) { constexpr size_type num_keys{400}; using probe = std::conditional_t{SENTINEL}, {}, probe{}, {}, cuco::storage<2>{}}; + auto set = cuco::static_set{num_keys, + cuco::empty_key{static_cast(SENTINEL)}, + {}, + probe{}, + {}, + cuco::storage<2>{}}; REQUIRE(set.capacity() == gold_capacity);