Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR)

set(rapids-cmake-version 26.04)
set(rapids-cmake-branch "release/${rapids-cmake-version}")
if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/CUCO_RAPIDS.cmake)
file(DOWNLOAD
https://raw.githubusercontent.com/rapidsai/rapids-cmake/release/${rapids-cmake-version}/RAPIDS.cmake
Expand Down Expand Up @@ -74,7 +75,7 @@ rapids_find_package(
###################################################################################################
# - find packages we depend on --------------------------------------------------------------------

rapids_cpm_init()
rapids_cpm_init(OVERRIDE "${CMAKE_CURRENT_SOURCE_DIR}/cmake/cccl_override.json")

include(cmake/thirdparty/get_cccl.cmake)

Expand Down
14 changes: 10 additions & 4 deletions benchmarks/benchmark_defaults.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

#pragma once

#include <cuco/detail/__config>
#include <cuco/hash_functions.cuh>

#include <nvbench/nvbench.cuh>
Expand All @@ -25,12 +26,17 @@

namespace cuco::benchmark::defaults {

// Default key/value type lists for the benchmarks (presumably consumed as nvbench type axes).
// When CUCO_HAS_128BIT_ATOMICS is defined, both lists additionally contain __int128_t;
// otherwise they are limited to the 32-bit and 64-bit integer types.
#if defined(CUCO_HAS_128BIT_ATOMICS)
using KEY_TYPE_RANGE = nvbench::type_list<nvbench::int32_t, nvbench::int64_t, __int128_t>;
using VALUE_TYPE_RANGE = nvbench::type_list<nvbench::int32_t, nvbench::int64_t, __int128_t>;
#else
using KEY_TYPE_RANGE = nvbench::type_list<nvbench::int32_t, nvbench::int64_t>;
using VALUE_TYPE_RANGE = nvbench::type_list<nvbench::int32_t, nvbench::int64_t>;
using HASH_RANGE = nvbench::type_list<cuco::identity_hash<char>,
cuco::xxhash_32<char>,
cuco::xxhash_64<char>,
cuco::murmurhash3_32<char>>; //,
#endif
// Default list of hash functions, each instantiated with `char` as the key type.
// The 128-bit murmurhash3 variants stay commented out until tuple-like hash values are handled.
using HASH_RANGE = nvbench::type_list<cuco::identity_hash<char>,
cuco::xxhash_32<char>,
cuco::xxhash_64<char>,
cuco::murmurhash3_32<char>>; //,
// cuco::murmurhash3_x86_128<char>,
// cuco::murmurhash3_x64_128<char>>; // TODO handle tuple-like hash value

Expand Down
5 changes: 5 additions & 0 deletions benchmarks/benchmark_utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

#pragma once

#include <cuco/detail/__config>
#include <cuco/detail/error.hpp>
#include <cuco/utility/key_generator.cuh>

Expand Down Expand Up @@ -92,3 +93,7 @@ NVBENCH_DECLARE_TYPE_STRINGS(cuco::utility::distribution::uniform,
NVBENCH_DECLARE_TYPE_STRINGS(cuco::utility::distribution::gaussian,
"GAUSSIAN",
"distribution::gaussian");

// Declare nvbench type strings ("I128" / "__int128_t") for __int128_t.
// Only emitted when CUCO_HAS_128BIT_ATOMICS is defined, i.e. in the same configuration in which
// the benchmark defaults add __int128_t to the key/value type lists.
#if defined(CUCO_HAS_128BIT_ATOMICS)
NVBENCH_DECLARE_TYPE_STRINGS(__int128_t, "I128", "__int128_t");
#endif
2 changes: 1 addition & 1 deletion ci/matrix.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ devcontainer_version: '26.04'
pull_request:
nvcc:
- {cuda: *cuda_oldest, os: 'ubuntu22.04', cpu: 'amd64', compiler: {name: 'gcc', version: '11', exe: 'g++'}, gpu_build_archs: '70,80', std: [17], jobs: ['build', 'test']}
- {cuda: *cuda_newest, os: 'ubuntu24.04', cpu: 'amd64', compiler: {name: 'gcc', version: '14', exe: 'g++'}, gpu_build_archs: '80,90,100', std: [17], jobs: ['build', 'test']}
- {cuda: *cuda_newest, os: 'ubuntu24.04', cpu: 'amd64', compiler: {name: 'gcc', version: '14', exe: 'g++'}, gpu_build_archs: '90,100', std: [17], jobs: ['build', 'test']}
- {cuda: *cuda_newest, os: 'ubuntu24.04', cpu: 'arm64', compiler: {name: 'gcc', version: '14', exe: 'g++'}, gpu_build_archs: '80,90,100', std: [17], jobs: ['build']}
- {cuda: *cuda_oldest, os: 'ubuntu20.04', cpu: 'amd64', compiler: {name: 'llvm', version: '14', exe: 'clang++'}, gpu_build_archs: '70', std: [17], jobs: ['build']}
- {cuda: *cuda_newest, os: 'ubuntu24.04', cpu: 'amd64', compiler: {name: 'llvm', version: '21', exe: 'clang++'}, gpu_build_archs: '100', std: [17], jobs: ['build']}
Expand Down
17 changes: 17 additions & 0 deletions cmake/cccl_override.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
{
"packages": {
"CCCL": {
"version": "3.3.0",
"git_url": "https://github.com/NVIDIA/cccl.git",
"git_tag": "09094af138841ef521de1adbbdd18ab8b3dad47b",
"git_shallow": false,
"patches": [
{
"file": "${current_json_dir}/patches/cccl_fix_128bit_cas.patch",
"issue": "Fix 128-bit atomic CAS operand indices [https://github.com/NVIDIA/cccl/issues/8402]",
"fixed_in": "3.3.2"
}
]
}
}
}
238 changes: 238 additions & 0 deletions cmake/patches/cccl_fix_128bit_cas.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,238 @@
From 1898944000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Daniel Juenger <sleeepyjack@users.noreply.github.com>
Date: Mon, 14 Apr 2026 00:00:00 +0000
Subject: [PATCH] Fix codegen in 128bit atomic CAS (#8403)

Fix wrong inline asm operand indices in all atom.cas.*.b128 variants.
See https://github.com/NVIDIA/cccl/issues/8402
---
.../cuda/std/__atomic/functions/cuda_ptx_generated.h | 80 ++++++++++----------
1 file changed, 40 insertions(+), 40 deletions(-)

diff --git a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated.h b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated.h
index f3e30d53039..479815f4136 100644
--- a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated.h
+++ b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated.h
@@ -1585,8 +1585,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
{
.reg .b128 _d;
.reg .b128 _v;
- mov.b128 _d, {%0, %1};
- mov.b128 _v, {%4, %5};
+ mov.b128 _d, {%3, %4};
+ mov.b128 _v, {%5, %6};
atom.cas.acquire.cta.b128 _d,[%2],_d,_v;
mov.b128 {%0, %1}, _d;
}
@@ -1604,8 +1604,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
{
.reg .b128 _d;
.reg .b128 _v;
- mov.b128 _d, {%0, %1};
- mov.b128 _v, {%4, %5};
+ mov.b128 _d, {%3, %4};
+ mov.b128 _v, {%5, %6};
atom.cas.acquire.cluster.b128 _d,[%2],_d,_v;
mov.b128 {%0, %1}, _d;
}
@@ -1623,8 +1623,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
{
.reg .b128 _d;
.reg .b128 _v;
- mov.b128 _d, {%0, %1};
- mov.b128 _v, {%4, %5};
+ mov.b128 _d, {%3, %4};
+ mov.b128 _v, {%5, %6};
atom.cas.acquire.gpu.b128 _d,[%2],_d,_v;
mov.b128 {%0, %1}, _d;
}
@@ -1642,8 +1642,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
{
.reg .b128 _d;
.reg .b128 _v;
- mov.b128 _d, {%0, %1};
- mov.b128 _v, {%4, %5};
+ mov.b128 _d, {%3, %4};
+ mov.b128 _v, {%5, %6};
atom.cas.acquire.sys.b128 _d,[%2],_d,_v;
mov.b128 {%0, %1}, _d;
}
@@ -1661,8 +1661,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
{
.reg .b128 _d;
.reg .b128 _v;
- mov.b128 _d, {%0, %1};
- mov.b128 _v, {%4, %5};
+ mov.b128 _d, {%3, %4};
+ mov.b128 _v, {%5, %6};
atom.cas.relaxed.cta.b128 _d,[%2],_d,_v;
mov.b128 {%0, %1}, _d;
}
@@ -1680,8 +1680,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
{
.reg .b128 _d;
.reg .b128 _v;
- mov.b128 _d, {%0, %1};
- mov.b128 _v, {%4, %5};
+ mov.b128 _d, {%3, %4};
+ mov.b128 _v, {%5, %6};
atom.cas.relaxed.cluster.b128 _d,[%2],_d,_v;
mov.b128 {%0, %1}, _d;
}
@@ -1699,8 +1699,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
{
.reg .b128 _d;
.reg .b128 _v;
- mov.b128 _d, {%0, %1};
- mov.b128 _v, {%4, %5};
+ mov.b128 _d, {%3, %4};
+ mov.b128 _v, {%5, %6};
atom.cas.relaxed.gpu.b128 _d,[%2],_d,_v;
mov.b128 {%0, %1}, _d;
}
@@ -1718,8 +1718,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
{
.reg .b128 _d;
.reg .b128 _v;
- mov.b128 _d, {%0, %1};
- mov.b128 _v, {%4, %5};
+ mov.b128 _d, {%3, %4};
+ mov.b128 _v, {%5, %6};
atom.cas.relaxed.sys.b128 _d,[%2],_d,_v;
mov.b128 {%0, %1}, _d;
}
@@ -1737,8 +1737,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
{
.reg .b128 _d;
.reg .b128 _v;
- mov.b128 _d, {%0, %1};
- mov.b128 _v, {%4, %5};
+ mov.b128 _d, {%3, %4};
+ mov.b128 _v, {%5, %6};
atom.cas.release.cta.b128 _d,[%2],_d,_v;
mov.b128 {%0, %1}, _d;
}
@@ -1756,8 +1756,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
{
.reg .b128 _d;
.reg .b128 _v;
- mov.b128 _d, {%0, %1};
- mov.b128 _v, {%4, %5};
+ mov.b128 _d, {%3, %4};
+ mov.b128 _v, {%5, %6};
atom.cas.release.cluster.b128 _d,[%2],_d,_v;
mov.b128 {%0, %1}, _d;
}
@@ -1775,8 +1775,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
{
.reg .b128 _d;
.reg .b128 _v;
- mov.b128 _d, {%0, %1};
- mov.b128 _v, {%4, %5};
+ mov.b128 _d, {%3, %4};
+ mov.b128 _v, {%5, %6};
atom.cas.release.gpu.b128 _d,[%2],_d,_v;
mov.b128 {%0, %1}, _d;
}
@@ -1794,8 +1794,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
{
.reg .b128 _d;
.reg .b128 _v;
- mov.b128 _d, {%0, %1};
- mov.b128 _v, {%4, %5};
+ mov.b128 _d, {%3, %4};
+ mov.b128 _v, {%5, %6};
atom.cas.release.sys.b128 _d,[%2],_d,_v;
mov.b128 {%0, %1}, _d;
}
@@ -1813,8 +1813,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
{
.reg .b128 _d;
.reg .b128 _v;
- mov.b128 _d, {%0, %1};
- mov.b128 _v, {%4, %5};
+ mov.b128 _d, {%3, %4};
+ mov.b128 _v, {%5, %6};
atom.cas.acq_rel.cta.b128 _d,[%2],_d,_v;
mov.b128 {%0, %1}, _d;
}
@@ -1832,8 +1832,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
{
.reg .b128 _d;
.reg .b128 _v;
- mov.b128 _d, {%0, %1};
- mov.b128 _v, {%4, %5};
+ mov.b128 _d, {%3, %4};
+ mov.b128 _v, {%5, %6};
atom.cas.acq_rel.cluster.b128 _d,[%2],_d,_v;
mov.b128 {%0, %1}, _d;
}
@@ -1851,8 +1851,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
{
.reg .b128 _d;
.reg .b128 _v;
- mov.b128 _d, {%0, %1};
- mov.b128 _v, {%4, %5};
+ mov.b128 _d, {%3, %4};
+ mov.b128 _v, {%5, %6};
atom.cas.acq_rel.gpu.b128 _d,[%2],_d,_v;
mov.b128 {%0, %1}, _d;
}
@@ -1870,8 +1870,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
{
.reg .b128 _d;
.reg .b128 _v;
- mov.b128 _d, {%0, %1};
- mov.b128 _v, {%4, %5};
+ mov.b128 _d, {%3, %4};
+ mov.b128 _v, {%5, %6};
atom.cas.acq_rel.sys.b128 _d,[%2],_d,_v;
mov.b128 {%0, %1}, _d;
}
@@ -1889,8 +1889,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
{
.reg .b128 _d;
.reg .b128 _v;
- mov.b128 _d, {%0, %1};
- mov.b128 _v, {%4, %5};
+ mov.b128 _d, {%3, %4};
+ mov.b128 _v, {%5, %6};
atom.cas.cta.b128 _d,[%2],_d,_v;
mov.b128 {%0, %1}, _d;
}
@@ -1908,8 +1908,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
{
.reg .b128 _d;
.reg .b128 _v;
- mov.b128 _d, {%0, %1};
- mov.b128 _v, {%4, %5};
+ mov.b128 _d, {%3, %4};
+ mov.b128 _v, {%5, %6};
atom.cas.cluster.b128 _d,[%2],_d,_v;
mov.b128 {%0, %1}, _d;
}
@@ -1927,8 +1927,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
{
.reg .b128 _d;
.reg .b128 _v;
- mov.b128 _d, {%0, %1};
- mov.b128 _v, {%4, %5};
+ mov.b128 _d, {%3, %4};
+ mov.b128 _v, {%5, %6};
atom.cas.gpu.b128 _d,[%2],_d,_v;
mov.b128 {%0, %1}, _d;
}
@@ -1946,8 +1946,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
{
.reg .b128 _d;
.reg .b128 _v;
- mov.b128 _d, {%0, %1};
- mov.b128 _v, {%4, %5};
+ mov.b128 _d, {%3, %4};
+ mov.b128 _v, {%5, %6};
atom.cas.sys.b128 _d,[%2],_d,_v;
mov.b128 {%0, %1}, _d;
}
--
2.45.2

37 changes: 37 additions & 0 deletions include/cuco/detail/__config
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@
#include <nv/target>
#include <cuda/std/version>

#include <cstddef>

#if !defined(__CUDACC_VER_MAJOR__) || !defined(__CUDACC_VER_MINOR__)
#error "NVCC version not found"
#elif __CUDACC_VER_MAJOR__ < 12
Expand Down Expand Up @@ -52,6 +54,41 @@
#define CUCO_HAS_INT128
#endif

// 128-bit atomics are enabled only when 128-bit integers are available (CUCO_HAS_INT128) and
// CUCO_CUDA_MINIMUM_ARCH is at least 900 (presumably sm_90 -- confirm against its definition).
#if defined(CUCO_HAS_INT128) && (CUCO_CUDA_MINIMUM_ARCH >= 900)
#define CUCO_HAS_128BIT_ATOMICS
#endif

// Defined when the CUDA runtime version (CUDART_VERSION) is known and at least 12000.
#if defined(CUDART_VERSION) && (CUDART_VERSION >= 12000)
#define CUCO_HAS_CG_REDUCE_UPDATE_ASYNC
#endif

namespace cuco::detail {

/// Largest key size (in bytes) that open-addressing containers support:
/// 16 when `CUCO_HAS_128BIT_ATOMICS` is defined, 8 otherwise.
#if defined(CUCO_HAS_128BIT_ATOMICS)
inline constexpr std::size_t max_key_size = 16;
#else
inline constexpr std::size_t max_key_size = 8;
#endif

/// Largest payload/mapped type size (in bytes) that open-addressing containers support.
/// Always equal to `max_key_size`, since a slot holds at most a key and an equally-sized payload.
inline constexpr std::size_t max_payload_size = max_key_size;

/// Largest slot size (in bytes) that open-addressing containers support.
/// A slot holds at most one key plus one equally-sized payload, hence
/// `sizeof(pair<Key, Value>) <= 2 * max_key_size`.
inline constexpr std::size_t max_slot_size = max_key_size + max_payload_size;

/// Tells whether `n` (a size in bytes) is a valid mapped_type size for packed CAS operations.
/// Sizes 4 and 8 are always accepted; size 16 is accepted only when
/// `CUCO_HAS_128BIT_ATOMICS` is defined. Every other size is rejected.
inline constexpr bool is_valid_mapped_size(std::size_t n)
{
#if defined(CUCO_HAS_128BIT_ATOMICS)
  // 16-byte payloads are only usable when 128-bit atomics are available.
  if (n == 16) { return true; }
#endif
  return n == 4 || n == 8;
}

}  // namespace cuco::detail
Loading
Loading