NVIDIA · sleeepyjack · Apr 14, 2026 · Apr 15, 2026 · Apr 17, 2026 · Apr 17, 2026
@@ -16,6 +16,7 @@
 cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR)
 
 set(rapids-cmake-version 26.04)
+set(rapids-cmake-branch "release/${rapids-cmake-version}")  # "release/26.04"; presumably read by RAPIDS.cmake - TODO confirm
 if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/CUCO_RAPIDS.cmake)
     file(DOWNLOAD
       https://raw.githubusercontent.com/rapidsai/rapids-cmake/release/${rapids-cmake-version}/RAPIDS.cmake
@@ -74,7 +75,7 @@ rapids_find_package(
 ###################################################################################################
 # - find packages we depend on --------------------------------------------------------------------
 
-rapids_cpm_init()
+rapids_cpm_init(OVERRIDE "${CMAKE_CURRENT_SOURCE_DIR}/cmake/cccl_override.json")  # project-local package overrides; presumably the CCCL pin + patch - confirm
 
 include(cmake/thirdparty/get_cccl.cmake)
 

@@ -16,6 +16,7 @@
 
 #pragma once
 
+#include <cuco/detail/__config>
 #include <cuco/hash_functions.cuh>
 
 #include <nvbench/nvbench.cuh>
@@ -25,12 +26,17 @@
 
 namespace cuco::benchmark::defaults {
 
+#if defined(CUCO_HAS_128BIT_ATOMICS)
+using KEY_TYPE_RANGE   = nvbench::type_list<nvbench::int32_t, nvbench::int64_t, __int128_t>;
+using VALUE_TYPE_RANGE = nvbench::type_list<nvbench::int32_t, nvbench::int64_t, __int128_t>;
+#else
 using KEY_TYPE_RANGE   = nvbench::type_list<nvbench::int32_t, nvbench::int64_t>;
 using VALUE_TYPE_RANGE = nvbench::type_list<nvbench::int32_t, nvbench::int64_t>;
-using HASH_RANGE       = nvbench::type_list<cuco::identity_hash<char>,
-                                            cuco::xxhash_32<char>,
-                                            cuco::xxhash_64<char>,
-                                            cuco::murmurhash3_32<char>>;  //,
+#endif
+using HASH_RANGE = nvbench::type_list<cuco::identity_hash<char>,
+                                      cuco::xxhash_32<char>,
+                                      cuco::xxhash_64<char>,
+                                      cuco::murmurhash3_32<char>>;  //,
 // cuco::murmurhash3_x86_128<char>,
 // cuco::murmurhash3_x64_128<char>>; // TODO handle tuple-like hash value
 

@@ -16,6 +16,7 @@
 
 #pragma once
 
+#include <cuco/detail/__config>
 #include <cuco/detail/error.hpp>
 #include <cuco/utility/key_generator.cuh>
 
@@ -92,3 +93,7 @@ NVBENCH_DECLARE_TYPE_STRINGS(cuco::utility::distribution::uniform,
 NVBENCH_DECLARE_TYPE_STRINGS(cuco::utility::distribution::gaussian,
                              "GAUSSIAN",
                              "distribution::gaussian");
+
+#if defined(CUCO_HAS_128BIT_ATOMICS)  // only declared when the 128-bit configuration is enabled
+NVBENCH_DECLARE_TYPE_STRINGS(__int128_t, "I128", "__int128_t");  // name strings for __int128_t
+#endif  // CUCO_HAS_128BIT_ATOMICS
@@ -37,7 +37,7 @@ devcontainer_version: '26.04'
 pull_request:
   nvcc:
     - {cuda: *cuda_oldest, os: 'ubuntu22.04', cpu: 'amd64', compiler: {name: 'gcc', version: '11', exe: 'g++'}, gpu_build_archs: '70,80', std: [17], jobs: ['build', 'test']}
-    - {cuda: *cuda_newest, os: 'ubuntu24.04', cpu: 'amd64', compiler: {name: 'gcc', version: '14', exe: 'g++'}, gpu_build_archs: '80,90,100', std: [17], jobs: ['build', 'test']}
+    - {cuda: *cuda_newest, os: 'ubuntu24.04', cpu: 'amd64', compiler: {name: 'gcc', version: '14', exe: 'g++'}, gpu_build_archs: '90,100', std: [17], jobs: ['build', 'test']}
     - {cuda: *cuda_newest, os: 'ubuntu24.04', cpu: 'arm64', compiler: {name: 'gcc', version: '14', exe: 'g++'}, gpu_build_archs: '80,90,100', std: [17], jobs: ['build']}
     - {cuda: *cuda_oldest, os: 'ubuntu20.04', cpu: 'amd64', compiler: {name: 'llvm', version: '14', exe: 'clang++'}, gpu_build_archs: '70', std: [17], jobs: ['build']}
     - {cuda: *cuda_newest, os: 'ubuntu24.04', cpu: 'amd64', compiler: {name: 'llvm', version: '21', exe: 'clang++'}, gpu_build_archs: '100', std: [17], jobs: ['build']}

@@ -0,0 +1,17 @@
+{
+  "packages": {
+    "CCCL": {
+      "version": "3.3.0",
+      "git_url": "https://github.com/NVIDIA/cccl.git",
+      "git_tag": "09094af138841ef521de1adbbdd18ab8b3dad47b",
+      "git_shallow": false,
+      "patches": [
+        {
+          "file": "${current_json_dir}/patches/cccl_fix_128bit_cas.patch",
+          "issue": "Fix 128-bit atomic CAS operand indices [https://github.com/NVIDIA/cccl/issues/8402]",
+          "fixed_in": "3.3.2"
+        }
+      ]
+    }
+  }
+}
@@ -0,0 +1,238 @@
+From 1898944000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Daniel Juenger <sleeepyjack@users.noreply.github.com>
+Date: Mon, 14 Apr 2026 00:00:00 +0000
+Subject: [PATCH] Fix codegen in 128bit atomic CAS (#8403)
+
+Fix wrong inline asm operand indices in all atom.cas.*.b128 variants.
+See https://github.com/NVIDIA/cccl/issues/8402
+---
+ .../cuda/std/__atomic/functions/cuda_ptx_generated.h | 80 ++++++++++----------
+ 1 file changed, 40 insertions(+), 40 deletions(-)
+
+diff --git a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated.h b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated.h
+index f3e30d53039..479815f4136 100644
+--- a/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated.h
++++ b/libcudacxx/include/cuda/std/__atomic/functions/cuda_ptx_generated.h
+@@ -1585,8 +1585,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
+     {
+       .reg .b128 _d;
+       .reg .b128 _v;
+-      mov.b128 _d, {%0, %1};
+-      mov.b128 _v, {%4, %5};
++      mov.b128 _d, {%3, %4};
++      mov.b128 _v, {%5, %6};
+       atom.cas.acquire.cta.b128 _d,[%2],_d,_v;
+       mov.b128 {%0, %1}, _d;
+     }
+@@ -1604,8 +1604,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
+     {
+       .reg .b128 _d;
+       .reg .b128 _v;
+-      mov.b128 _d, {%0, %1};
+-      mov.b128 _v, {%4, %5};
++      mov.b128 _d, {%3, %4};
++      mov.b128 _v, {%5, %6};
+       atom.cas.acquire.cluster.b128 _d,[%2],_d,_v;
+       mov.b128 {%0, %1}, _d;
+     }
+@@ -1623,8 +1623,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
+     {
+       .reg .b128 _d;
+       .reg .b128 _v;
+-      mov.b128 _d, {%0, %1};
+-      mov.b128 _v, {%4, %5};
++      mov.b128 _d, {%3, %4};
++      mov.b128 _v, {%5, %6};
+       atom.cas.acquire.gpu.b128 _d,[%2],_d,_v;
+       mov.b128 {%0, %1}, _d;
+     }
+@@ -1642,8 +1642,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
+     {
+       .reg .b128 _d;
+       .reg .b128 _v;
+-      mov.b128 _d, {%0, %1};
+-      mov.b128 _v, {%4, %5};
++      mov.b128 _d, {%3, %4};
++      mov.b128 _v, {%5, %6};
+       atom.cas.acquire.sys.b128 _d,[%2],_d,_v;
+       mov.b128 {%0, %1}, _d;
+     }
+@@ -1661,8 +1661,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
+     {
+       .reg .b128 _d;
+       .reg .b128 _v;
+-      mov.b128 _d, {%0, %1};
+-      mov.b128 _v, {%4, %5};
++      mov.b128 _d, {%3, %4};
++      mov.b128 _v, {%5, %6};
+       atom.cas.relaxed.cta.b128 _d,[%2],_d,_v;
+       mov.b128 {%0, %1}, _d;
+     }
+@@ -1680,8 +1680,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
+     {
+       .reg .b128 _d;
+       .reg .b128 _v;
+-      mov.b128 _d, {%0, %1};
+-      mov.b128 _v, {%4, %5};
++      mov.b128 _d, {%3, %4};
++      mov.b128 _v, {%5, %6};
+       atom.cas.relaxed.cluster.b128 _d,[%2],_d,_v;
+       mov.b128 {%0, %1}, _d;
+     }
+@@ -1699,8 +1699,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
+     {
+       .reg .b128 _d;
+       .reg .b128 _v;
+-      mov.b128 _d, {%0, %1};
+-      mov.b128 _v, {%4, %5};
++      mov.b128 _d, {%3, %4};
++      mov.b128 _v, {%5, %6};
+       atom.cas.relaxed.gpu.b128 _d,[%2],_d,_v;
+       mov.b128 {%0, %1}, _d;
+     }
+@@ -1718,8 +1718,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
+     {
+       .reg .b128 _d;
+       .reg .b128 _v;
+-      mov.b128 _d, {%0, %1};
+-      mov.b128 _v, {%4, %5};
++      mov.b128 _d, {%3, %4};
++      mov.b128 _v, {%5, %6};
+       atom.cas.relaxed.sys.b128 _d,[%2],_d,_v;
+       mov.b128 {%0, %1}, _d;
+     }
+@@ -1737,8 +1737,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
+     {
+       .reg .b128 _d;
+       .reg .b128 _v;
+-      mov.b128 _d, {%0, %1};
+-      mov.b128 _v, {%4, %5};
++      mov.b128 _d, {%3, %4};
++      mov.b128 _v, {%5, %6};
+       atom.cas.release.cta.b128 _d,[%2],_d,_v;
+       mov.b128 {%0, %1}, _d;
+     }
+@@ -1756,8 +1756,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
+     {
+       .reg .b128 _d;
+       .reg .b128 _v;
+-      mov.b128 _d, {%0, %1};
+-      mov.b128 _v, {%4, %5};
++      mov.b128 _d, {%3, %4};
++      mov.b128 _v, {%5, %6};
+       atom.cas.release.cluster.b128 _d,[%2],_d,_v;
+       mov.b128 {%0, %1}, _d;
+     }
+@@ -1775,8 +1775,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
+     {
+       .reg .b128 _d;
+       .reg .b128 _v;
+-      mov.b128 _d, {%0, %1};
+-      mov.b128 _v, {%4, %5};
++      mov.b128 _d, {%3, %4};
++      mov.b128 _v, {%5, %6};
+       atom.cas.release.gpu.b128 _d,[%2],_d,_v;
+       mov.b128 {%0, %1}, _d;
+     }
+@@ -1794,8 +1794,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
+     {
+       .reg .b128 _d;
+       .reg .b128 _v;
+-      mov.b128 _d, {%0, %1};
+-      mov.b128 _v, {%4, %5};
++      mov.b128 _d, {%3, %4};
++      mov.b128 _v, {%5, %6};
+       atom.cas.release.sys.b128 _d,[%2],_d,_v;
+       mov.b128 {%0, %1}, _d;
+     }
+@@ -1813,8 +1813,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
+     {
+       .reg .b128 _d;
+       .reg .b128 _v;
+-      mov.b128 _d, {%0, %1};
+-      mov.b128 _v, {%4, %5};
++      mov.b128 _d, {%3, %4};
++      mov.b128 _v, {%5, %6};
+       atom.cas.acq_rel.cta.b128 _d,[%2],_d,_v;
+       mov.b128 {%0, %1}, _d;
+     }
+@@ -1832,8 +1832,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
+     {
+       .reg .b128 _d;
+       .reg .b128 _v;
+-      mov.b128 _d, {%0, %1};
+-      mov.b128 _v, {%4, %5};
++      mov.b128 _d, {%3, %4};
++      mov.b128 _v, {%5, %6};
+       atom.cas.acq_rel.cluster.b128 _d,[%2],_d,_v;
+       mov.b128 {%0, %1}, _d;
+     }
+@@ -1851,8 +1851,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
+     {
+       .reg .b128 _d;
+       .reg .b128 _v;
+-      mov.b128 _d, {%0, %1};
+-      mov.b128 _v, {%4, %5};
++      mov.b128 _d, {%3, %4};
++      mov.b128 _v, {%5, %6};
+       atom.cas.acq_rel.gpu.b128 _d,[%2],_d,_v;
+       mov.b128 {%0, %1}, _d;
+     }
+@@ -1870,8 +1870,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
+     {
+       .reg .b128 _d;
+       .reg .b128 _v;
+-      mov.b128 _d, {%0, %1};
+-      mov.b128 _v, {%4, %5};
++      mov.b128 _d, {%3, %4};
++      mov.b128 _v, {%5, %6};
+       atom.cas.acq_rel.sys.b128 _d,[%2],_d,_v;
+       mov.b128 {%0, %1}, _d;
+     }
+@@ -1889,8 +1889,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
+     {
+       .reg .b128 _d;
+       .reg .b128 _v;
+-      mov.b128 _d, {%0, %1};
+-      mov.b128 _v, {%4, %5};
++      mov.b128 _d, {%3, %4};
++      mov.b128 _v, {%5, %6};
+       atom.cas.cta.b128 _d,[%2],_d,_v;
+       mov.b128 {%0, %1}, _d;
+     }
+@@ -1908,8 +1908,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
+     {
+       .reg .b128 _d;
+       .reg .b128 _v;
+-      mov.b128 _d, {%0, %1};
+-      mov.b128 _v, {%4, %5};
++      mov.b128 _d, {%3, %4};
++      mov.b128 _v, {%5, %6};
+       atom.cas.cluster.b128 _d,[%2],_d,_v;
+       mov.b128 {%0, %1}, _d;
+     }
+@@ -1927,8 +1927,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
+     {
+       .reg .b128 _d;
+       .reg .b128 _v;
+-      mov.b128 _d, {%0, %1};
+-      mov.b128 _v, {%4, %5};
++      mov.b128 _d, {%3, %4};
++      mov.b128 _v, {%5, %6};
+       atom.cas.gpu.b128 _d,[%2],_d,_v;
+       mov.b128 {%0, %1}, _d;
+     }
+@@ -1946,8 +1946,8 @@ static inline _CCCL_DEVICE bool __cuda_atomic_compare_exchange(
+     {
+       .reg .b128 _d;
+       .reg .b128 _v;
+-      mov.b128 _d, {%0, %1};
+-      mov.b128 _v, {%4, %5};
++      mov.b128 _d, {%3, %4};
++      mov.b128 _v, {%5, %6};
+       atom.cas.sys.b128 _d,[%2],_d,_v;
+       mov.b128 {%0, %1}, _d;
+     }
+--
+2.45.2
+
@@ -19,6 +19,8 @@
 #include <nv/target>
 #include <cuda/std/version>
 
+#include <cstddef>
+
 #if !defined(__CUDACC_VER_MAJOR__) || !defined(__CUDACC_VER_MINOR__)
 #error "NVCC version not found"
 #elif __CUDACC_VER_MAJOR__ < 12 
@@ -52,6 +54,41 @@
 #define CUCO_HAS_INT128
 #endif
 
+#if defined(CUCO_HAS_INT128) && (CUCO_CUDA_MINIMUM_ARCH >= 900)
+#define CUCO_HAS_128BIT_ATOMICS  // feature flag: when set, max_key_size below becomes 16, not 8
+#endif  // CUCO_HAS_INT128 && (CUCO_CUDA_MINIMUM_ARCH >= 900)
+
 #if defined(CUDART_VERSION) && (CUDART_VERSION >= 12000)
 #define CUCO_HAS_CG_REDUCE_UPDATE_ASYNC
 #endif
+
+namespace cuco::detail {
+
+/// Maximum supported key size (in bytes) for open-addressing containers.
+inline constexpr std::size_t max_key_size =
+#if defined(CUCO_HAS_128BIT_ATOMICS)
+  16;  // 128-bit keys permitted
+#else
+  8;  // limited to 64-bit keys
+#endif  // CUCO_HAS_128BIT_ATOMICS
+
+/// Maximum supported payload/mapped type size (in bytes) for open-addressing containers.
+/// Tied to `max_key_size`: a slot stores at most a key plus an equally-sized payload.
+inline constexpr std::size_t max_payload_size = max_key_size;  // hence also 16 or 8
+
+/// Maximum supported slot size (in bytes) for open-addressing containers.
+/// Tied to `max_key_size`: a slot stores at most a key plus an equally-sized payload
+/// (i.e., `sizeof(pair<Key, Value>) <= 2 * max_key_size`).
+inline constexpr std::size_t max_slot_size = 2 * max_key_size;  // evaluates to 32 or 16
+
+/// Checks if the given size is a valid mapped_type size for packed CAS operations.
+inline constexpr bool is_valid_mapped_size(std::size_t n)
+{
+  return n == 4 or n == 8  // sizes accepted in every configuration
+#if defined(CUCO_HAS_128BIT_ATOMICS)
+    or n == 16  // additionally accepted only when CUCO_HAS_128BIT_ATOMICS is defined
+#endif  // CUCO_HAS_128BIT_ATOMICS
+  ;  // terminates the return statement regardless of which branch was compiled in
+}
+
+}  // namespace cuco::detail