diff --git a/architecture/kubernetes-user-namespaces-ocp-testing.md b/architecture/kubernetes-user-namespaces-ocp-testing.md new file mode 100644 index 000000000..15d86bb64 --- /dev/null +++ b/architecture/kubernetes-user-namespaces-ocp-testing.md @@ -0,0 +1,364 @@ +# Testing User Namespaces on OCP + +Step-by-step guide to deploy OpenShell with user namespace isolation on an OpenShift cluster and verify end-to-end functionality. + +## Prerequisites + +- An OCP cluster (tested on OCP 4.22 / K8s 1.35.3 / CRI-O 1.35 / RHEL CoreOS / kernel 5.14) +- `kubectl` and `helm` on your `PATH` +- `podman` for building and pushing images +- `KUBECONFIG` set to point at the cluster +- The OpenShell repo checked out with the user namespace branch built + +## 1. Build binaries + +```shell +cargo build -p openshell-server --features openshell-core/dev-settings +cargo build -p openshell-sandbox --features openshell-core/dev-settings +cargo build -p openshell-cli --features openshell-core/dev-settings +``` + +## 2. Create namespace and install the Sandbox CRD + +```shell +kubectl create ns openshell +kubectl apply -f deploy/kube/manifests/agent-sandbox.yaml +``` + +Label the namespace to allow privileged pods: + +```shell +kubectl label ns openshell pod-security.kubernetes.io/enforce=privileged --overwrite +kubectl label ns openshell pod-security.kubernetes.io/warn=privileged --overwrite +``` + +## 3. Grant SCCs + +The gateway pod needs `anyuid` (runs as UID 1000) and sandbox pods need `privileged` (capabilities for supervisor): + +```shell +kubectl create clusterrolebinding openshell-sa-anyuid \ + --clusterrole=system:openshift:scc:anyuid \ + --serviceaccount=openshell:openshell + +kubectl create clusterrolebinding openshell-sa-privileged \ + --clusterrole=system:openshift:scc:privileged \ + --serviceaccount=openshell:openshell + +kubectl create clusterrolebinding openshell-default-privileged \ + --clusterrole=system:openshift:scc:privileged \ + --serviceaccount=openshell:default +``` + +Grant the sandbox CRD controller full permissions (it needs to set ownerReferences with blockOwnerDeletion): + +```shell +kubectl create clusterrolebinding agent-sandbox-admin \ + --clusterrole=cluster-admin \ + --serviceaccount=agent-sandbox-system:agent-sandbox-controller +``` + +## 4. 
Generate TLS certificates + +```shell +TLSDIR=$(mktemp -d) + +# CA +openssl req -x509 -newkey rsa:2048 -nodes \ + -keyout $TLSDIR/ca.key -out $TLSDIR/ca.crt \ + -days 365 -subj "/CN=openshell-ca" 2>/dev/null + +# Server cert +openssl req -newkey rsa:2048 -nodes \ + -keyout $TLSDIR/server.key -out $TLSDIR/server.csr \ + -subj "/CN=openshell.openshell.svc.cluster.local" \ + -addext "subjectAltName=DNS:openshell.openshell.svc.cluster.local,DNS:openshell,DNS:localhost,IP:127.0.0.1" 2>/dev/null + +openssl x509 -req -in $TLSDIR/server.csr \ + -CA $TLSDIR/ca.crt -CAkey $TLSDIR/ca.key -CAcreateserial \ + -out $TLSDIR/server.crt -days 365 \ + -extfile <(echo "subjectAltName=DNS:openshell.openshell.svc.cluster.local,DNS:openshell,DNS:localhost,IP:127.0.0.1") 2>/dev/null + +# Client cert +openssl req -newkey rsa:2048 -nodes \ + -keyout $TLSDIR/client.key -out $TLSDIR/client.csr \ + -subj "/CN=openshell-client" 2>/dev/null + +openssl x509 -req -in $TLSDIR/client.csr \ + -CA $TLSDIR/ca.crt -CAkey $TLSDIR/ca.key -CAcreateserial \ + -out $TLSDIR/client.crt -days 365 2>/dev/null +``` + +Create Kubernetes secrets: + +```shell +kubectl create secret tls openshell-server-tls -n openshell \ + --cert=$TLSDIR/server.crt --key=$TLSDIR/server.key + +kubectl create secret generic openshell-server-client-ca -n openshell \ + --from-file=ca.crt=$TLSDIR/ca.crt + +kubectl create secret generic openshell-client-tls -n openshell \ + --from-file=ca.crt=$TLSDIR/ca.crt \ + --from-file=tls.crt=$TLSDIR/client.crt \ + --from-file=tls.key=$TLSDIR/client.key + +kubectl create secret generic openshell-ssh-handshake -n openshell \ + --from-literal=secret=$(openssl rand -hex 32) +``` + +Note: the `openshell-client-tls` secret must include `ca.crt`, `tls.crt`, and `tls.key` (not a `kubernetes.io/tls` type secret, which only has `tls.crt` and `tls.key`). + +## 5. Expose the OCP internal registry and push images + +```shell +# Enable the default route for the internal registry +kubectl patch configs.imageregistry.operator.openshift.io/cluster \ + --type merge -p '{"spec":{"defaultRoute":true}}' + +sleep 5 +REGISTRY=$(kubectl get route default-route -n openshift-image-registry -o jsonpath='{.spec.host}') +TOKEN=$(kubectl create token builder -n openshell) + +podman login --tls-verify=false -u kubeadmin -p "$TOKEN" "$REGISTRY" +``` + +Build and push the gateway image: + +```shell +podman build -f deploy/docker/Dockerfile.images --target gateway \ + -t localhost/openshell/gateway:dev . + +podman tag localhost/openshell/gateway:dev $REGISTRY/openshell/gateway:dev +podman push --tls-verify=false $REGISTRY/openshell/gateway:dev +``` + +Pull and push the sandbox base image: + +```shell +podman pull ghcr.io/nvidia/openshell-community/sandboxes/base:latest + +podman tag ghcr.io/nvidia/openshell-community/sandboxes/base:latest \ + $REGISTRY/openshell/sandbox-base:latest +podman push --tls-verify=false $REGISTRY/openshell/sandbox-base:latest +``` + +## 6. Install the supervisor binary on cluster nodes + +The sandbox supervisor binary is mounted into pods via a hostPath volume at `/opt/openshell/bin/`. A DaemonSet distributes it to every node with the correct SELinux label. 
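+
+Once the installer DaemonSet later in this step has rolled out, you can spot-check a node before creating any sandboxes. This is an illustrative check, assuming the first node is representative and that the node can pull the same UBI minimal image used for the supervisor image below:
+
+```shell
+NODE=$(kubectl get nodes -o jsonpath='{.items[0].metadata.name}')
+
+# `kubectl debug node/...` mounts the node filesystem at /host in the helper pod;
+# chroot into it and list the supervisor binary together with its SELinux context.
+kubectl debug node/$NODE -it --image=registry.access.redhat.com/ubi9/ubi-minimal -- \
+  chroot /host ls -lZ /opt/openshell/bin/openshell-sandbox
+
+# kubectl debug leaves a completed node-debugger pod behind; delete it when done.
+```
+
+The binary should exist, be executable, and carry the `container_file_t` type; anything else corresponds to the `Permission denied` and `hostPath type check failed` rows in the troubleshooting table.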
+ +Build and push a minimal image containing the supervisor binary: + +```shell +cp target/debug/openshell-sandbox /tmp/openshell-sandbox + +cat > /tmp/Dockerfile.supervisor <<'EOF' +FROM registry.access.redhat.com/ubi9/ubi-minimal:latest +COPY openshell-sandbox /openshell-sandbox +RUN chmod 755 /openshell-sandbox +EOF + +podman build -f /tmp/Dockerfile.supervisor -t localhost/openshell/supervisor:dev /tmp/ +podman tag localhost/openshell/supervisor:dev $REGISTRY/openshell/supervisor:dev +podman push --tls-verify=false $REGISTRY/openshell/supervisor:dev +``` + +Deploy the installer DaemonSet: + +```shell +INTERNAL_REG="image-registry.openshift-image-registry.svc:5000" + +cat </tmp/pf.log 2>&1 & +``` + +Set up the CLI gateway configuration with mTLS: + +```shell +mkdir -p ~/.config/openshell/gateways/ocp-userns/mtls + +cp $TLSDIR/ca.crt ~/.config/openshell/gateways/ocp-userns/mtls/ +cp $TLSDIR/client.crt ~/.config/openshell/gateways/ocp-userns/mtls/tls.crt +cp $TLSDIR/client.key ~/.config/openshell/gateways/ocp-userns/mtls/tls.key + +cat > ~/.config/openshell/gateways/ocp-userns/metadata.json <<'EOF' +{ + "name": "ocp-userns", + "gateway_endpoint": "https://127.0.0.1:18443", + "is_remote": false, + "gateway_port": 18443, + "auth_mode": "mtls" +} +EOF +``` + +Verify connectivity: + +```shell +OPENSHELL_GATEWAY=ocp-userns target/debug/openshell status +``` + +Expected output: + +``` +Server Status + Gateway: ocp-userns + Server: https://127.0.0.1:18443 + Status: Connected +``` + +## 9. Create a sandbox and verify user namespaces + +```shell +export OPENSHELL_GATEWAY=ocp-userns + +target/debug/openshell sandbox create --no-bootstrap -- sh -lc \ + "echo '=== uid_map ==='; cat /proc/self/uid_map; \ + echo '=== gid_map ==='; cat /proc/self/gid_map; \ + echo '=== id ==='; id; \ + echo '=== userns-e2e-ok ==='" +``` + +Expected output (UID values will vary): + +``` +=== uid_map === + 0 3285581824 65536 +=== gid_map === + 0 3285581824 65536 +=== id === +uid=998(sandbox) gid=998(sandbox) groups=998(sandbox) +=== userns-e2e-ok === +``` + +This confirms: + +- UID 0 inside the container maps to a high host UID (non-identity mapping) +- The sandbox user (UID 998) is active +- The SSH tunnel through the gateway works end-to-end +- Workspace init, supervisor startup, network namespace creation, and proxy all function correctly under user namespace isolation + +## 10. 
Cleanup + +```shell +# Delete all sandboxes +kubectl delete sandbox --all -n openshell + +# Uninstall the Helm release +helm uninstall openshell -n openshell + +# Remove the supervisor installer +kubectl delete daemonset openshell-supervisor-installer -n openshell + +# Remove RBAC +kubectl delete clusterrolebinding openshell-sa-anyuid openshell-sa-privileged \ + openshell-default-privileged agent-sandbox-admin 2>/dev/null + +# Remove the Sandbox CRD and its controller +kubectl delete -f deploy/kube/manifests/agent-sandbox.yaml + +# Remove the namespace +kubectl delete ns openshell + +# Kill port-forward +pkill -f "port-forward.*18443" + +# Remove CLI gateway config +rm -rf ~/.config/openshell/gateways/ocp-userns +``` + +## Troubleshooting + +| Symptom | Cause | Fix | +|---------|-------|-----| +| `ErrImageNeverPull` on gateway pod | Image not in the internal registry | Push with `podman push --tls-verify=false` to the OCP registry | +| `unable to validate against any security context constraint` | Missing SCC grants | Run the `clusterrolebinding` commands from step 3 | +| `cannot set blockOwnerDeletion` on sandbox creation | Sandbox CRD controller lacks RBAC | Grant `cluster-admin` to the controller SA (step 3) | +| `hostPath type check failed: /opt/openshell/bin is not a directory` | Supervisor binary not installed on node | Deploy the DaemonSet from step 6 | +| `Permission denied` accessing supervisor binary | SELinux blocking hostPath access | Ensure `chcon -t container_file_t` was applied (step 6) | +| `failed to set MOUNT_ATTR_IDMAP` | Filesystem doesn't support ID-mapped mounts | Only happens in nested container environments (DinD); native nodes work | +| Gateway pod `CrashLoopBackOff` with `unable to open database file` | PVC permissions | Use `--set server.dbUrl="sqlite:/tmp/openshell.db"` | +| `dns error: failed to lookup address` from supervisor | In-cluster DNS not resolving | Use the ClusterIP directly in `server.grpcEndpoint` instead of the DNS name | diff --git a/architecture/kubernetes-user-namespaces.md b/architecture/kubernetes-user-namespaces.md new file mode 100644 index 000000000..081e379b5 --- /dev/null +++ b/architecture/kubernetes-user-namespaces.md @@ -0,0 +1,163 @@ +# Kubernetes User Namespace Support + +## Context + +Kubernetes v1.36 graduated user namespace support to GA (`spec.hostUsers: false`). This feature maps container UID 0 to an unprivileged host UID, making capabilities like `CAP_SYS_ADMIN` container-scoped rather than host-scoped. This is a significant defense-in-depth improvement for OpenShell sandbox pods, which currently require `SYS_ADMIN`, `NET_ADMIN`, `SYS_PTRACE`, and `SYSLOG` capabilities. + +The sandbox supervisor already runs as UID 0 inside the container and performs all privileged operations (namespace creation, seccomp, Landlock) locally — user namespaces confine these powers to the container without breaking functionality. + +## Design + +**Two-layer configuration:** + +- Cluster-wide default: `enable_user_namespaces` on `Config` / `KubernetesComputeConfig` (env var `OPENSHELL_ENABLE_USER_NAMESPACES`, default `false`) +- Per-sandbox override: `optional bool user_namespaces` on `SandboxTemplate` in the proto, translated to `platform_config.host_users` for the K8s driver + +**Capability additions when enabled:** Add `SETUID`, `SETGID`, `DAC_READ_SEARCH` to the pod security context (matching the Podman driver at `crates/openshell-driver-podman/src/container.rs:393-400`) — needed because the bounding set is reset inside a user namespace. 
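+
+The combined effect is easy to confirm on a rendered pod. A minimal sketch, assuming a sandbox pod named `my-sandbox` in the `openshell` namespace (the driver names the main container `agent`):
+
+```shell
+# Pod-level toggle: prints "false" when user namespaces are enabled.
+kubectl get pod my-sandbox -n openshell -o jsonpath='{.spec.hostUsers}'; echo
+
+# Capability list on the agent container: the four baseline capabilities,
+# plus SETUID, SETGID and DAC_READ_SEARCH when user namespaces are enabled.
+kubectl get pod my-sandbox -n openshell \
+  -o jsonpath="{.spec.containers[?(@.name=='agent')].securityContext.capabilities.add}"; echo
+```
+
+The e2e test described later in this document performs the same two checks.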
+ +**No changes to:** seccomp filters (CLONE_NEWUSER block stays), Landlock, supervisor privilege-drop logic, init containers, and workspace volume ownership semantics (ID-mapped mounts handle ownership transparently). The only mount-related change is the supervisor `hostPath` type in Step 7. + +## Changes + +### 1. Proto: add `user_namespaces` field to `SandboxTemplate` + +**File:** `proto/openshell.proto` + +Add `optional bool user_namespaces = 10;` to the `SandboxTemplate` message. Using `optional` distinguishes "not set" (use cluster default) from explicit true/false. + +### 2. Core config: add `enable_user_namespaces` to server config + +**File:** `crates/openshell-core/src/config.rs` + +Add field to `Config`: + +```rust +#[serde(default)] +pub enable_user_namespaces: bool, +``` + +Wire the env var `OPENSHELL_ENABLE_USER_NAMESPACES` (clap handles this on the standalone driver binary; for the in-process server path, `Config` serde does it). + +### 3. K8s driver config: add field + +**File:** `crates/openshell-driver-kubernetes/src/config.rs` + +Add `pub enable_user_namespaces: bool` to `KubernetesComputeConfig`. + +### 4. Server: wire config and translate proto field + +**File:** `crates/openshell-server/src/lib.rs` + +Pass `config.enable_user_namespaces` into the `KubernetesComputeConfig` construction. + +**File:** `crates/openshell-server/src/compute/mod.rs` (`build_platform_config`) + +Translate the new `SandboxTemplate.user_namespaces` field into `platform_config`: + +```rust +if let Some(user_ns) = template.user_namespaces { + fields.insert("host_users".into(), Value { kind: Some(Kind::BoolValue(!user_ns)) }); +} +``` + +The public API uses `user_namespaces: true` (positive sense) while the K8s driver expects `host_users: false` (K8s convention). The driver inverts this back via `!host_users` to resolve the final pod-level `hostUsers` field. + +### 5. K8s driver: add `platform_config_bool` helper + +**File:** `crates/openshell-driver-kubernetes/src/driver.rs` + +New helper following the existing `platform_config_string` / `platform_config_struct` pattern. + +### 6. K8s driver: apply `hostUsers: false` and extended capabilities + +**File:** `crates/openshell-driver-kubernetes/src/driver.rs` + +- Pass `enable_user_namespaces` through `sandbox_to_k8s_spec` -> `sandbox_template_to_k8s` +- After the `runtimeClassName` block, resolve the effective setting: per-sandbox `platform_config.host_users` overrides cluster default +- Insert `spec.hostUsers: false` when user namespaces are enabled +- Extend the capability list with `SETUID`, `SETGID`, `DAC_READ_SEARCH` when enabled + +### 7. K8s driver: change hostPath type to `Directory` + +**File:** `crates/openshell-driver-kubernetes/src/driver.rs` (`supervisor_volume`) + +Change `"type": "DirectoryOrCreate"` to `"type": "Directory"`. The supervisor path is pre-provisioned during cluster setup; `DirectoryOrCreate` could fail under user namespaces when the mapped UID can't create host directories. + +### 8. Standalone driver binary: wire CLI arg + +**File:** `crates/openshell-driver-kubernetes/src/main.rs` + +Add `#[arg(long, env = "OPENSHELL_ENABLE_USER_NAMESPACES")]` and pass to config construction. + +### 9. 
Helm chart + +**File:** `deploy/helm/openshell/values.yaml` — add `enableUserNamespaces: false` under `server:` + +**File:** `deploy/helm/openshell/templates/statefulset.yaml` — add conditional env var block: + +```yaml +{{- if .Values.server.enableUserNamespaces }} +- name: OPENSHELL_ENABLE_USER_NAMESPACES + value: "true" +{{- end }} +``` + +## Risks + +| Risk | Mitigation | +|------|------------| +| GPU + user namespaces may conflict (NVIDIA device plugin) | Log a warning when both `gpu: true` and user namespaces are enabled; test before enabling by default | +| hostPath volume ownership with ID-mapped mounts | Step 7 changes to `Directory` type; mount is read-only so ownership doesn't matter for execution | +| sysfs remount in netns setup | Already avoided -- code uses `nsenter` instead of `ip netns exec` (documented at `netns.rs:685`) | +| Requires Linux 5.12+ and supporting runtime | Feature defaults to `false`; failure mode is a clear Kubernetes pod event | +| Nested container environments (DinD / k3s-in-Docker) | Does not work in the local dev cluster; see section below | + +## Nested k3s / Docker-in-Docker limitation + +User namespaces require **ID-mapped mounts** (Linux 5.12+) so the kernel can transparently remap file ownership between the container's UID space and the host's UID space. When k3s runs inside a Docker container (the `mise run cluster` dev environment), the inner container's root filesystem sits on an overlayfs layer managed by the outer Docker daemon. The overlayfs driver in this nested configuration does not support `MOUNT_ATTR_IDMAP`, so `runc` fails at container init: + +``` +failed to set MOUNT_ATTR_IDMAP on .../etc-hosts: invalid argument +(maybe the filesystem used doesn't support idmap mounts on this kernel?) +``` + +This is a kernel/filesystem constraint, not an OpenShell bug. The pod spec is generated correctly (`hostUsers: false`, extended capabilities), but the container runtime cannot fulfil the mount request. + +**Where user namespaces work:** + +- Bare-metal or VM-based Kubernetes clusters where the node's root filesystem is ext4/xfs/btrfs (all support ID-mapped mounts since Linux 5.12-5.19). +- Managed Kubernetes services (EKS, GKE, AKS) on nodes running a supported kernel. + +**Where they do not work:** + +- k3s-in-Docker / kind / Docker-in-Docker dev clusters where the inner container uses overlayfs on top of the outer container's overlayfs. The nested overlayfs does not support `MOUNT_ATTR_IDMAP`. +- Nodes running kernels older than 5.12. +- Nodes using filesystems that have not added ID-mapped mount support (e.g., NFS on older kernels). + +The e2e test (`e2e/rust/tests/user_namespaces.rs`) accounts for this by verifying only the pod spec fields (`hostUsers`, capabilities) rather than attempting to run a command inside the sandbox. + +## Deploying to a real cluster with Helm + +User namespaces can be tested end-to-end on Kubernetes 1.33+ clusters where the feature is available (beta through 1.35, GA in 1.36+) with a supporting container runtime. Deploy the gateway with Helm and set `server.enableUserNamespaces=true`: + +```shell +helm install openshell deploy/helm/openshell -n openshell \ + --set server.enableUserNamespaces=true \ + --set server.sandboxImage="ghcr.io/nvidia/openshell-community/sandboxes/base:latest" \ + ... +``` + +The supervisor binary must be present at `/opt/openshell/bin/openshell-sandbox` on every node (hostPath mount). On SELinux-enforcing nodes (RHEL, CoreOS), label it with `chcon -t container_file_t`. 
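+
+For a small cluster this can be done by hand. A sketch, assuming shell access to each node and a locally built `openshell-sandbox` binary (the OCP testing guide referenced below automates the same step with a DaemonSet):
+
+```shell
+# Run on every node. install -D creates /opt/openshell/bin if it does not exist.
+install -D -m 0755 openshell-sandbox /opt/openshell/bin/openshell-sandbox
+chcon -t container_file_t /opt/openshell/bin/openshell-sandbox
+ls -lZ /opt/openshell/bin/openshell-sandbox   # expect container_file_t in the context
+```
+
+Without the label, enforcing nodes deny the sandbox container access to the binary, which surfaces as `Permission denied` at startup.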
+ +This has been validated end-to-end on OCP 4.22 (K8s 1.35.3, CRI-O 1.35, RHEL CoreOS, kernel 5.14) with full SSH tunnel, workspace init, and sandbox command execution under user namespace isolation. See [kubernetes-user-namespaces-ocp-testing.md](kubernetes-user-namespaces-ocp-testing.md) for the complete step-by-step reproduction guide. + +## Verification + +1. `mise run pre-commit` -- lint and format pass +2. `mise run test` -- unit tests pass including new tests for: + - `hostUsers: false` present/absent in generated pod spec based on config combinations + - Extended capability list when user namespaces enabled + - `platform_config_bool` helper + - `Directory` type on supervisor volume +3. `mise run e2e` -- the `user_namespaces` test verifies pod spec correctness against the local dev cluster +4. On a Kubernetes 1.33+ cluster with user namespace support available (OCP, GKE, EKS, bare-metal): deploy with Helm, create a sandbox, and verify `cat /proc/self/uid_map` shows a non-identity mapping (UID 0 maps to a high host UID) diff --git a/crates/openshell-core/src/config.rs b/crates/openshell-core/src/config.rs index 2fbdb1b1d..c469e3aef 100644 --- a/crates/openshell-core/src/config.rs +++ b/crates/openshell-core/src/config.rs @@ -192,6 +192,14 @@ pub struct Config { /// allowing them to reach services running on the Docker host. #[serde(default)] pub host_gateway_ip: String, + + /// Enable Kubernetes user namespace isolation (`hostUsers: false`) for + /// sandbox pods. When enabled, container UID 0 maps to an unprivileged + /// host UID and capabilities become namespaced. Requires Kubernetes 1.33+ + /// with user namespace support available (beta through 1.35, GA in 1.36+), + /// plus a supporting container runtime and Linux 5.12+. + #[serde(default)] + pub enable_user_namespaces: bool, } /// TLS configuration. 
@@ -245,6 +253,7 @@ impl Config { ssh_session_ttl_secs: default_ssh_session_ttl_secs(), client_tls_secret_name: String::new(), host_gateway_ip: String::new(), + enable_user_namespaces: false, } } diff --git a/crates/openshell-driver-kubernetes/src/config.rs b/crates/openshell-driver-kubernetes/src/config.rs index be3666130..f20e7da73 100644 --- a/crates/openshell-driver-kubernetes/src/config.rs +++ b/crates/openshell-driver-kubernetes/src/config.rs @@ -12,4 +12,5 @@ pub struct KubernetesComputeConfig { pub ssh_handshake_skew_secs: u64, pub client_tls_secret_name: String, pub host_gateway_ip: String, + pub enable_user_namespaces: bool, } diff --git a/crates/openshell-driver-kubernetes/src/driver.rs b/crates/openshell-driver-kubernetes/src/driver.rs index 444e0f55d..6e4a6f13f 100644 --- a/crates/openshell-driver-kubernetes/src/driver.rs +++ b/crates/openshell-driver-kubernetes/src/driver.rs @@ -319,6 +319,7 @@ impl KubernetesComputeDriver { self.ssh_handshake_skew_secs(), &self.config.client_tls_secret_name, &self.config.host_gateway_ip, + self.config.enable_user_namespaces, ); let api = self.api(); @@ -674,7 +675,7 @@ fn supervisor_volume() -> serde_json::Value { "name": SUPERVISOR_VOLUME_NAME, "hostPath": { "path": SUPERVISOR_HOST_PATH, - "type": "DirectoryOrCreate" + "type": "Directory" } }) } @@ -887,6 +888,7 @@ fn sandbox_to_k8s_spec( ssh_handshake_skew_secs: u64, client_tls_secret_name: &str, host_gateway_ip: &str, + enable_user_namespaces: bool, ) -> serde_json::Value { let mut root = serde_json::Map::new(); @@ -929,6 +931,7 @@ fn sandbox_to_k8s_spec( client_tls_secret_name, host_gateway_ip, inject_workspace, + enable_user_namespaces, ), ); if !template.agent_socket_path.is_empty() { @@ -975,6 +978,7 @@ fn sandbox_to_k8s_spec( client_tls_secret_name, host_gateway_ip, inject_workspace, + enable_user_namespaces, ), ); } @@ -1000,6 +1004,7 @@ fn sandbox_template_to_k8s( client_tls_secret_name: &str, host_gateway_ip: &str, inject_workspace: bool, + enable_user_namespaces: bool, ) -> serde_json::Value { // The supervisor binary is always side-loaded from the node filesystem // via a hostPath volume, regardless of which sandbox image is used. @@ -1020,6 +1025,21 @@ fn sandbox_template_to_k8s( ); } + // Per-sandbox platform_config.host_users overrides the cluster-wide default. + let use_user_namespaces = platform_config_bool(template, "host_users") + .map(|host_users| !host_users) + .unwrap_or(enable_user_namespaces); + + if use_user_namespaces { + spec.insert("hostUsers".to_string(), serde_json::json!(false)); + if gpu { + warn!( + "GPU sandbox with user namespaces enabled — \ + NVIDIA device plugin compatibility is unverified" + ); + } + } + let mut container = serde_json::Map::new(); container.insert("name".to_string(), serde_json::json!("agent")); // Use template image if provided, otherwise fall back to default @@ -1054,17 +1074,19 @@ fn sandbox_template_to_k8s( container.insert("env".to_string(), serde_json::Value::Array(env)); - // The sandbox process needs SYS_ADMIN (for seccomp filter installation and - // network namespace creation), NET_ADMIN (for network namespace veth setup), - // SYS_PTRACE (for the CONNECT proxy to read /proc//fd/ of sandbox-user - // processes to resolve binary identity for network policy enforcement), - // and SYSLOG (for reading /dev/kmsg to surface bypass detection diagnostics). - // This mirrors the capabilities used by `mise run sandbox`. 
+ let mut capabilities: Vec<&str> = vec!["SYS_ADMIN", "NET_ADMIN", "SYS_PTRACE", "SYSLOG"]; + if use_user_namespaces { + // In a user namespace the bounding set is reset. SETUID/SETGID are + // needed for the supervisor to drop privileges to the sandbox user. + // DAC_READ_SEARCH is needed for cross-UID /proc//fd/ access + // for process identity resolution in network policy enforcement. + capabilities.extend(["SETUID", "SETGID", "DAC_READ_SEARCH"]); + } container.insert( "securityContext".to_string(), serde_json::json!({ "capabilities": { - "add": ["SYS_ADMIN", "NET_ADMIN", "SYS_PTRACE", "SYSLOG"] + "add": capabilities } }), ); @@ -1288,6 +1310,15 @@ fn platform_config_string(template: &SandboxTemplate, key: &str) -> Option Option { + let config = template.platform_config.as_ref()?; + let value = config.fields.get(key)?; + match value.kind.as_ref() { + Some(prost_types::value::Kind::BoolValue(b)) => Some(*b), + _ => None, + } +} + /// Extract a nested Struct value from the template's `platform_config`, /// converting it to `serde_json::Value`. fn platform_config_struct(template: &SandboxTemplate, key: &str) -> Option { @@ -1494,7 +1525,7 @@ mod tests { assert_eq!(volumes.len(), 1); assert_eq!(volumes[0]["name"], SUPERVISOR_VOLUME_NAME); assert_eq!(volumes[0]["hostPath"]["path"], SUPERVISOR_HOST_PATH); - assert_eq!(volumes[0]["hostPath"]["type"], "DirectoryOrCreate"); + assert_eq!(volumes[0]["hostPath"]["type"], "Directory"); // Agent container command should be overridden let command = pod_template["spec"]["containers"][0]["command"] @@ -1580,6 +1611,7 @@ mod tests { "", "", true, + false, ); assert_eq!( @@ -1623,6 +1655,7 @@ mod tests { "", "", true, + false, ); assert_eq!( @@ -1662,6 +1695,7 @@ mod tests { "", "", true, + false, ); assert_eq!( @@ -1696,6 +1730,7 @@ mod tests { "", "", true, + false, ); let limits = &pod_template["spec"]["containers"][0]["resources"]["limits"]; @@ -1723,6 +1758,7 @@ mod tests { "", "172.17.0.1", true, + false, ); let host_aliases = pod_template["spec"]["hostAliases"] @@ -1754,6 +1790,7 @@ mod tests { "", "", true, + false, ); assert!( @@ -1780,6 +1817,7 @@ mod tests { "my-tls-secret", "", true, + false, ); let volumes = pod_template["spec"]["volumes"] @@ -1925,6 +1963,7 @@ mod tests { "", "", false, // user provided custom VCTs + false, ); // No init container should be present @@ -1947,4 +1986,209 @@ mod tests { "workspace mount must NOT be present when inject_workspace is false" ); } + + // ----------------------------------------------------------------------- + // User namespace tests + // ----------------------------------------------------------------------- + + fn default_template_to_k8s(enable_user_namespaces: bool) -> serde_json::Value { + sandbox_template_to_k8s( + &SandboxTemplate::default(), + false, + "openshell/sandbox:latest", + "", + "sandbox-id", + "sandbox-name", + "https://gateway.example.com", + "0.0.0.0:2222", + "secret", + 300, + &std::collections::HashMap::new(), + "", + "", + true, + enable_user_namespaces, + ) + } + + #[test] + fn user_namespaces_disabled_by_default() { + let pod_template = default_template_to_k8s(false); + assert!( + pod_template["spec"]["hostUsers"].is_null(), + "hostUsers must not be set when user namespaces are disabled" + ); + let caps = pod_template["spec"]["containers"][0]["securityContext"]["capabilities"]["add"] + .as_array() + .unwrap(); + assert_eq!(caps.len(), 4); + assert!(!caps.contains(&serde_json::json!("SETUID"))); + } + + #[test] + fn user_namespaces_enabled_by_cluster_default() { + let 
pod_template = default_template_to_k8s(true); + assert_eq!( + pod_template["spec"]["hostUsers"], + serde_json::json!(false), + "hostUsers must be false when user namespaces are enabled" + ); + } + + #[test] + fn user_namespaces_adds_extra_capabilities() { + let pod_template = default_template_to_k8s(true); + let caps = pod_template["spec"]["containers"][0]["securityContext"]["capabilities"]["add"] + .as_array() + .unwrap(); + assert!(caps.contains(&serde_json::json!("SYS_ADMIN"))); + assert!(caps.contains(&serde_json::json!("NET_ADMIN"))); + assert!(caps.contains(&serde_json::json!("SYS_PTRACE"))); + assert!(caps.contains(&serde_json::json!("SYSLOG"))); + assert!(caps.contains(&serde_json::json!("SETUID"))); + assert!(caps.contains(&serde_json::json!("SETGID"))); + assert!(caps.contains(&serde_json::json!("DAC_READ_SEARCH"))); + assert_eq!(caps.len(), 7); + } + + #[test] + fn user_namespaces_per_sandbox_override_enables() { + let template = SandboxTemplate { + platform_config: Some(Struct { + fields: [( + "host_users".to_string(), + Value { + kind: Some(Kind::BoolValue(false)), + }, + )] + .into_iter() + .collect(), + }), + ..SandboxTemplate::default() + }; + + let pod_template = sandbox_template_to_k8s( + &template, + false, + "openshell/sandbox:latest", + "", + "sandbox-id", + "sandbox-name", + "https://gateway.example.com", + "0.0.0.0:2222", + "secret", + 300, + &std::collections::HashMap::new(), + "", + "", + true, + false, // cluster default is off + ); + + assert_eq!( + pod_template["spec"]["hostUsers"], + serde_json::json!(false), + "per-sandbox host_users: false must enable user namespaces" + ); + let caps = pod_template["spec"]["containers"][0]["securityContext"]["capabilities"]["add"] + .as_array() + .unwrap(); + assert!(caps.contains(&serde_json::json!("SETUID"))); + } + + #[test] + fn user_namespaces_per_sandbox_override_disables() { + let template = SandboxTemplate { + platform_config: Some(Struct { + fields: [( + "host_users".to_string(), + Value { + kind: Some(Kind::BoolValue(true)), + }, + )] + .into_iter() + .collect(), + }), + ..SandboxTemplate::default() + }; + + let pod_template = sandbox_template_to_k8s( + &template, + false, + "openshell/sandbox:latest", + "", + "sandbox-id", + "sandbox-name", + "https://gateway.example.com", + "0.0.0.0:2222", + "secret", + 300, + &std::collections::HashMap::new(), + "", + "", + true, + true, // cluster default is on + ); + + assert!( + pod_template["spec"]["hostUsers"].is_null(), + "per-sandbox host_users: true must disable user namespaces even when cluster default is on" + ); + let caps = pod_template["spec"]["containers"][0]["securityContext"]["capabilities"]["add"] + .as_array() + .unwrap(); + assert_eq!( + caps.len(), + 4, + "extra capabilities must not be added when user namespaces are disabled" + ); + } + + #[test] + fn platform_config_bool_extracts_value() { + let template = SandboxTemplate { + platform_config: Some(Struct { + fields: [( + "my_bool".to_string(), + Value { + kind: Some(Kind::BoolValue(true)), + }, + )] + .into_iter() + .collect(), + }), + ..SandboxTemplate::default() + }; + + assert_eq!(platform_config_bool(&template, "my_bool"), Some(true)); + assert_eq!(platform_config_bool(&template, "missing"), None); + } + + #[test] + fn platform_config_bool_returns_none_for_non_bool() { + let template = SandboxTemplate { + platform_config: Some(Struct { + fields: [( + "a_string".to_string(), + Value { + kind: Some(Kind::StringValue("hello".to_string())), + }, + )] + .into_iter() + .collect(), + }), + 
..SandboxTemplate::default() + }; + + assert_eq!(platform_config_bool(&template, "a_string"), None); + } + + #[test] + fn supervisor_volume_uses_directory_type() { + let vol = supervisor_volume(); + assert_eq!( + vol["hostPath"]["type"], "Directory", + "supervisor hostPath must use Directory (not DirectoryOrCreate) for user namespace compatibility" + ); + } } diff --git a/crates/openshell-driver-kubernetes/src/main.rs b/crates/openshell-driver-kubernetes/src/main.rs index 4b871d77f..06d943807 100644 --- a/crates/openshell-driver-kubernetes/src/main.rs +++ b/crates/openshell-driver-kubernetes/src/main.rs @@ -57,6 +57,9 @@ struct Args { #[arg(long, env = "OPENSHELL_HOST_GATEWAY_IP")] host_gateway_ip: Option, + + #[arg(long, env = "OPENSHELL_ENABLE_USER_NAMESPACES")] + enable_user_namespaces: bool, } #[tokio::main] @@ -78,6 +81,7 @@ async fn main() -> Result<()> { ssh_handshake_skew_secs: args.ssh_handshake_skew_secs, client_tls_secret_name: args.client_tls_secret_name.unwrap_or_default(), host_gateway_ip: args.host_gateway_ip.unwrap_or_default(), + enable_user_namespaces: args.enable_user_namespaces, }) .await .into_diagnostic()?; diff --git a/crates/openshell-server/src/cli.rs b/crates/openshell-server/src/cli.rs index 2e6e2823b..cbc9e6e29 100644 --- a/crates/openshell-server/src/cli.rs +++ b/crates/openshell-server/src/cli.rs @@ -177,6 +177,11 @@ struct Args { #[arg(long, env = "OPENSHELL_VM_TLS_KEY")] vm_tls_key: Option, + /// Enable Kubernetes user namespace isolation (hostUsers: false) for + /// sandbox pods. + #[arg(long, env = "OPENSHELL_ENABLE_USER_NAMESPACES")] + enable_user_namespaces: bool, + /// Disable TLS entirely — listen on plaintext HTTP. /// Use this when the gateway sits behind a reverse proxy or tunnel /// (e.g. Cloudflare Tunnel) that terminates TLS at the edge. @@ -304,6 +309,8 @@ async fn run_from_args(args: Args) -> Result<()> { config = config.with_host_gateway_ip(ip); } + config.enable_user_namespaces = args.enable_user_namespaces; + let vm_config = VmComputeConfig { state_dir: args.vm_driver_state_dir, driver_dir: args.driver_dir, diff --git a/crates/openshell-server/src/compute/mod.rs b/crates/openshell-server/src/compute/mod.rs index 19cfd5faf..3f32744f3 100644 --- a/crates/openshell-server/src/compute/mod.rs +++ b/crates/openshell-server/src/compute/mod.rs @@ -955,6 +955,19 @@ fn build_platform_config(template: &SandboxTemplate) -> Option/exe` (the kernel-trusted executable path). diff --git a/e2e/rust/tests/user_namespaces.rs b/e2e/rust/tests/user_namespaces.rs new file mode 100644 index 000000000..9aa714767 --- /dev/null +++ b/e2e/rust/tests/user_namespaces.rs @@ -0,0 +1,190 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +#![cfg(feature = "e2e")] + +//! E2E test: verify Kubernetes user namespace pod spec generation. +//! +//! Enables `OPENSHELL_ENABLE_USER_NAMESPACES` on the gateway, triggers sandbox +//! creation, and inspects the resulting pod spec to confirm: +//! 1. `spec.hostUsers` is `false` +//! 2. The container security context includes the extra capabilities +//! (SETUID, SETGID, DAC_READ_SEARCH) required for user namespace operation +//! +//! The sandbox pod may fail to start in Docker-in-Docker dev clusters where the +//! filesystem does not support ID-mapped mounts. The test inspects the pod spec +//! regardless of runtime success. 
+ +use std::process::Stdio; +use std::time::{Duration, SystemTime, UNIX_EPOCH}; + +use openshell_e2e::harness::binary::openshell_cmd; +use tokio::process::Child; + +async fn kubectl(args: &[&str]) -> Result { + let output = tokio::process::Command::new("docker") + .args(["exec", "openshell-cluster-openshell", "kubectl"]) + .args(args) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .output() + .await + .map_err(|e| format!("failed to run kubectl: {e}"))?; + + let stdout = String::from_utf8_lossy(&output.stdout).to_string(); + let stderr = String::from_utf8_lossy(&output.stderr).to_string(); + + if !output.status.success() { + return Err(format!("kubectl {args:?} failed: {stdout}{stderr}")); + } + Ok(stdout) +} + +async fn set_user_namespaces(enable: bool) -> Result<(), String> { + let env_arg = if enable { + "OPENSHELL_ENABLE_USER_NAMESPACES=true" + } else { + "OPENSHELL_ENABLE_USER_NAMESPACES-" + }; + + kubectl(&[ + "set", "env", "statefulset/openshell", + "-n", "openshell", env_arg, + ]).await?; + + kubectl(&[ + "rollout", "status", "statefulset/openshell", + "-n", "openshell", "--timeout=120s", + ]).await?; + + // Give the gateway time to fully initialize after rollout. + tokio::time::sleep(Duration::from_secs(5)).await; + + Ok(()) +} + +async fn delete_sandbox(name: &str) { + let _ = kubectl(&["delete", "sandbox", name, "-n", "openshell"]).await; +} + +fn unique_sandbox_name() -> String { + let suffix = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_millis(); + format!("userns-e2e-{suffix}") +} + +async fn stop_child(child: &mut Child) { + let _ = child.kill().await; + let _ = child.wait().await; +} + +async fn wait_for_sandbox(name: &str, timeout_secs: u64) -> Result<(), String> { + let deadline = tokio::time::Instant::now() + Duration::from_secs(timeout_secs); + while tokio::time::Instant::now() < deadline { + if let Ok(n) = kubectl(&[ + "get", "sandbox", name, "-n", "openshell", + "-o", "jsonpath={.metadata.name}", + ]).await { + if !n.trim().is_empty() { + return Ok(()); + } + } + tokio::time::sleep(Duration::from_secs(2)).await; + } + Err(format!("sandbox {name} did not appear within {timeout_secs}s")) +} + +/// Find a sandbox pod by its sandbox CRD name. The CRD controller creates a +/// pod with the same name as the Sandbox resource. +async fn wait_for_sandbox_pod(name: &str, timeout_secs: u64) -> Result<(), String> { + let deadline = tokio::time::Instant::now() + Duration::from_secs(timeout_secs); + while tokio::time::Instant::now() < deadline { + if let Ok(n) = kubectl(&[ + "get", "pod", name, "-n", "openshell", + "-o", "jsonpath={.metadata.name}", + ]).await { + if !n.trim().is_empty() { + return Ok(()); + } + } + tokio::time::sleep(Duration::from_secs(2)).await; + } + Err(format!("sandbox pod {name} did not appear within {timeout_secs}s")) +} + +#[tokio::test] +async fn sandbox_pod_spec_has_user_namespace_fields() { + // Enable user namespaces on the gateway. + set_user_namespaces(true) + .await + .expect("failed to enable user namespaces on gateway"); + + let sandbox_name = unique_sandbox_name(); + + // Start sandbox creation in the background. The pod may never become + // ready in DinD environments, so we spawn the CLI and inspect the pod + // spec independently. 
+ let mut cmd = openshell_cmd(); + cmd.arg("sandbox").arg("create") + .arg("--name").arg(&sandbox_name) + .arg("--").arg("sleep").arg("infinity"); + cmd.stdout(Stdio::piped()).stderr(Stdio::piped()); + + let mut child = cmd.spawn().expect("failed to spawn openshell create"); + + if let Err(e) = wait_for_sandbox(&sandbox_name, 60).await { + stop_child(&mut child).await; + delete_sandbox(&sandbox_name).await; + set_user_namespaces(false).await.ok(); + panic!("{e}"); + } + + // Wait for the pod to be created (the CRD controller creates it). + if let Err(e) = wait_for_sandbox_pod(&sandbox_name, 60).await { + stop_child(&mut child).await; + delete_sandbox(&sandbox_name).await; + set_user_namespaces(false).await.ok(); + panic!("{e}"); + } + + // Inspect the pod spec for hostUsers. + let host_users = kubectl(&[ + "get", "pod", &sandbox_name, "-n", "openshell", + "-o", "jsonpath={.spec.hostUsers}", + ]).await; + + // Inspect capabilities on the agent container. + let caps = kubectl(&[ + "get", "pod", &sandbox_name, "-n", "openshell", + "-o", "jsonpath={.spec.containers[?(@.name=='agent')].securityContext.capabilities.add}", + ]).await; + + // Clean up. + stop_child(&mut child).await; + delete_sandbox(&sandbox_name).await; + set_user_namespaces(false).await.ok(); + + // Assert hostUsers is false. + let host_users_val = host_users.expect("failed to get hostUsers from pod spec"); + assert_eq!( + host_users_val.trim(), "false", + "sandbox pod must have spec.hostUsers=false when user namespaces are enabled" + ); + + // Assert extra capabilities are present. + let caps_val = caps.expect("failed to get capabilities from pod spec"); + for cap in ["SETUID", "SETGID", "DAC_READ_SEARCH"] { + assert!( + caps_val.contains(cap), + "sandbox pod must include {cap} in capabilities when user namespaces are enabled, got: {caps_val}" + ); + } + for cap in ["SYS_ADMIN", "NET_ADMIN", "SYS_PTRACE", "SYSLOG"] { + assert!( + caps_val.contains(cap), + "sandbox pod must include {cap} in capabilities, got: {caps_val}" + ); + } +} diff --git a/proto/openshell.proto b/proto/openshell.proto index 2434f1a80..b53c71ba7 100644 --- a/proto/openshell.proto +++ b/proto/openshell.proto @@ -227,6 +227,12 @@ message SandboxTemplate { google.protobuf.Struct resources = 7; // Optional platform-specific volume claim templates. google.protobuf.Struct volume_claim_templates = 9; + // Enable Kubernetes user namespace isolation (hostUsers: false). + // When true, container UID 0 maps to a non-root host UID and capabilities + // become namespaced. Requires Kubernetes 1.33+ with user namespace support + // available (beta through 1.35, GA in 1.36+) and a supporting runtime. + // When unset, the cluster-wide default is used. + optional bool user_namespaces = 10; } // User-facing sandbox status derived by the gateway from compute-driver observations.