On Tue, 11 Oct 2022 16:40:15 +1100 David Gibson <david(a)gibson.dropbear.id.au> wrote: @@ -251,7 +275,19 @@ int isolate_prefork(struct ctx *c) return -errno; } - drop_caps(); /* Relative to the new user namespace this time. */ + /* Drop capabilites in our new userns */ + if (c->mode == MODE_PASTA) { + /* Keep CAP_SYS_ADMIN, so that we can setns() to the + * netns when we need to act upon it + */ + ns_caps |= 1UL << CAP_SYS_ADMIN; + /* Keep CAP_NET_BIND_SERVICE, so we can splice + * outbound connections to low port numbers + */ + ns_caps |= 1UL << CAP_NET_BIND_SERVICE; + } + + drop_caps_ep_except(ns_caps); Hmm, I didn't really look into this yet, but there seems to be an issue with filesystem-bound network namespaces now. Running something like: pasta --config-net --netns /run/user/1000/netns/netns-6466ff4b-1efc-2b58-685b-cbc12feb9ccc (from Podman), this happens: readlink("/proc/self/exe", "/usr/local/bin/passt.avx2", 4095) = 25 capget({version=_LINUX_CAPABILITY_VERSION_3, pid=0}, {effective=1<<CAP_CHOWN|1<<CAP_DAC_OVERRIDE|1<<CAP_DAC_READ_SEARCH|1<<CAP_FOWNER|1<<CAP_FSETID|1<<CAP_KILL|1<<CAP_SETGID|1<<CAP_SETUID|1<<CAP_SETPCAP|1<<CAP_LINUX_IMMUTABLE|1<<CAP_NET_BIND_SERVICE|1<<CAP_NET_BROADCAST|1<<CAP_NET_ADMIN|1<<CAP_NET_RAW|1<<CAP_IPC_LOCK|1<<CAP_IPC_OWNER|1<<CAP_SYS_MODULE|1<<CAP_SYS_RAWIO|1<<CAP_SYS_CHROOT|1<<CAP_SYS_PTRACE|1<<CAP_SYS_PACCT|1<<CAP_SYS_ADMIN|1<<CAP_SYS_BOOT|1<<CAP_SYS_NICE|1<<CAP_SYS_RESOURCE|1<<CAP_SYS_TIME|1<<CAP_SYS_TTY_CONFIG|1<<CAP_MKNOD|1<<CAP_LEASE|1<<CAP_AUDIT_WRITE|1<<CAP_AUDIT_CONTROL|1<<CAP_SETFCAP|1<<CAP_MAC_OVERRIDE|1<<CAP_MAC_ADMIN|1<<CAP_SYSLOG|1<<CAP_WAKE_ALARM|1<<CAP_BLOCK_SUSPEND|1<<CAP_AUDIT_READ|1<<CAP_PERFMON|1<<CAP_BPF|1<<CAP_CHECKPOINT_RESTORE, permitted=1<<CAP_CHOWN|1<<CAP_DAC_OVERRIDE|1<<CAP_DAC_READ_SEARCH|1<<CAP_FOWNER|1<<CAP_FSETID|1<<CAP_KILL|1<<CAP_SETGID|1<<CAP_SETUID|1<<CAP_SETPCAP|1<<CAP_LINUX_IMMUTABLE|1<<CAP_NET_BIND_SERVICE|1<<CAP_NET_BROADCAST|1<<CAP_NET_ADMIN|1<<CAP_NET_RAW|1<<CAP_IPC_LOCK|1<<CAP_IPC_OWNER|1<<CAP_SYS_MODULE|1<<CAP_SYS_RAWIO|1<<CAP_SYS_CHROOT|1<<CAP_SYS_PTRACE|1<<CAP_SYS_PACCT|1<<CAP_SYS_ADMIN|1<<CAP_SYS_BOOT|1<<CAP_SYS_NICE|1<<CAP_SYS_RESOURCE|1<<CAP_SYS_TIME|1<<CAP_SYS_TTY_CONFIG|1<<CAP_MKNOD|1<<CAP_LEASE|1<<CAP_AUDIT_WRITE|1<<CAP_AUDIT_CONTROL|1<<CAP_SETFCAP|1<<CAP_MAC_OVERRIDE|1<<CAP_MAC_ADMIN|1<<CAP_SYSLOG|1<<CAP_WAKE_ALARM|1<<CAP_BLOCK_SUSPEND|1<<CAP_AUDIT_READ|1<<CAP_PERFMON|1<<CAP_BPF|1<<CAP_CHECKPOINT_RESTORE, inheritable=0}) = 0
capset({version=_LINUX_CAPABILITY_VERSION_3, pid=0}, {effective=1<<CAP_NET_BIND_SERVICE, permitted=1<<CAP_NET_BIND_SERVICE, inheritable=0}) = 0 [...] getegid() = 0 openat(AT_FDCWD, "/proc/self/uid_map", O_RDONLY|O_CLOEXEC) = 7 read(7, " 0 1000 1"..., 8192) = 66 close(7) = 0 setgroups(0, NULL) = -1 EPERM (Operation not permitted) setgid(0) = 0 setuid(0) = 0 openat(AT_FDCWD, "/run/user/1000/netns/netns-6466ff4b-1efc-2b58-685b-cbc12feb9ccc", O_RDONLY|O_CLOEXEC) = 7 clone(child_stack=0x7ffef5a229a0, flags=CLONE_VM|CLONE_FILES|CLONE_VFORK|SIGCHLDstrace: Process 1763223 attached <unfinished ...> [pid 1763223] setns(7, CLONE_NEWNET) = -1 EPERM (Operation not permitted) [pid 1763223] exit(0) = ? [pid 1763222] <... clone resumed>) = 1763223 [pid 1763223] +++ exited with 0 +++ --- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_EXITED, si_pid=1763223, si_uid=0, si_status=0, si_utime=0, si_stime=0} --- waitid(P_ALL, 0, NULL, WNOHANG|WEXITED, NULL) = 0 waitid(P_ALL, 0, NULL, WNOHANG|WEXITED, NULL) = -1 ECHILD (No child processes) rt_sigreturn({mask=[]}) = 1763223 sendto(5, "<3> Couldn't switch to pasta nam"..., 40, 0, NULL, 0) = 40 write(2, "Couldn't switch to pasta namespa"..., 35Couldn't switch to pasta namespaces) = 35 write(2, "\n", 1 -- Stefano