Our current handling of capabilities isn't quite right. In particular, drop_caps() attempts to remove capabilities from the bounding set, which usually won't work, and even if it does won't have the effect we want. This series corrects that, as well as making some other fixes and cleanups in adjacent code. Changes since v1: * A number of minor style fixes * Should now correctly handle the case where we join an existing userns/netns, or just an existing netns (--netns-only) David Gibson (11): test: Move slower tests to end of test run pasta: More general way of starting spawned shell as a login shell pasta_start_ns() always ends in parent context Remove unhelpful drop_caps() call in pasta_start_ns() isolation: Clarify various self-isolation steps Replace FWRITE with a function isolation: Refactor isolate_user() to allow for a common exit path isolation: Replace drop_caps() with a version that actually does something isolation: Prevent any child processes gaining capabilities isolation: Only configure UID/GID mappings in userns when spawning shell Rename pasta_setup_ns() to pasta_spawn_cmd() conf.c | 5 +- isolation.c | 255 +++++++++++++++++++++++++++++++++++++++++++++------- isolation.h | 9 +- passt.c | 8 +- pasta.c | 72 +++++++++------ pasta.h | 3 +- test/run | 20 ++--- util.c | 33 +++++++ util.h | 13 +-- 9 files changed, 324 insertions(+), 94 deletions(-) -- 2.37.3
The distro and performance tests are by far the slowest part of the passt testsuite. Move them to the end of the testsuite run, so that it's easier to do a quick test during development by letting the other tests run then interrupting the test runner. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- test/run | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/test/run b/test/run index 4bb9cd8..d1f24ef 100755 --- a/test/run +++ b/test/run @@ -65,13 +65,6 @@ run() { test build/clang_tidy teardown build - setup distro - test distro/debian - test distro/fedora - test distro/opensuse - test distro/ubuntu - teardown distro - setup pasta test pasta/ndp test pasta/dhcp @@ -98,6 +91,10 @@ run() { test passt_in_ns/shutdown teardown passt_in_ns + setup two_guests + test two_guests/basic + teardown two_guests + VALGRIND=0 setup passt_in_ns test passt/ndp @@ -109,9 +106,12 @@ run() { test passt_in_ns/shutdown teardown passt_in_ns - setup two_guests - test two_guests/basic - teardown two_guests + setup distro + test distro/debian + test distro/fedora + test distro/opensuse + test distro/ubuntu + teardown distro perf_finish [ ${CI} -eq 1 ] && video_stop -- 2.37.3
When invoked so as to spawn a shell, pasta checks explicitly for the shell being bash and if so, adds a "-l" option to make it a login shell. This is not ideal, since this is a bash specific option and requires pasta to know about specific shell variants. There's a general convention for starting a login shell, which is to prepend a "-" to argv[0]. Use this approach instead, so we don't need bash specific logic. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- pasta.c | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/pasta.c b/pasta.c index 1dd8267..97cd318 100644 --- a/pasta.c +++ b/pasta.c @@ -148,10 +148,12 @@ void pasta_open_ns(struct ctx *c, const char *netns) /** * struct pasta_setup_ns_arg - Argument for pasta_setup_ns() + * @exe: Executable to run * @argv: Command and arguments to run */ struct pasta_setup_ns_arg { - char **argv; + const char *exe; + char *const *argv; }; /** @@ -162,12 +164,13 @@ struct pasta_setup_ns_arg { */ static int pasta_setup_ns(void *arg) { - struct pasta_setup_ns_arg *a = (struct pasta_setup_ns_arg *)arg; + const struct pasta_setup_ns_arg *a; FWRITE("/proc/sys/net/ipv4/ping_group_range", "0 0", "Cannot set ping_group_range, ICMP requests might fail"); - execvp(a->argv[0], a->argv); + a = (const struct pasta_setup_ns_arg *)arg; + execvp(a->exe, a->argv); perror("execvp"); exit(EXIT_FAILURE); @@ -182,26 +185,31 @@ static int pasta_setup_ns(void *arg) void pasta_start_ns(struct ctx *c, int argc, char *argv[]) { struct pasta_setup_ns_arg arg = { + .exe = argv[0], .argv = argv, }; - char *shell = getenv("SHELL"); - char *sh_argv[] = { shell, NULL }; - char *bash_argv[] = { shell, "-l", NULL }; char ns_fn_stack[NS_FN_STACK_SIZE]; + char *sh_argv[] = { NULL, NULL }; + char sh_arg0[PATH_MAX + 1]; c->foreground = 1; if (!c->debug) c->quiet = 1; - if (!shell) - shell = "/bin/sh"; if (argc == 0) { - if (strstr(shell, "/bash")) { - arg.argv = bash_argv; - } else { - arg.argv = sh_argv; + arg.exe = getenv("SHELL"); + if (!arg.exe) + arg.exe = "/bin/sh"; + + if ((size_t)snprintf(sh_arg0, sizeof(sh_arg0), + "-%s", arg.exe) >= sizeof(sh_arg0)) { + err("$SHELL is too long (%u bytes)", + strlen(arg.exe)); + exit(EXIT_FAILURE); } + sh_argv[0] = sh_arg0; + arg.argv = sh_argv; } pasta_child_pid = clone(pasta_setup_ns, -- 2.37.3
The end of pasta_start_ns() has a test against pasta_child_pid, testing if we're in the parent or the child. However we started the child running the pasta_setup_ns function which always exec()s or exit()s, so if we return from the clone() we are always in the parent, making that test unnecessary. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- pasta.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pasta.c b/pasta.c index 97cd318..1569590 100644 --- a/pasta.c +++ b/pasta.c @@ -225,10 +225,7 @@ void pasta_start_ns(struct ctx *c, int argc, char *argv[]) drop_caps(); - if (pasta_child_pid) { - NS_CALL(pasta_wait_for_ns, c); - return; - } + NS_CALL(pasta_wait_for_ns, c); } /** -- 2.37.3
drop_caps() has a number of bugs which mean it doesn't do what you'd expect. However, even if we fixed those, the call in pasta_start_ns() doesn't do anything useful: * In the common case, we're UID 0 at this point. In this case drop_caps() doesn't accomplish anything, because even with capabilities dropped, we are still privileged. * When attaching to an existing namespace with --userns or --netns-only we might not be UID 0. In this case it's too early to drop all capabilities: we need at least CAP_NET_ADMIN to configure the tap device in the namespace. Remove this call - we will still drop capabilities a little later in sandbox(). Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- pasta.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/pasta.c b/pasta.c index 1569590..c72efe5 100644 --- a/pasta.c +++ b/pasta.c @@ -223,8 +223,6 @@ void pasta_start_ns(struct ctx *c, int argc, char *argv[]) exit(EXIT_FAILURE); } - drop_caps(); - NS_CALL(pasta_wait_for_ns, c); } -- 2.37.3
We have a number of steps of self-isolation scattered across our code. Improve function names and add comments to make it clearer what the self isolation model is, what the steps do, and why they happen at the points they happen. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- isolation.c | 85 +++++++++++++++++++++++++++++++++++++++++++++++++---- isolation.h | 6 ++-- passt.c | 8 ++--- 3 files changed, 86 insertions(+), 13 deletions(-) diff --git a/isolation.c b/isolation.c index 124dea4..6ba5830 100644 --- a/isolation.c +++ b/isolation.c @@ -12,6 +12,48 @@ * Author: Stefano Brivio <sbrivio(a)redhat.com> * Author: David Gibson <david(a)gibson.dropbear.id.au> */ +/** + * DOC: Theory of Operation + * + * For security the passt/pasta process performs a number of + * self-isolations steps, dropping capabilities, setting namespaces + * and otherwise minimising the impact we can have on the system at + * large if we were compromised. + * + * Obviously we can't isolate ourselves from resources before we've + * done anything we need to do with those resources, so we have + * multiple stages of self-isolation. In order these are: + * + * 1. isolate_initial() + * ==================== + * + * Executed immediately after startup, drops capabilities we don't + * need at any point during execution (or which we gain back when we + * need by joining other namespaces). + * + * 2. isolate_user() + * ================= + * + * Executed once we know what user and user namespace we want to + * operate in. Sets our final UID & GID, and enters the correct user + * namespace. + * + * 3. isolate_prefork() + * ==================== + * + * Executed after all setup, but before daemonising (fork()ing into + * the background). Uses mount namespace and pivot_root() to remove + * our access to the filesystem. + * + * 4. isolate_postfork() + * ===================== + * + * Executed immediately after daemonizing, but before entering the + * actual packet forwarding phase of operation. Or, if not + * daemonizing, immediately after isolate_prefork(). Uses seccomp() + * to restrict ourselves to the handful of syscalls we need during + * runtime operation. + */ #include <errno.h> #include <fcntl.h> @@ -47,7 +89,7 @@ /** * drop_caps() - Drop capabilities we might have except for CAP_NET_BIND_SERVICE */ -void drop_caps(void) +static void drop_caps(void) { int i; @@ -59,12 +101,31 @@ void drop_caps(void) } } +/** + * isolate_initial() - Early, config independent self isolation + * + * Should: + * - drop unneeded capabilities + * Musn't: + * - remove filesytem access (we need to access files during setup) + */ +void isolate_initial(void) +{ + drop_caps(); +} + /** * isolate_user() - Switch to final UID/GID and move into userns * @uid: User ID to run as (in original userns) * @gid: Group ID to run as (in original userns) * @use_userns: Whether to join or create a userns * @userns: userns path to enter, may be empty + * + * Should: + * - set our final UID and GID + * - enter our final user namespace + * Mustn't: + * - remove filesystem access (we need that for further setup) */ void isolate_user(uid_t uid, gid_t gid, bool use_userns, const char *userns) { @@ -134,11 +195,19 @@ void isolate_user(uid_t uid, gid_t gid, bool use_userns, const char *userns) } /** - * sandbox() - Unshare IPC, mount, PID, UTS, and user namespaces, "unmount" root + * isolate_prefork() - Self isolation before daemonizing + * @c: Execution context * * Return: negative error code on failure, zero on success + * + * Should: + * - Move us to our own IPC and UTS namespaces + * - Move us to a mount namespace with only an empty directory + * - Drop unneeded capabilities (in the new user namespace) + * Mustn't: + * - Remove syscalls we need to daemonise */ -int sandbox(struct ctx *c) +int isolate_prefork(struct ctx *c) { int flags = CLONE_NEWIPC | CLONE_NEWNS | CLONE_NEWUTS; @@ -187,13 +256,19 @@ int sandbox(struct ctx *c) } /** - * seccomp() - Set up seccomp filters depending on mode, won't return on failure + * isolate_postfork() - Self isolation after daemonizing * @c: Execution context + * + * Should: + * - disable core dumps + * - limit to a minimal set of syscalls */ -void seccomp(const struct ctx *c) +void isolate_postfork(const struct ctx *c) { struct sock_fprog prog; + prctl(PR_SET_DUMPABLE, 0); + if (c->mode == MODE_PASST) { prog.len = (unsigned short)ARRAY_SIZE(filter_passt); prog.filter = filter_passt; diff --git a/isolation.h b/isolation.h index 2c73df7..70a38f9 100644 --- a/isolation.h +++ b/isolation.h @@ -7,9 +7,9 @@ #ifndef ISOLATION_H #define ISOLATION_H -void drop_caps(void); +void isolate_initial(void); void isolate_user(uid_t uid, gid_t gid, bool use_userns, const char *userns); -int sandbox(struct ctx *c); -void seccomp(const struct ctx *c); +int isolate_prefork(struct ctx *c); +void isolate_postfork(const struct ctx *c); #endif /* ISOLATION_H */ diff --git a/passt.c b/passt.c index 2c4a986..46f80a0 100644 --- a/passt.c +++ b/passt.c @@ -184,7 +184,7 @@ int main(int argc, char **argv) arch_avx2_exec(argv); - drop_caps(); + isolate_initial(); c.pasta_netns_fd = c.fd_tap = c.fd_tap_listen = -1; @@ -287,7 +287,7 @@ int main(int argc, char **argv) } } - if (sandbox(&c)) { + if (isolate_prefork(&c)) { err("Failed to sandbox process, exiting\n"); exit(EXIT_FAILURE); } @@ -297,9 +297,7 @@ int main(int argc, char **argv) else write_pidfile(pidfile_fd, getpid()); - prctl(PR_SET_DUMPABLE, 0); - - seccomp(&c); + isolate_postfork(&c); timer_init(&c, &now); -- 2.37.3
In a few places we use the FWRITE() macro to open a file, replace it's contents with a given string and close it again. There's no real reason this needs to be a macro rather than just a function though. Turn it into a function 'write_file()' and make some ancillary cleanups while we're there: - Add a return code so the caller can handle giving a useful error message - Handle the case of short write()s (unlikely, but possible) - Add O_TRUNC, to make sure we replace the existing contents entirely Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- isolation.c | 17 +++++++++-------- pasta.c | 4 ++-- util.c | 33 +++++++++++++++++++++++++++++++++ util.h | 13 +------------ 4 files changed, 45 insertions(+), 22 deletions(-) diff --git a/isolation.c b/isolation.c index 6ba5830..fda9cad 100644 --- a/isolation.c +++ b/isolation.c @@ -129,7 +129,8 @@ void isolate_initial(void) */ void isolate_user(uid_t uid, gid_t gid, bool use_userns, const char *userns) { - char nsmap[BUFSIZ]; + char uidmap[BUFSIZ]; + char gidmap[BUFSIZ]; /* First set our UID & GID in the original namespace */ if (setgroups(0, NULL)) { @@ -184,14 +185,14 @@ void isolate_user(uid_t uid, gid_t gid, bool use_userns, const char *userns) } /* Configure user and group mappings */ - snprintf(nsmap, BUFSIZ, "0 %u 1", uid); - FWRITE("/proc/self/uid_map", nsmap, "Cannot set uid_map in namespace"); + snprintf(uidmap, BUFSIZ, "0 %u 1", uid); + snprintf(gidmap, BUFSIZ, "0 %u 1", gid); - FWRITE("/proc/self/setgroups", "deny", - "Cannot write to setgroups in namespace"); - - snprintf(nsmap, BUFSIZ, "0 %u 1", gid); - FWRITE("/proc/self/gid_map", nsmap, "Cannot set gid_map in namespace"); + if (write_file("/proc/self/uid_map", uidmap) || + write_file("/proc/self/setgroups", "deny") || + write_file("/proc/self/gid_map", gidmap)) { + warn("Couldn't configure user namespace"); + } } /** diff --git a/pasta.c b/pasta.c index c72efe5..6314a29 100644 --- a/pasta.c +++ b/pasta.c @@ -166,8 +166,8 @@ static int pasta_setup_ns(void *arg) { const struct pasta_setup_ns_arg *a; - FWRITE("/proc/sys/net/ipv4/ping_group_range", "0 0", - "Cannot set ping_group_range, ICMP requests might fail"); + if (write_file("/proc/sys/net/ipv4/ping_group_range", "0 0")) + warn("Cannot set ping_group_range, ICMP requests might fail"); a = (const struct pasta_setup_ns_arg *)arg; execvp(a->exe, a->argv); diff --git a/util.c b/util.c index 6b86ead..884b1f4 100644 --- a/util.c +++ b/util.c @@ -547,3 +547,36 @@ int fls(unsigned long x) return y; } + +/** + * write_file() - Replace contents of file with a string + * @path: File to write + * @buf: String to write + * + * Return: 0 on success, -1 on any error + */ +int write_file(const char *path, const char *buf) +{ + int fd = open(path, O_WRONLY | O_TRUNC | O_CLOEXEC); + size_t len = strlen(buf); + + if (fd < 0) { + warn("Could not open %s: %s", path, strerror(errno)); + return -1; + } + + while (len) { + ssize_t rc = write(fd, buf, len); + + if (rc <= 0) { + warn("Couldn't write to %s: %s", path, strerror(errno)); + break; + } + + buf += rc; + len -= rc; + } + + close(fd); + return len == 0 ? 0 : -1; +} diff --git a/util.h b/util.h index 0c06e34..f957522 100644 --- a/util.h +++ b/util.h @@ -58,18 +58,6 @@ void trace_init(int enable); #define TMPDIR "/tmp" #endif -#define FWRITE(path, buf, str) \ - do { \ - int flags = O_WRONLY | O_CLOEXEC; \ - int fd = open(path, flags); \ - \ - if (fd < 0 || \ - write(fd, buf, strlen(buf)) != (int)strlen(buf)) \ - warn(str); \ - if (fd >= 0) \ - close(fd); \ - } while (0) - #define V4 0 #define V6 1 #define IP_VERSIONS 2 @@ -215,5 +203,6 @@ int ns_enter(const struct ctx *c); void write_pidfile(int fd, pid_t pid); int __daemon(int pidfile_fd, int devnull_fd); int fls(unsigned long x); +int write_file(const char *path, const char *buf); #endif /* UTIL_H */ -- 2.37.3
Currently, isolate_user() exits early if the --netns-only option is given. That works for now, but shortly we're going to want to add some logic to go at the end of isolate_user() that needs to run in all cases: joining a given userns, creating a new userns, or staying in our original userns (--netns-only). To avoid muddying those changes, here we reorganize isolate_user() to have a common exit path for all cases. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- isolation.c | 40 ++++++++++++++++------------------------ 1 file changed, 16 insertions(+), 24 deletions(-) diff --git a/isolation.c b/isolation.c index fda9cad..211c26f 100644 --- a/isolation.c +++ b/isolation.c @@ -129,9 +129,6 @@ void isolate_initial(void) */ void isolate_user(uid_t uid, gid_t gid, bool use_userns, const char *userns) { - char uidmap[BUFSIZ]; - char gidmap[BUFSIZ]; - /* First set our UID & GID in the original namespace */ if (setgroups(0, NULL)) { /* If we don't have CAP_SETGID, this will EPERM */ @@ -152,12 +149,7 @@ void isolate_user(uid_t uid, gid_t gid, bool use_userns, const char *userns) exit(EXIT_FAILURE); } - /* If we're told not to use a userns, nothing more to do */ - if (!use_userns) - return; - - /* Otherwise, if given a userns, join it */ - if (*userns) { + if (*userns) { /* If given a userns, join it */ int ufd; ufd = open(userns, O_RDONLY | O_CLOEXEC); @@ -174,24 +166,24 @@ void isolate_user(uid_t uid, gid_t gid, bool use_userns, const char *userns) } close(ufd); + } else if (use_userns) { /* Create and join a new userns */ + char uidmap[BUFSIZ]; + char gidmap[BUFSIZ]; - return; - } - - /* Otherwise, create our own userns */ - if (unshare(CLONE_NEWUSER) != 0) { - err("Couldn't create user namespace: %s", strerror(errno)); - exit(EXIT_FAILURE); - } + if (unshare(CLONE_NEWUSER) != 0) { + err("Couldn't create user namespace: %s", strerror(errno)); + exit(EXIT_FAILURE); + } - /* Configure user and group mappings */ - snprintf(uidmap, BUFSIZ, "0 %u 1", uid); - snprintf(gidmap, BUFSIZ, "0 %u 1", gid); + /* Configure user and group mappings */ + snprintf(uidmap, BUFSIZ, "0 %u 1", uid); + snprintf(gidmap, BUFSIZ, "0 %u 1", gid); - if (write_file("/proc/self/uid_map", uidmap) || - write_file("/proc/self/setgroups", "deny") || - write_file("/proc/self/gid_map", gidmap)) { - warn("Couldn't configure user namespace"); + if (write_file("/proc/self/uid_map", uidmap) || + write_file("/proc/self/setgroups", "deny") || + write_file("/proc/self/gid_map", gidmap)) { + warn("Couldn't configure user namespace"); + } } } -- 2.37.3
The current implementation of drop_caps() doesn't really work because it attempts to drop capabilities from the bounding set. That's not the set that really matters, it's about limiting the abilities of things we might later exec() rather than our own capabilities. It also requires CAP_SETPCAP which we won't usually have. Replace it with a new version which uses setcap(2) to drop capabilities from the effective and permitted sets. For now we leave the inheritable set as is, since we don't want to preclude the user from passing inheritable capabilities to the command spawed by pasta. Correctly dropping caps reveals that we were relying on some capabilities we'd supposedly dropped. Re-divide the dropping of capabilities between isolate_initial(), isolate_user() and isolate_prefork() to make this work. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- conf.c | 2 +- isolation.c | 98 ++++++++++++++++++++++++++++++++++++++++++++++++----- isolation.h | 3 +- 3 files changed, 92 insertions(+), 11 deletions(-) diff --git a/conf.c b/conf.c index 1537dbf..0be887e 100644 --- a/conf.c +++ b/conf.c @@ -1469,7 +1469,7 @@ void conf(struct ctx *c, int argc, char **argv) usage(argv[0]); } - isolate_user(uid, gid, !netns_only, userns); + isolate_user(uid, gid, !netns_only, userns, c->mode); if (c->pasta_conf_ns) c->no_ra = 1; diff --git a/isolation.c b/isolation.c index 211c26f..6d87dec 100644 --- a/isolation.c +++ b/isolation.c @@ -86,18 +86,37 @@ #include "passt.h" #include "isolation.h" +#define CAP_VERSION _LINUX_CAPABILITY_VERSION_3 +#define CAP_WORDS _LINUX_CAPABILITY_U32S_3 + /** - * drop_caps() - Drop capabilities we might have except for CAP_NET_BIND_SERVICE + * drop_caps_ep_except() - Drop capabilities from effective & permitted sets + * @keep: Capabilities to keep */ -static void drop_caps(void) +static void drop_caps_ep_except(uint64_t keep) { + struct __user_cap_header_struct hdr = { + .version = CAP_VERSION, + .pid = 0, + }; + struct __user_cap_data_struct data[CAP_WORDS]; int i; - for (i = 0; i < 64; i++) { - if (i == CAP_NET_BIND_SERVICE) - continue; + if (syscall(SYS_capget, &hdr, data)) { + err("Couldn't get current capabilities: %s", strerror(errno)); + exit(EXIT_FAILURE); + } - prctl(PR_CAPBSET_DROP, i, 0, 0, 0); + for (i = 0; i < CAP_WORDS; i++) { + uint32_t mask = keep >> (32 * i); + + data[i].effective &= mask; + data[i].permitted &= mask; + } + + if (syscall(SYS_capset, &hdr, data)) { + err("Couldn't drop capabilities: %s", strerror(errno)); + exit(EXIT_FAILURE); } } @@ -111,7 +130,25 @@ static void drop_caps(void) */ void isolate_initial(void) { - drop_caps(); + /* We want to keep CAP_NET_BIND_SERVICE in the initial + * namespace if we have it, so that we can forward low ports + * into the guest/namespace + * + * We have to keep CAP_SETUID and CAP_SETGID at this stage, so + * that we can switch user away from root. + * + * We have to keep some capabilities for the --netns-only case: + * - CAP_SYS_ADMIN, so that we can setns() to the netns. + * - Keep CAP_NET_ADMIN, so that we can configure interfaces + * + * It's debatable whether it's useful to drop caps when we + * retain SETUID and SYS_ADMIN, but we might as well. We drop + * further capabilites in isolate_user() and + * isolate_prefork(). + */ + drop_caps_ep_except(BIT(CAP_NET_BIND_SERVICE) | + BIT(CAP_SETUID) | BIT(CAP_SETGID) | + BIT(CAP_SYS_ADMIN) | BIT(CAP_NET_ADMIN)); } /** @@ -120,6 +157,7 @@ void isolate_initial(void) * @gid: Group ID to run as (in original userns) * @use_userns: Whether to join or create a userns * @userns: userns path to enter, may be empty + * @mode: Mode (passt or pasta) * * Should: * - set our final UID and GID @@ -127,8 +165,11 @@ void isolate_initial(void) * Mustn't: * - remove filesystem access (we need that for further setup) */ -void isolate_user(uid_t uid, gid_t gid, bool use_userns, const char *userns) +void isolate_user(uid_t uid, gid_t gid, bool use_userns, const char *userns, + enum passt_modes mode) { + uint64_t ns_caps = 0; + /* First set our UID & GID in the original namespace */ if (setgroups(0, NULL)) { /* If we don't have CAP_SETGID, this will EPERM */ @@ -166,6 +207,7 @@ void isolate_user(uid_t uid, gid_t gid, bool use_userns, const char *userns) } close(ufd); + } else if (use_userns) { /* Create and join a new userns */ char uidmap[BUFSIZ]; char gidmap[BUFSIZ]; @@ -185,6 +227,31 @@ void isolate_user(uid_t uid, gid_t gid, bool use_userns, const char *userns) warn("Couldn't configure user namespace"); } } + + /* Joining a new userns gives us full capabilities; drop the + * ones we don't need. With --netns-only we haven't changed + * userns but we can drop more capabilities now than at + * isolate_initial() + */ + /* Keep CAP_SYS_ADMIN, so we can unshare() further in + * isolate_prefork(), pasta also needs it to setns() into the + * netns + */ + ns_caps |= BIT(CAP_SYS_ADMIN); + if (mode == MODE_PASTA) { + /* Keep CAP_NET_ADMIN, so we can configure the if */ + ns_caps |= BIT(CAP_NET_ADMIN); + /* Keep CAP_NET_BIND_SERVICE, so we can splice + * outbound connections to low port numbers + */ + ns_caps |= BIT(CAP_NET_BIND_SERVICE); + /* Keep CAP_SYS_PTRACE to join the netns of an + * existing process */ + if (*userns || !use_userns) + ns_caps |= BIT(CAP_SYS_PTRACE); + } + + drop_caps_ep_except(ns_caps); } /** @@ -203,6 +270,7 @@ void isolate_user(uid_t uid, gid_t gid, bool use_userns, const char *userns) int isolate_prefork(struct ctx *c) { int flags = CLONE_NEWIPC | CLONE_NEWNS | CLONE_NEWUTS; + uint64_t ns_caps = 0; /* If we run in foreground, we have no chance to actually move to a new * PID namespace. For passt, use CLONE_NEWPID anyway, in case somebody @@ -243,7 +311,19 @@ int isolate_prefork(struct ctx *c) return -errno; } - drop_caps(); /* Relative to the new user namespace this time. */ + /* Now that initialization is more-or-less complete, we can + * drop further capabilities + */ + if (c->mode == MODE_PASTA) { + /* Keep CAP_SYS_ADMIN, so we can enter the netns */ + ns_caps |= BIT(CAP_SYS_ADMIN); + /* Keep CAP_NET_BIND_SERVICE, so we can splice + * outbound connections to low port numbers + */ + ns_caps |= BIT(CAP_NET_BIND_SERVICE); + } + + drop_caps_ep_except(ns_caps); return 0; } diff --git a/isolation.h b/isolation.h index 70a38f9..54c60f6 100644 --- a/isolation.h +++ b/isolation.h @@ -8,7 +8,8 @@ #define ISOLATION_H void isolate_initial(void); -void isolate_user(uid_t uid, gid_t gid, bool use_userns, const char *userns); +void isolate_user(uid_t uid, gid_t gid, bool use_userns, const char *userns, + enum passt_modes mode); int isolate_prefork(struct ctx *c); void isolate_postfork(const struct ctx *c); -- 2.37.3
We drop our own capabilities, but it's possible that processes we exec() could gain extra privilege via file capabilities. It shouldn't be possible for us to exec() anyway due to seccomp() and our filesystem isolation. But just in case, zero the bounding and inheritable capability sets to prevent any such child from gainin privilege. Note that we do this *after* spawning the pasta shell/command (if any), because we do want the user to be able to give that privilege if they want. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- isolation.c | 56 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/isolation.c b/isolation.c index 6d87dec..85a5b62 100644 --- a/isolation.c +++ b/isolation.c @@ -120,6 +120,61 @@ static void drop_caps_ep_except(uint64_t keep) } } +/** + * clamp_caps() - Prevent any children from gaining caps + * + * This drops all capabilities from both the inheritable and the + * bounding set. This means that any exec()ed processes can't gain + * capabilities, even if they have file capabilities which would grant + * them. We shouldn't ever exec() in any case, but this provides an + * additional layer of protection. Executing this requires + * CAP_SETPCAP, which we will have within our userns. + * + * Note that dropping capabilites from the bounding set limits + * exec()ed processes, but does not remove them from the effective or + * permitted sets, so it doesn't reduce our own capabilities. + */ +static void clamp_caps(void) +{ + struct __user_cap_data_struct data[CAP_WORDS]; + struct __user_cap_header_struct hdr = { + .version = CAP_VERSION, + .pid = 0, + }; + int i; + + for (i = 0; i < 64; i++) { + /* Some errors can be ignored: + * - EINVAL, we'll get this for all values in 0..63 + * that are not actually allocated caps + * - EPERM, we'll get this if we don't have + * CAP_SETPCAP, which can happen if using + * --netns-only. We don't need CAP_SETPCAP for + * normal operation, so carry on without it. + */ + if (prctl(PR_CAPBSET_DROP, i, 0, 0, 0) && + errno != EINVAL && errno != EPERM) { + err("Couldn't drop cap %i from bounding set: %s", + i, strerror(errno)); + exit(EXIT_FAILURE); + } + } + + if (syscall(SYS_capget, &hdr, data)) { + err("Couldn't get current capabilities: %s", strerror(errno)); + exit(EXIT_FAILURE); + } + + for (i = 0; i < CAP_WORDS; i++) + data[i].inheritable = 0; + + if (syscall(SYS_capset, &hdr, data)) { + err("Couldn't drop inheritable capabilities: %s", + strerror(errno)); + exit(EXIT_FAILURE); + } +} + /** * isolate_initial() - Early, config independent self isolation * @@ -323,6 +378,7 @@ int isolate_prefork(struct ctx *c) ns_caps |= BIT(CAP_NET_BIND_SERVICE); } + clamp_caps(); drop_caps_ep_except(ns_caps); return 0; -- 2.37.3
When in passt mode, or pasta mode spawning a command, we create a userns for ourselves. This is used both to isolate the pasta/passt process itself and to run the spawned command, if any. Since eed17a47 "Handle userns isolation and dropping root at the same time" we've handled both cases the same, configuring the UID and GID mappings in the new userns to map whichever UID we're running as to root within the userns. This mapping is desirable when spawning a shell or other command, so that the user gets a root shell with reasonably clear abilities within the userns and netns. It's not necessarily essential, though. When not spawning a shell, it doesn't really have any purpose: passt itself doesn't need to be root and can operate fine with an unmapped user (using some of the capabilities we get when entering the userns instead). Configuring the uid_map can cause problems if passt is running with any capabilities in the initial namespace, such as CAP_NET_BIND_SERVICE to allow it to forward low ports. In this case the kernel makes files in /proc/pid owned by root rather than the starting user to prevent the user from interfering with the operation of the capability-enhanced process. This includes uid_map meaning we are not able to write to it. Whether this behaviour is correct in the kernel is debatable, but in any case we might as well avoid problems by only initializing the user mappings when we really want them. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- conf.c | 3 ++- isolation.c | 13 ------------- pasta.c | 15 ++++++++++++++- pasta.h | 3 ++- 4 files changed, 18 insertions(+), 16 deletions(-) diff --git a/conf.c b/conf.c index 0be887e..601141c 100644 --- a/conf.c +++ b/conf.c @@ -1478,7 +1478,8 @@ void conf(struct ctx *c, int argc, char **argv) if (*netns) { pasta_open_ns(c, netns); } else { - pasta_start_ns(c, argc - optind, argv + optind); + pasta_start_ns(c, uid, gid, + argc - optind, argv + optind); } } diff --git a/isolation.c b/isolation.c index 85a5b62..2656085 100644 --- a/isolation.c +++ b/isolation.c @@ -264,23 +264,10 @@ void isolate_user(uid_t uid, gid_t gid, bool use_userns, const char *userns, close(ufd); } else if (use_userns) { /* Create and join a new userns */ - char uidmap[BUFSIZ]; - char gidmap[BUFSIZ]; - if (unshare(CLONE_NEWUSER) != 0) { err("Couldn't create user namespace: %s", strerror(errno)); exit(EXIT_FAILURE); } - - /* Configure user and group mappings */ - snprintf(uidmap, BUFSIZ, "0 %u 1", uid); - snprintf(gidmap, BUFSIZ, "0 %u 1", gid); - - if (write_file("/proc/self/uid_map", uidmap) || - write_file("/proc/self/setgroups", "deny") || - write_file("/proc/self/gid_map", gidmap)) { - warn("Couldn't configure user namespace"); - } } /* Joining a new userns gives us full capabilities; drop the diff --git a/pasta.c b/pasta.c index 6314a29..d165602 100644 --- a/pasta.c +++ b/pasta.c @@ -179,15 +179,19 @@ static int pasta_setup_ns(void *arg) /** * pasta_start_ns() - Fork command in new namespace if target ns is not given * @c: Execution context + * @uid: UID we're running as in the init namespace + * @gid: GID we're running as in the init namespace * @argc: Number of arguments for spawned command * @argv: Command to spawn and arguments */ -void pasta_start_ns(struct ctx *c, int argc, char *argv[]) +void pasta_start_ns(struct ctx *c, uid_t uid, gid_t gid, + int argc, char *argv[]) { struct pasta_setup_ns_arg arg = { .exe = argv[0], .argv = argv, }; + char uidmap[BUFSIZ], gidmap[BUFSIZ]; char ns_fn_stack[NS_FN_STACK_SIZE]; char *sh_argv[] = { NULL, NULL }; char sh_arg0[PATH_MAX + 1]; @@ -196,6 +200,15 @@ void pasta_start_ns(struct ctx *c, int argc, char *argv[]) if (!c->debug) c->quiet = 1; + /* Configure user and group mappings */ + snprintf(uidmap, BUFSIZ, "0 %u 1", uid); + snprintf(gidmap, BUFSIZ, "0 %u 1", gid); + + if (write_file("/proc/self/uid_map", uidmap) || + write_file("/proc/self/setgroups", "deny") || + write_file("/proc/self/gid_map", gidmap)) { + warn("Couldn't configure user mappings"); + } if (argc == 0) { arg.exe = getenv("SHELL"); diff --git a/pasta.h b/pasta.h index 02df1f6..a8b9893 100644 --- a/pasta.h +++ b/pasta.h @@ -7,7 +7,8 @@ #define PASTA_H void pasta_open_ns(struct ctx *c, const char *netns); -void pasta_start_ns(struct ctx *c, int argc, char *argv[]); +void pasta_start_ns(struct ctx *c, uid_t uid, gid_t gid, + int argc, char *argv[]); void pasta_ns_conf(struct ctx *c); void pasta_child_handler(int signal); int pasta_netns_quit_init(struct ctx *c); -- 2.37.3
pasta_setup_ns() no longer has much to do with setting up a namespace. Instead it's really about starting the shell or other command we want to run with pasta connectivity. Rename it and its argument structure to be less misleading. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- pasta.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/pasta.c b/pasta.c index d165602..27df899 100644 --- a/pasta.c +++ b/pasta.c @@ -147,29 +147,29 @@ void pasta_open_ns(struct ctx *c, const char *netns) } /** - * struct pasta_setup_ns_arg - Argument for pasta_setup_ns() + * struct pasta_spawn_cmd_arg - Argument for pasta_spawn_cmd() * @exe: Executable to run * @argv: Command and arguments to run */ -struct pasta_setup_ns_arg { +struct pasta_spawn_cmd_arg { const char *exe; char *const *argv; }; /** - * pasta_setup_ns() - Map credentials, enable access to ping sockets, run shell - * @arg: See @pasta_setup_ns_arg + * pasta_spawn_cmd() - Prepare new netns, start command or shell + * @arg: See @pasta_spawn_cmd_arg * * Return: this function never returns */ -static int pasta_setup_ns(void *arg) +static int pasta_spawn_cmd(void *arg) { - const struct pasta_setup_ns_arg *a; + const struct pasta_spawn_cmd_arg *a; if (write_file("/proc/sys/net/ipv4/ping_group_range", "0 0")) warn("Cannot set ping_group_range, ICMP requests might fail"); - a = (const struct pasta_setup_ns_arg *)arg; + a = (const struct pasta_spawn_cmd_arg *)arg; execvp(a->exe, a->argv); perror("execvp"); @@ -187,7 +187,7 @@ static int pasta_setup_ns(void *arg) void pasta_start_ns(struct ctx *c, uid_t uid, gid_t gid, int argc, char *argv[]) { - struct pasta_setup_ns_arg arg = { + struct pasta_spawn_cmd_arg arg = { .exe = argv[0], .argv = argv, }; @@ -225,7 +225,7 @@ void pasta_start_ns(struct ctx *c, uid_t uid, gid_t gid, arg.argv = sh_argv; } - pasta_child_pid = clone(pasta_setup_ns, + pasta_child_pid = clone(pasta_spawn_cmd, ns_fn_stack + sizeof(ns_fn_stack) / 2, CLONE_NEWIPC | CLONE_NEWPID | CLONE_NEWNET | CLONE_NEWUTS, -- 2.37.3