Our current handling of capabilities isn't quite right. In particular, drop_caps() attempts to remove capabilities from the bounding set, which usually won't work, and even if it does won't have the effect we want. This series corrects that, as well as making some other fixes and cleanups in adjacent code. David Gibson (10): test: Move slower tests to end of test run pasta: More general way of starting spawned shell as a login shell pasta_start_ns() always ends in parent context Remove unhelpful drop_caps() call in pasta_start_ns() Clarify various self-isolation steps Replace FWRITE with a function isolation: Replace drop_caps() with a version that actually does something isolation: Prevent any child processes gaining capabilities isolation: Only configure UID/GID mappings in userns when spawning shell Rename pasta_setup_ns() to pasta_spawn_cmd() conf.c | 3 +- isolation.c | 199 ++++++++++++++++++++++++++++++++++++++++++++++------ isolation.h | 6 +- passt.c | 8 +-- pasta.c | 72 +++++++++++-------- pasta.h | 3 +- test/run | 20 +++--- util.c | 33 +++++++++ util.h | 13 +--- 9 files changed, 275 insertions(+), 82 deletions(-) -- 2.37.3
The distro and performance tests are by far the slowest part of the passt testsuite. Move them to the end of the testsuite run, so that it's easier to do a quick test during development by letting the other tests run then interrupting the test runner. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- test/run | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/test/run b/test/run index 4bb9cd8..d1f24ef 100755 --- a/test/run +++ b/test/run @@ -65,13 +65,6 @@ run() { test build/clang_tidy teardown build - setup distro - test distro/debian - test distro/fedora - test distro/opensuse - test distro/ubuntu - teardown distro - setup pasta test pasta/ndp test pasta/dhcp @@ -98,6 +91,10 @@ run() { test passt_in_ns/shutdown teardown passt_in_ns + setup two_guests + test two_guests/basic + teardown two_guests + VALGRIND=0 setup passt_in_ns test passt/ndp @@ -109,9 +106,12 @@ run() { test passt_in_ns/shutdown teardown passt_in_ns - setup two_guests - test two_guests/basic - teardown two_guests + setup distro + test distro/debian + test distro/fedora + test distro/opensuse + test distro/ubuntu + teardown distro perf_finish [ ${CI} -eq 1 ] && video_stop -- 2.37.3
When invoked so as to spawn a shell, pasta checks explicitly for the shell being bash and if so, adds a "-l" option to make it a login shell. This is not ideal, since this is a bash specific option and requires pasta to know about specific shell variants. There's a general convention for starting a login shell, which is to prepend a "-" to argv[0]. Use this approach instead, so we don't need bash specific logic. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- pasta.c | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/pasta.c b/pasta.c index 1dd8267..7c3acef 100644 --- a/pasta.c +++ b/pasta.c @@ -148,10 +148,12 @@ void pasta_open_ns(struct ctx *c, const char *netns) /** * struct pasta_setup_ns_arg - Argument for pasta_setup_ns() + * @exe: Executable to run * @argv: Command and arguments to run */ struct pasta_setup_ns_arg { - char **argv; + const char *exe; + char *const *argv; }; /** @@ -162,12 +164,13 @@ struct pasta_setup_ns_arg { */ static int pasta_setup_ns(void *arg) { - struct pasta_setup_ns_arg *a = (struct pasta_setup_ns_arg *)arg; + const struct pasta_setup_ns_arg *a + = (const struct pasta_setup_ns_arg *)arg; FWRITE("/proc/sys/net/ipv4/ping_group_range", "0 0", "Cannot set ping_group_range, ICMP requests might fail"); - execvp(a->argv[0], a->argv); + execvp(a->exe, a->argv); perror("execvp"); exit(EXIT_FAILURE); @@ -182,26 +185,31 @@ static int pasta_setup_ns(void *arg) void pasta_start_ns(struct ctx *c, int argc, char *argv[]) { struct pasta_setup_ns_arg arg = { + .exe = argv[0], .argv = argv, }; - char *shell = getenv("SHELL"); - char *sh_argv[] = { shell, NULL }; - char *bash_argv[] = { shell, "-l", NULL }; + char *sh_argv[] = { NULL, NULL }; char ns_fn_stack[NS_FN_STACK_SIZE]; + char sh_arg0[PATH_MAX + 1]; c->foreground = 1; if (!c->debug) c->quiet = 1; - if (!shell) - shell = "/bin/sh"; if (argc == 0) { - if (strstr(shell, "/bash")) { - arg.argv = bash_argv; - } else { - arg.argv = sh_argv; + arg.exe = getenv("SHELL"); + if (!arg.exe) + arg.exe = "/bin/sh"; + + if ((size_t)snprintf(sh_arg0, sizeof(sh_arg0), + "-%s", arg.exe) >= sizeof(sh_arg0)) { + err("$SHELL is too long (%u bytes)", + strlen(arg.exe)); + exit(EXIT_FAILURE); } + sh_argv[0] = sh_arg0; + arg.argv = sh_argv; } pasta_child_pid = clone(pasta_setup_ns, -- 2.37.3
Just nits here: On Tue, 11 Oct 2022 16:40:10 +1100 David Gibson <david(a)gibson.dropbear.id.au> wrote:When invoked so as to spawn a shell, pasta checks explicitly for the shell being bash and if so, adds a "-l" option to make it a login shell. This is not ideal, since this is a bash specific option and requires pasta to know about specific shell variants. There's a general convention for starting a login shell, which is to prepend a "-" to argv[0]. Use this approach instead, so we don't need bash specific logic.Hah, I didn't know that was the meaning.Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- pasta.c | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/pasta.c b/pasta.c index 1dd8267..7c3acef 100644 --- a/pasta.c +++ b/pasta.c @@ -148,10 +148,12 @@ void pasta_open_ns(struct ctx *c, const char *netns) /** * struct pasta_setup_ns_arg - Argument for pasta_setup_ns() + * @exe: Executable to run * @argv: Command and arguments to run */ struct pasta_setup_ns_arg { - char **argv; + const char *exe; + char *const *argv; }; /** @@ -162,12 +164,13 @@ struct pasta_setup_ns_arg { */ static int pasta_setup_ns(void *arg) { - struct pasta_setup_ns_arg *a = (struct pasta_setup_ns_arg *)arg; + const struct pasta_setup_ns_arg *a + = (const struct pasta_setup_ns_arg *)arg;At this point the assignment could be split onto another line.FWRITE("/proc/sys/net/ipv4/ping_group_range", "0 0", "Cannot set ping_group_range, ICMP requests might fail"); - execvp(a->argv[0], a->argv); + execvp(a->exe, a->argv); perror("execvp"); exit(EXIT_FAILURE); @@ -182,26 +185,31 @@ static int pasta_setup_ns(void *arg) void pasta_start_ns(struct ctx *c, int argc, char *argv[]) { struct pasta_setup_ns_arg arg = { + .exe = argv[0], .argv = argv, }; - char *shell = getenv("SHELL"); - char *sh_argv[] = { shell, NULL }; - char *bash_argv[] = { shell, "-l", NULL }; + char *sh_argv[] = { NULL, NULL }; char ns_fn_stack[NS_FN_STACK_SIZE];If you respin, it would be nice to have the usual ordering here (sh_argv[] after ns_fn_stack).+ char sh_arg0[PATH_MAX + 1]; c->foreground = 1; if (!c->debug) c->quiet = 1; - if (!shell) - shell = "/bin/sh"; if (argc == 0) { - if (strstr(shell, "/bash")) { - arg.argv = bash_argv; - } else { - arg.argv = sh_argv; + arg.exe = getenv("SHELL"); + if (!arg.exe) + arg.exe = "/bin/sh"; + + if ((size_t)snprintf(sh_arg0, sizeof(sh_arg0), + "-%s", arg.exe) >= sizeof(sh_arg0)) {This is completely specified and looks safe, but it also looks more complicated than it actually is, at a glance. Maybe a separate length check before snprintf() would make it look more natural. Not a strong preference though.+ err("$SHELL is too long (%u bytes)", + strlen(arg.exe)); + exit(EXIT_FAILURE); } + sh_argv[0] = sh_arg0; + arg.argv = sh_argv; } pasta_child_pid = clone(pasta_setup_ns,-- Stefano
On Thu, Oct 13, 2022 at 04:16:59AM +0200, Stefano Brivio wrote:Just nits here: On Tue, 11 Oct 2022 16:40:10 +1100 David Gibson <david(a)gibson.dropbear.id.au> wrote:Uh.. I'm not sure what you mean by that.When invoked so as to spawn a shell, pasta checks explicitly for the shell being bash and if so, adds a "-l" option to make it a login shell. This is not ideal, since this is a bash specific option and requires pasta to know about specific shell variants. There's a general convention for starting a login shell, which is to prepend a "-" to argv[0]. Use this approach instead, so we don't need bash specific logic.Hah, I didn't know that was the meaning.Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- pasta.c | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/pasta.c b/pasta.c index 1dd8267..7c3acef 100644 --- a/pasta.c +++ b/pasta.c @@ -148,10 +148,12 @@ void pasta_open_ns(struct ctx *c, const char *netns) /** * struct pasta_setup_ns_arg - Argument for pasta_setup_ns() + * @exe: Executable to run * @argv: Command and arguments to run */ struct pasta_setup_ns_arg { - char **argv; + const char *exe; + char *const *argv; }; /** @@ -162,12 +164,13 @@ struct pasta_setup_ns_arg { */ static int pasta_setup_ns(void *arg) { - struct pasta_setup_ns_arg *a = (struct pasta_setup_ns_arg *)arg; + const struct pasta_setup_ns_arg *a + = (const struct pasta_setup_ns_arg *)arg;At this point the assignment could be split onto another line.Done.FWRITE("/proc/sys/net/ipv4/ping_group_range", "0 0", "Cannot set ping_group_range, ICMP requests might fail"); - execvp(a->argv[0], a->argv); + execvp(a->exe, a->argv); perror("execvp"); exit(EXIT_FAILURE); @@ -182,26 +185,31 @@ static int pasta_setup_ns(void *arg) void pasta_start_ns(struct ctx *c, int argc, char *argv[]) { struct pasta_setup_ns_arg arg = { + .exe = argv[0], .argv = argv, }; - char *shell = getenv("SHELL"); - char *sh_argv[] = { shell, NULL }; - char *bash_argv[] = { shell, "-l", NULL }; + char *sh_argv[] = { NULL, NULL }; char ns_fn_stack[NS_FN_STACK_SIZE];If you respin, it would be nice to have the usual ordering here (sh_argv[] after ns_fn_stack).Uh.. not sure what you mean by that either.+ char sh_arg0[PATH_MAX + 1]; c->foreground = 1; if (!c->debug) c->quiet = 1; - if (!shell) - shell = "/bin/sh"; if (argc == 0) { - if (strstr(shell, "/bash")) { - arg.argv = bash_argv; - } else { - arg.argv = sh_argv; + arg.exe = getenv("SHELL"); + if (!arg.exe) + arg.exe = "/bin/sh"; + + if ((size_t)snprintf(sh_arg0, sizeof(sh_arg0), + "-%s", arg.exe) >= sizeof(sh_arg0)) {This is completely specified and looks safe, but it also looks more complicated than it actually is, at a glance. Maybe a separate length check before snprintf() would make it look more natural. Not a strong preference though.-- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson+ err("$SHELL is too long (%u bytes)", + strlen(arg.exe)); + exit(EXIT_FAILURE); } + sh_argv[0] = sh_arg0; + arg.argv = sh_argv; } pasta_child_pid = clone(pasta_setup_ns,
On Thu, 13 Oct 2022 19:22:27 +1100 David Gibson <david(a)gibson.dropbear.id.au> wrote:On Thu, Oct 13, 2022 at 04:16:59AM +0200, Stefano Brivio wrote:const struct pasta_setup_ns_arg *a; a = (const struct pasta_setup_ns_arg *)arg;Just nits here: On Tue, 11 Oct 2022 16:40:10 +1100 David Gibson <david(a)gibson.dropbear.id.au> wrote:Uh.. I'm not sure what you mean by that.When invoked so as to spawn a shell, pasta checks explicitly for the shell being bash and if so, adds a "-l" option to make it a login shell. This is not ideal, since this is a bash specific option and requires pasta to know about specific shell variants. There's a general convention for starting a login shell, which is to prepend a "-" to argv[0]. Use this approach instead, so we don't need bash specific logic.Hah, I didn't know that was the meaning.Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- pasta.c | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/pasta.c b/pasta.c index 1dd8267..7c3acef 100644 --- a/pasta.c +++ b/pasta.c @@ -148,10 +148,12 @@ void pasta_open_ns(struct ctx *c, const char *netns) /** * struct pasta_setup_ns_arg - Argument for pasta_setup_ns() + * @exe: Executable to run * @argv: Command and arguments to run */ struct pasta_setup_ns_arg { - char **argv; + const char *exe; + char *const *argv; }; /** @@ -162,12 +164,13 @@ struct pasta_setup_ns_arg { */ static int pasta_setup_ns(void *arg) { - struct pasta_setup_ns_arg *a = (struct pasta_setup_ns_arg *)arg; + const struct pasta_setup_ns_arg *a + = (const struct pasta_setup_ns_arg *)arg;At this point the assignment could be split onto another line.Ah, sorry. if ((len = strlen(arg.exe)) + 1 /* - */ + 1 /* \0 */ > sizeof(sh_arg0)) err("$SHELL is too long (%u bytes)", strlen(arg.exe)); (void)snprintf(sh_arg0, sizeof(sh_arg0), "-%s", arg.exe); I don't know, it doesn't look so... cramped. Maybe it's just me. If it doesn't make sense just disregard ;) -- StefanoDone.FWRITE("/proc/sys/net/ipv4/ping_group_range", "0 0", "Cannot set ping_group_range, ICMP requests might fail"); - execvp(a->argv[0], a->argv); + execvp(a->exe, a->argv); perror("execvp"); exit(EXIT_FAILURE); @@ -182,26 +185,31 @@ static int pasta_setup_ns(void *arg) void pasta_start_ns(struct ctx *c, int argc, char *argv[]) { struct pasta_setup_ns_arg arg = { + .exe = argv[0], .argv = argv, }; - char *shell = getenv("SHELL"); - char *sh_argv[] = { shell, NULL }; - char *bash_argv[] = { shell, "-l", NULL }; + char *sh_argv[] = { NULL, NULL }; char ns_fn_stack[NS_FN_STACK_SIZE];If you respin, it would be nice to have the usual ordering here (sh_argv[] after ns_fn_stack).Uh.. not sure what you mean by that either.+ char sh_arg0[PATH_MAX + 1]; c->foreground = 1; if (!c->debug) c->quiet = 1; - if (!shell) - shell = "/bin/sh"; if (argc == 0) { - if (strstr(shell, "/bash")) { - arg.argv = bash_argv; - } else { - arg.argv = sh_argv; + arg.exe = getenv("SHELL"); + if (!arg.exe) + arg.exe = "/bin/sh"; + + if ((size_t)snprintf(sh_arg0, sizeof(sh_arg0), + "-%s", arg.exe) >= sizeof(sh_arg0)) {This is completely specified and looks safe, but it also looks more complicated than it actually is, at a glance. Maybe a separate length check before snprintf() would make it look more natural. Not a strong preference though.
On Thu, Oct 13, 2022 at 11:48:18AM +0200, Stefano Brivio wrote:On Thu, 13 Oct 2022 19:22:27 +1100 David Gibson <david(a)gibson.dropbear.id.au> wrote:Ah, right, that makes sense. Adjusted.On Thu, Oct 13, 2022 at 04:16:59AM +0200, Stefano Brivio wrote:const struct pasta_setup_ns_arg *a; a = (const struct pasta_setup_ns_arg *)arg;Just nits here: On Tue, 11 Oct 2022 16:40:10 +1100 David Gibson <david(a)gibson.dropbear.id.au> wrote:Uh.. I'm not sure what you mean by that.When invoked so as to spawn a shell, pasta checks explicitly for the shell being bash and if so, adds a "-l" option to make it a login shell. This is not ideal, since this is a bash specific option and requires pasta to know about specific shell variants. There's a general convention for starting a login shell, which is to prepend a "-" to argv[0]. Use this approach instead, so we don't need bash specific logic.Hah, I didn't know that was the meaning.Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- pasta.c | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/pasta.c b/pasta.c index 1dd8267..7c3acef 100644 --- a/pasta.c +++ b/pasta.c @@ -148,10 +148,12 @@ void pasta_open_ns(struct ctx *c, const char *netns) /** * struct pasta_setup_ns_arg - Argument for pasta_setup_ns() + * @exe: Executable to run * @argv: Command and arguments to run */ struct pasta_setup_ns_arg { - char **argv; + const char *exe; + char *const *argv; }; /** @@ -162,12 +164,13 @@ struct pasta_setup_ns_arg { */ static int pasta_setup_ns(void *arg) { - struct pasta_setup_ns_arg *a = (struct pasta_setup_ns_arg *)arg; + const struct pasta_setup_ns_arg *a + = (const struct pasta_setup_ns_arg *)arg;At this point the assignment could be split onto another line.I see what you mean about the crampedness. But the proposed alternative seems more complicated in other ways to me. I think I'll leave it as is for now. -- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibsonAh, sorry. if ((len = strlen(arg.exe)) + 1 /* - */ + 1 /* \0 */ > sizeof(sh_arg0)) err("$SHELL is too long (%u bytes)", strlen(arg.exe)); (void)snprintf(sh_arg0, sizeof(sh_arg0), "-%s", arg.exe); I don't know, it doesn't look so... cramped. Maybe it's just me. If it doesn't make sense just disregard ;)Done.FWRITE("/proc/sys/net/ipv4/ping_group_range", "0 0", "Cannot set ping_group_range, ICMP requests might fail"); - execvp(a->argv[0], a->argv); + execvp(a->exe, a->argv); perror("execvp"); exit(EXIT_FAILURE); @@ -182,26 +185,31 @@ static int pasta_setup_ns(void *arg) void pasta_start_ns(struct ctx *c, int argc, char *argv[]) { struct pasta_setup_ns_arg arg = { + .exe = argv[0], .argv = argv, }; - char *shell = getenv("SHELL"); - char *sh_argv[] = { shell, NULL }; - char *bash_argv[] = { shell, "-l", NULL }; + char *sh_argv[] = { NULL, NULL }; char ns_fn_stack[NS_FN_STACK_SIZE];If you respin, it would be nice to have the usual ordering here (sh_argv[] after ns_fn_stack).Uh.. not sure what you mean by that either.+ char sh_arg0[PATH_MAX + 1]; c->foreground = 1; if (!c->debug) c->quiet = 1; - if (!shell) - shell = "/bin/sh"; if (argc == 0) { - if (strstr(shell, "/bash")) { - arg.argv = bash_argv; - } else { - arg.argv = sh_argv; + arg.exe = getenv("SHELL"); + if (!arg.exe) + arg.exe = "/bin/sh"; + + if ((size_t)snprintf(sh_arg0, sizeof(sh_arg0), + "-%s", arg.exe) >= sizeof(sh_arg0)) {This is completely specified and looks safe, but it also looks more complicated than it actually is, at a glance. Maybe a separate length check before snprintf() would make it look more natural. Not a strong preference though.
The end of pasta_start_ns() has a test against pasta_child_pid, testing if we're in the parent or the child. However we started the child running the pasta_setup_ns function which always exec()s or exit()s, so if we return from the clone() we are always in the parent, making that test unnecessary. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- pasta.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pasta.c b/pasta.c index 7c3acef..1ad63fb 100644 --- a/pasta.c +++ b/pasta.c @@ -225,10 +225,7 @@ void pasta_start_ns(struct ctx *c, int argc, char *argv[]) drop_caps(); - if (pasta_child_pid) { - NS_CALL(pasta_wait_for_ns, c); - return; - } + NS_CALL(pasta_wait_for_ns, c); } /** -- 2.37.3
drop_caps() has a number of bugs which mean it doesn't do what you'd expect. However, even if we fixed those, the call in pasta_start_ns() doesn't do anything useful: * In the common case, we're UID 0 at this point. In this case drop_caps() doesn't accomplish anything, because even with capabilities dropped, we are still privileged. * When attaching to an existing namespace with --userns or --netns-only we might not be UID 0. In this case it's too early to drop all capabilities: we need at least CAP_NET_ADMIN to configure the tap device in the namespace. Remove this call - we will still drop capabilities a little later in sandbox(). Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- pasta.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/pasta.c b/pasta.c index 1ad63fb..4ff322c 100644 --- a/pasta.c +++ b/pasta.c @@ -223,8 +223,6 @@ void pasta_start_ns(struct ctx *c, int argc, char *argv[]) exit(EXIT_FAILURE); } - drop_caps(); - NS_CALL(pasta_wait_for_ns, c); } -- 2.37.3
We have a number of steps of self-isolation scattered across our code. Improve function names and add comments to make it clearer what the self isolation model is, what the steps do, and why they happen at the points they happen. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- isolation.c | 85 +++++++++++++++++++++++++++++++++++++++++++++++++---- isolation.h | 6 ++-- passt.c | 8 ++--- 3 files changed, 86 insertions(+), 13 deletions(-) diff --git a/isolation.c b/isolation.c index 124dea4..10cef05 100644 --- a/isolation.c +++ b/isolation.c @@ -12,6 +12,48 @@ * Author: Stefano Brivio <sbrivio(a)redhat.com> * Author: David Gibson <david(a)gibson.dropbear.id.au> */ +/** + * DOC: Theory of Operation + * + * For security the passt/pasta process performs a number of + * self-isolations steps, dropping capabilities, setting namespaces + * and otherwise minimizing the impact we can have on the system at + * large if we were compromised. + * + * Obviously we can't isolate ourselves from resources before we've + * done anything we need to do with those resources, so we have + * multiple stages of self-isolation. In order these are: + * + * 1. isolate_initial() + * ==================== + * + * Executed immediately after startup, drops capabilities we don't + * need at any point during execution (or which we gain back when we + * need by joining other namespaces). + * + * 2. isolate_user() + * ================= + * + * Executed once we know what user and user namespace we want to + * operate in. Sets our final UID & GID, and enters the correct user + * namespace. + * + * 3. isolate_prefork() + * ==================== + * + * Executed after all setup, but before daemonizing (fork()ing into + * the background). Uses mount namespace and pivot_root() to remove + * our access to the filesystem(). + * + * 4. isolate_postfork() + * ===================== + * + * Executed immediately after daemonizing, but before entering the + * actual packet forwarding phase of operation. Or, if not + * daemonizing, immediately after isolate_prefork(). Uses seccomp() + * to restrict ourselves to the handful of syscalls we need during + * runtime operation. + */ #include <errno.h> #include <fcntl.h> @@ -47,7 +89,7 @@ /** * drop_caps() - Drop capabilities we might have except for CAP_NET_BIND_SERVICE */ -void drop_caps(void) +static void drop_caps(void) { int i; @@ -59,12 +101,31 @@ void drop_caps(void) } } +/** + * isolate_initial() - Early, config independent self isolation + * + * Should: + * - drop unneeded capabilities + * Musn't: + * - remove filessytem access (we need to access files during setup) + */ +void isolate_initial(void) +{ + drop_caps(); +} + /** * isolate_user() - Switch to final UID/GID and move into userns * @uid: User ID to run as (in original userns) * @gid: Group ID to run as (in original userns) * @use_userns: Whether to join or create a userns * @userns: userns path to enter, may be empty + * + * Should: + * - set our final UID and GID + * - enter our final user namespace + * Mustn't: + * - remove filesystem access (we need that for further setup) */ void isolate_user(uid_t uid, gid_t gid, bool use_userns, const char *userns) { @@ -134,11 +195,19 @@ void isolate_user(uid_t uid, gid_t gid, bool use_userns, const char *userns) } /** - * sandbox() - Unshare IPC, mount, PID, UTS, and user namespaces, "unmount" root + * isolate_prefork() - Self isolation before daemonizing + * @c: Execution context + * + * Should: + * - Moves us to our own IPC and UTS namespaces + * - Moves us to a mount namespace with only an empty directory + * - Drops unneeded capabilities (in the new user namespace) + * Mustn't: + * - Remove syscalls we need to daemonize * * Return: negative error code on failure, zero on success */ -int sandbox(struct ctx *c) +int isolate_prefork(struct ctx *c) { int flags = CLONE_NEWIPC | CLONE_NEWNS | CLONE_NEWUTS; @@ -187,13 +256,19 @@ int sandbox(struct ctx *c) } /** - * seccomp() - Set up seccomp filters depending on mode, won't return on failure + * isolate_postfork() - Self isolation after daemonizing * @c: Execution context + * + * Should: + * - disable core dumps + * - limit to a minimal set of syscalls */ -void seccomp(const struct ctx *c) +void isolate_postfork(const struct ctx *c) { struct sock_fprog prog; + prctl(PR_SET_DUMPABLE, 0); + if (c->mode == MODE_PASST) { prog.len = (unsigned short)ARRAY_SIZE(filter_passt); prog.filter = filter_passt; diff --git a/isolation.h b/isolation.h index 2c73df7..70a38f9 100644 --- a/isolation.h +++ b/isolation.h @@ -7,9 +7,9 @@ #ifndef ISOLATION_H #define ISOLATION_H -void drop_caps(void); +void isolate_initial(void); void isolate_user(uid_t uid, gid_t gid, bool use_userns, const char *userns); -int sandbox(struct ctx *c); -void seccomp(const struct ctx *c); +int isolate_prefork(struct ctx *c); +void isolate_postfork(const struct ctx *c); #endif /* ISOLATION_H */ diff --git a/passt.c b/passt.c index 2c4a986..46f80a0 100644 --- a/passt.c +++ b/passt.c @@ -184,7 +184,7 @@ int main(int argc, char **argv) arch_avx2_exec(argv); - drop_caps(); + isolate_initial(); c.pasta_netns_fd = c.fd_tap = c.fd_tap_listen = -1; @@ -287,7 +287,7 @@ int main(int argc, char **argv) } } - if (sandbox(&c)) { + if (isolate_prefork(&c)) { err("Failed to sandbox process, exiting\n"); exit(EXIT_FAILURE); } @@ -297,9 +297,7 @@ int main(int argc, char **argv) else write_pidfile(pidfile_fd, getpid()); - prctl(PR_SET_DUMPABLE, 0); - - seccomp(&c); + isolate_postfork(&c); timer_init(&c, &now); -- 2.37.3
I think this, in particular, is really a notable improvement. Same here, only nits: On Tue, 11 Oct 2022 16:40:13 +1100 David Gibson <david(a)gibson.dropbear.id.au> wrote:We have a number of steps of self-isolation scattered across our code. Improve function names and add comments to make it clearer what the self isolation model is, what the steps do, and why they happen at the points they happen. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- isolation.c | 85 +++++++++++++++++++++++++++++++++++++++++++++++++---- isolation.h | 6 ++-- passt.c | 8 ++--- 3 files changed, 86 insertions(+), 13 deletions(-) diff --git a/isolation.c b/isolation.c index 124dea4..10cef05 100644 --- a/isolation.c +++ b/isolation.c @@ -12,6 +12,48 @@ * Author: Stefano Brivio <sbrivio(a)redhat.com> * Author: David Gibson <david(a)gibson.dropbear.id.au> */ +/** + * DOC: Theory of Operation + * + * For security the passt/pasta process performs a number of + * self-isolations steps, dropping capabilities, setting namespaces + * and otherwise minimizing the impact we can have on the system at + * large if we were compromised.I would try to be consistent with the BrE spelling we used everywhere else: minimising, daemonising.+ * + * Obviously we can't isolate ourselves from resources before we've + * done anything we need to do with those resources, so we have + * multiple stages of self-isolation. In order these are: + * + * 1. isolate_initial() + * ==================== + * + * Executed immediately after startup, drops capabilities we don't + * need at any point during execution (or which we gain back when we + * need by joining other namespaces). + * + * 2. isolate_user() + * ================= + * + * Executed once we know what user and user namespace we want to + * operate in. Sets our final UID & GID, and enters the correct user + * namespace. + * + * 3. isolate_prefork() + * ==================== + * + * Executed after all setup, but before daemonizing (fork()ing into + * the background). Uses mount namespace and pivot_root() to remove + * our access to the filesystem().filesystem() is not a function I know about. :)+ * + * 4. isolate_postfork() + * ===================== + * + * Executed immediately after daemonizing, but before entering the + * actual packet forwarding phase of operation. Or, if not + * daemonizing, immediately after isolate_prefork(). Uses seccomp() + * to restrict ourselves to the handful of syscalls we need during + * runtime operation. + */ #include <errno.h> #include <fcntl.h> @@ -47,7 +89,7 @@ /** * drop_caps() - Drop capabilities we might have except for CAP_NET_BIND_SERVICE */ -void drop_caps(void) +static void drop_caps(void) { int i; @@ -59,12 +101,31 @@ void drop_caps(void) } } +/** + * isolate_initial() - Early, config independent self isolation + * + * Should: + * - drop unneeded capabilities + * Musn't: + * - remove filessytem access (we need to access files during setup) + */ +void isolate_initial(void) +{ + drop_caps(); +} + /** * isolate_user() - Switch to final UID/GID and move into userns * @uid: User ID to run as (in original userns) * @gid: Group ID to run as (in original userns) * @use_userns: Whether to join or create a userns * @userns: userns path to enter, may be empty + * + * Should: + * - set our final UID and GID + * - enter our final user namespace + * Mustn't: + * - remove filesystem access (we need that for further setup) */ void isolate_user(uid_t uid, gid_t gid, bool use_userns, const char *userns) { @@ -134,11 +195,19 @@ void isolate_user(uid_t uid, gid_t gid, bool use_userns, const char *userns) } /** - * sandbox() - Unshare IPC, mount, PID, UTS, and user namespaces, "unmount" root + * isolate_prefork() - Self isolation before daemonizing + * @c: Execution context + *What does it return?+ * Should: + * - Moves us to our own IPC and UTS namespaces + * - Moves us to a mount namespace with only an empty directory + * - Drops unneeded capabilities (in the new user namespace)- move us ... - drop ... now, this will not move us into a new PID namespace, so it's a bit difficult to summarise in a very short form what it does with regard to that, and the comment below is already descriptive enough -- unless you can think of something.+ * Mustn't: + * - Remove syscalls we need to daemonize"remove", "daemonise", for consistency.* * Return: negative error code on failure, zero on success */ -int sandbox(struct ctx *c) +int isolate_prefork(struct ctx *c) { int flags = CLONE_NEWIPC | CLONE_NEWNS | CLONE_NEWUTS; @@ -187,13 +256,19 @@ int sandbox(struct ctx *c) } /** - * seccomp() - Set up seccomp filters depending on mode, won't return on failure + * isolate_postfork() - Self isolation after daemonizing * @c: Execution context + * + * Should: + * - disable core dumps + * - limit to a minimal set of syscalls */ -void seccomp(const struct ctx *c) +void isolate_postfork(const struct ctx *c) { struct sock_fprog prog; + prctl(PR_SET_DUMPABLE, 0); + if (c->mode == MODE_PASST) { prog.len = (unsigned short)ARRAY_SIZE(filter_passt); prog.filter = filter_passt; diff --git a/isolation.h b/isolation.h index 2c73df7..70a38f9 100644 --- a/isolation.h +++ b/isolation.h @@ -7,9 +7,9 @@ #ifndef ISOLATION_H #define ISOLATION_H -void drop_caps(void); +void isolate_initial(void); void isolate_user(uid_t uid, gid_t gid, bool use_userns, const char *userns); -int sandbox(struct ctx *c); -void seccomp(const struct ctx *c); +int isolate_prefork(struct ctx *c); +void isolate_postfork(const struct ctx *c); #endif /* ISOLATION_H */ diff --git a/passt.c b/passt.c index 2c4a986..46f80a0 100644 --- a/passt.c +++ b/passt.c @@ -184,7 +184,7 @@ int main(int argc, char **argv) arch_avx2_exec(argv); - drop_caps(); + isolate_initial(); c.pasta_netns_fd = c.fd_tap = c.fd_tap_listen = -1; @@ -287,7 +287,7 @@ int main(int argc, char **argv) } } - if (sandbox(&c)) { + if (isolate_prefork(&c)) { err("Failed to sandbox process, exiting\n"); exit(EXIT_FAILURE); } @@ -297,9 +297,7 @@ int main(int argc, char **argv) else write_pidfile(pidfile_fd, getpid()); - prctl(PR_SET_DUMPABLE, 0); - - seccomp(&c); + isolate_postfork(&c); timer_init(&c, &now);-- Stefano
On Thu, Oct 13, 2022 at 04:17:05AM +0200, Stefano Brivio wrote:I think this, in particular, is really a notable improvement. Same here, only nits: On Tue, 11 Oct 2022 16:40:13 +1100 David Gibson <david(a)gibson.dropbear.id.au> wrote:Ah, sure. Here in australia we mostly use british spelling, but tend to use 'izing' rather than 'ising'. I've updated it.We have a number of steps of self-isolation scattered across our code. Improve function names and add comments to make it clearer what the self isolation model is, what the steps do, and why they happen at the points they happen. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- isolation.c | 85 +++++++++++++++++++++++++++++++++++++++++++++++++---- isolation.h | 6 ++-- passt.c | 8 ++--- 3 files changed, 86 insertions(+), 13 deletions(-) diff --git a/isolation.c b/isolation.c index 124dea4..10cef05 100644 --- a/isolation.c +++ b/isolation.c @@ -12,6 +12,48 @@ * Author: Stefano Brivio <sbrivio(a)redhat.com> * Author: David Gibson <david(a)gibson.dropbear.id.au> */ +/** + * DOC: Theory of Operation + * + * For security the passt/pasta process performs a number of + * self-isolations steps, dropping capabilities, setting namespaces + * and otherwise minimizing the impact we can have on the system at + * large if we were compromised.I would try to be consistent with the BrE spelling we used everywhere else: minimising, daemonising.Heh, oops.+ * + * Obviously we can't isolate ourselves from resources before we've + * done anything we need to do with those resources, so we have + * multiple stages of self-isolation. In order these are: + * + * 1. isolate_initial() + * ==================== + * + * Executed immediately after startup, drops capabilities we don't + * need at any point during execution (or which we gain back when we + * need by joining other namespaces). + * + * 2. isolate_user() + * ================= + * + * Executed once we know what user and user namespace we want to + * operate in. Sets our final UID & GID, and enters the correct user + * namespace. + * + * 3. isolate_prefork() + * ==================== + * + * Executed after all setup, but before daemonizing (fork()ing into + * the background). Uses mount namespace and pivot_root() to remove + * our access to the filesystem().filesystem() is not a function I know about. :)I had that below, I've moved it up here.+ * + * 4. isolate_postfork() + * ===================== + * + * Executed immediately after daemonizing, but before entering the + * actual packet forwarding phase of operation. Or, if not + * daemonizing, immediately after isolate_prefork(). Uses seccomp() + * to restrict ourselves to the handful of syscalls we need during + * runtime operation. + */ #include <errno.h> #include <fcntl.h> @@ -47,7 +89,7 @@ /** * drop_caps() - Drop capabilities we might have except for CAP_NET_BIND_SERVICE */ -void drop_caps(void) +static void drop_caps(void) { int i; @@ -59,12 +101,31 @@ void drop_caps(void) } } +/** + * isolate_initial() - Early, config independent self isolation + * + * Should: + * - drop unneeded capabilities + * Musn't: + * - remove filessytem access (we need to access files during setup) + */ +void isolate_initial(void) +{ + drop_caps(); +} + /** * isolate_user() - Switch to final UID/GID and move into userns * @uid: User ID to run as (in original userns) * @gid: Group ID to run as (in original userns) * @use_userns: Whether to join or create a userns * @userns: userns path to enter, may be empty + * + * Should: + * - set our final UID and GID + * - enter our final user namespace + * Mustn't: + * - remove filesystem access (we need that for further setup) */ void isolate_user(uid_t uid, gid_t gid, bool use_userns, const char *userns) { @@ -134,11 +195,19 @@ void isolate_user(uid_t uid, gid_t gid, bool use_userns, const char *userns) } /** - * sandbox() - Unshare IPC, mount, PID, UTS, and user namespaces, "unmount" root + * isolate_prefork() - Self isolation before daemonizing + * @c: Execution context + *What does it return?Fixed.+ * Should: + * - Moves us to our own IPC and UTS namespaces + * - Moves us to a mount namespace with only an empty directory + * - Drops unneeded capabilities (in the new user namespace)- move us ... - drop ...now, this will not move us into a new PID namespace, so it's a bit difficult to summarise in a very short form what it does with regard to that, and the comment below is already descriptive enough -- unless you can think of something.Right. These lists aren't supposed to be totally exhaustive, just covering the key points.Adjusted.+ * Mustn't: + * - Remove syscalls we need to daemonize"remove", "daemonise", for consistency.-- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson* * Return: negative error code on failure, zero on success */ -int sandbox(struct ctx *c) +int isolate_prefork(struct ctx *c) { int flags = CLONE_NEWIPC | CLONE_NEWNS | CLONE_NEWUTS; @@ -187,13 +256,19 @@ int sandbox(struct ctx *c) } /** - * seccomp() - Set up seccomp filters depending on mode, won't return on failure + * isolate_postfork() - Self isolation after daemonizing * @c: Execution context + * + * Should: + * - disable core dumps + * - limit to a minimal set of syscalls */ -void seccomp(const struct ctx *c) +void isolate_postfork(const struct ctx *c) { struct sock_fprog prog; + prctl(PR_SET_DUMPABLE, 0); + if (c->mode == MODE_PASST) { prog.len = (unsigned short)ARRAY_SIZE(filter_passt); prog.filter = filter_passt; diff --git a/isolation.h b/isolation.h index 2c73df7..70a38f9 100644 --- a/isolation.h +++ b/isolation.h @@ -7,9 +7,9 @@ #ifndef ISOLATION_H #define ISOLATION_H -void drop_caps(void); +void isolate_initial(void); void isolate_user(uid_t uid, gid_t gid, bool use_userns, const char *userns); -int sandbox(struct ctx *c); -void seccomp(const struct ctx *c); +int isolate_prefork(struct ctx *c); +void isolate_postfork(const struct ctx *c); #endif /* ISOLATION_H */ diff --git a/passt.c b/passt.c index 2c4a986..46f80a0 100644 --- a/passt.c +++ b/passt.c @@ -184,7 +184,7 @@ int main(int argc, char **argv) arch_avx2_exec(argv); - drop_caps(); + isolate_initial(); c.pasta_netns_fd = c.fd_tap = c.fd_tap_listen = -1; @@ -287,7 +287,7 @@ int main(int argc, char **argv) } } - if (sandbox(&c)) { + if (isolate_prefork(&c)) { err("Failed to sandbox process, exiting\n"); exit(EXIT_FAILURE); } @@ -297,9 +297,7 @@ int main(int argc, char **argv) else write_pidfile(pidfile_fd, getpid()); - prctl(PR_SET_DUMPABLE, 0); - - seccomp(&c); + isolate_postfork(&c); timer_init(&c, &now);
Just spotted a typo: On Tue, 11 Oct 2022 16:40:13 +1100 David Gibson <david(a)gibson.dropbear.id.au> wrote:@@ -59,12 +101,31 @@ void drop_caps(void) } } +/** + * isolate_initial() - Early, config independent self isolation + * + * Should: + * - drop unneeded capabilities + * Musn't: + * - remove filessytem access (we need to access files during setup)filesystem -- Stefano
On Thu, Oct 13, 2022 at 02:49:19PM +0200, Stefano Brivio wrote:Just spotted a typo: On Tue, 11 Oct 2022 16:40:13 +1100 David Gibson <david(a)gibson.dropbear.id.au> wrote:Corrected. -- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson@@ -59,12 +101,31 @@ void drop_caps(void) } } +/** + * isolate_initial() - Early, config independent self isolation + * + * Should: + * - drop unneeded capabilities + * Musn't: + * - remove filessytem access (we need to access files during setup)filesystem
In a few places we use the FWRITE() macro to open a file, replace it's contents with a given string and close it again. There's no real reason this needs to be a macro rather than just a function though. Turn it into a function 'write_file()' and make some ancillary cleanups while we're there: - Add a return code so the caller can handle giving a useful error message - Handle the case of short write()s (unlikely, but possible) - Add O_TRUNC, to make sure we replace the existing contents entirely Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- isolation.c | 17 +++++++++-------- pasta.c | 4 ++-- util.c | 33 +++++++++++++++++++++++++++++++++ util.h | 13 +------------ 4 files changed, 45 insertions(+), 22 deletions(-) diff --git a/isolation.c b/isolation.c index 10cef05..4aa75e6 100644 --- a/isolation.c +++ b/isolation.c @@ -129,7 +129,8 @@ void isolate_initial(void) */ void isolate_user(uid_t uid, gid_t gid, bool use_userns, const char *userns) { - char nsmap[BUFSIZ]; + char uidmap[BUFSIZ]; + char gidmap[BUFSIZ]; /* First set our UID & GID in the original namespace */ if (setgroups(0, NULL)) { @@ -184,14 +185,14 @@ void isolate_user(uid_t uid, gid_t gid, bool use_userns, const char *userns) } /* Configure user and group mappings */ - snprintf(nsmap, BUFSIZ, "0 %u 1", uid); - FWRITE("/proc/self/uid_map", nsmap, "Cannot set uid_map in namespace"); + snprintf(uidmap, BUFSIZ, "0 %u 1", uid); + snprintf(gidmap, BUFSIZ, "0 %u 1", gid); - FWRITE("/proc/self/setgroups", "deny", - "Cannot write to setgroups in namespace"); - - snprintf(nsmap, BUFSIZ, "0 %u 1", gid); - FWRITE("/proc/self/gid_map", nsmap, "Cannot set gid_map in namespace"); + if (write_file("/proc/self/uid_map", uidmap) || + write_file("/proc/self/setgroups", "deny") || + write_file("/proc/self/gid_map", gidmap)) { + warn("Couldn't configure user namespace"); + } } /** diff --git a/pasta.c b/pasta.c index 4ff322c..0ab2fe4 100644 --- a/pasta.c +++ b/pasta.c @@ -167,8 +167,8 @@ static int pasta_setup_ns(void *arg) const struct pasta_setup_ns_arg *a = (const struct pasta_setup_ns_arg *)arg; - FWRITE("/proc/sys/net/ipv4/ping_group_range", "0 0", - "Cannot set ping_group_range, ICMP requests might fail"); + if (write_file("/proc/sys/net/ipv4/ping_group_range", "0 0")) + warn("Cannot set ping_group_range, ICMP requests might fail"); execvp(a->exe, a->argv); diff --git a/util.c b/util.c index 6b86ead..93b93b1 100644 --- a/util.c +++ b/util.c @@ -547,3 +547,36 @@ int fls(unsigned long x) return y; } + +/** + * write_file() - Replace contents of file with a string + * @path: File to write + * @buf: String to write + * + * Return: 0 on success, -1 on any error + */ +int write_file(const char *path, const char *buf) +{ + int fd = open(path, O_WRONLY | O_TRUNC | O_CLOEXEC); + size_t len = strlen(buf); + + if (fd < 0) { + warn("Could not open %s: %s", path, strerror(errno)); + return -1; + } + + while (len) { + ssize_t rc = write(fd, buf, len); + + if (rc < 0) { + warn("Couldn't write to %s: %s", path, strerror(errno)); + break; + } + + buf += rc; + len -= rc; + } + + close(fd); + return len == 0 ? 0 : -1; +} diff --git a/util.h b/util.h index 0c06e34..f957522 100644 --- a/util.h +++ b/util.h @@ -58,18 +58,6 @@ void trace_init(int enable); #define TMPDIR "/tmp" #endif -#define FWRITE(path, buf, str) \ - do { \ - int flags = O_WRONLY | O_CLOEXEC; \ - int fd = open(path, flags); \ - \ - if (fd < 0 || \ - write(fd, buf, strlen(buf)) != (int)strlen(buf)) \ - warn(str); \ - if (fd >= 0) \ - close(fd); \ - } while (0) - #define V4 0 #define V6 1 #define IP_VERSIONS 2 @@ -215,5 +203,6 @@ int ns_enter(const struct ctx *c); void write_pidfile(int fd, pid_t pid); int __daemon(int pidfile_fd, int devnull_fd); int fls(unsigned long x); +int write_file(const char *path, const char *buf); #endif /* UTIL_H */ -- 2.37.3
On Tue, 11 Oct 2022 16:40:14 +1100 David Gibson <david(a)gibson.dropbear.id.au> wrote:In a few places we use the FWRITE() macro to open a file, replace it's contents with a given string and close it again. There's no real reason this needs to be a macro rather than just a function though.Well, there's a bit of a reason: it gives more descriptive messages in isolate_user() with the same LoCs. On the other hand I would also prefer a function here, probably better than the alternative I'm not sure why this series needs this by the way.Turn it into a function 'write_file()' and make some ancillary cleanups while we're there: - Add a return code so the caller can handle giving a useful error message - Handle the case of short write()s (unlikely, but possible) - Add O_TRUNC, to make sure we replace the existing contents entirely Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- isolation.c | 17 +++++++++-------- pasta.c | 4 ++-- util.c | 33 +++++++++++++++++++++++++++++++++ util.h | 13 +------------ 4 files changed, 45 insertions(+), 22 deletions(-) diff --git a/isolation.c b/isolation.c index 10cef05..4aa75e6 100644 --- a/isolation.c +++ b/isolation.c @@ -129,7 +129,8 @@ void isolate_initial(void) */ void isolate_user(uid_t uid, gid_t gid, bool use_userns, const char *userns) { - char nsmap[BUFSIZ]; + char uidmap[BUFSIZ]; + char gidmap[BUFSIZ]; /* First set our UID & GID in the original namespace */ if (setgroups(0, NULL)) { @@ -184,14 +185,14 @@ void isolate_user(uid_t uid, gid_t gid, bool use_userns, const char *userns) } /* Configure user and group mappings */ - snprintf(nsmap, BUFSIZ, "0 %u 1", uid); - FWRITE("/proc/self/uid_map", nsmap, "Cannot set uid_map in namespace"); + snprintf(uidmap, BUFSIZ, "0 %u 1", uid); + snprintf(gidmap, BUFSIZ, "0 %u 1", gid); - FWRITE("/proc/self/setgroups", "deny", - "Cannot write to setgroups in namespace"); - - snprintf(nsmap, BUFSIZ, "0 %u 1", gid); - FWRITE("/proc/self/gid_map", nsmap, "Cannot set gid_map in namespace"); + if (write_file("/proc/self/uid_map", uidmap) || + write_file("/proc/self/setgroups", "deny") || + write_file("/proc/self/gid_map", gidmap)) { + warn("Couldn't configure user namespace");It's unlikely that we can write to uid_map but not to setgroups. Still, having separate messages, as we had them, could help investigating some issues.+ } } /** diff --git a/pasta.c b/pasta.c index 4ff322c..0ab2fe4 100644 --- a/pasta.c +++ b/pasta.c @@ -167,8 +167,8 @@ static int pasta_setup_ns(void *arg) const struct pasta_setup_ns_arg *a = (const struct pasta_setup_ns_arg *)arg; - FWRITE("/proc/sys/net/ipv4/ping_group_range", "0 0", - "Cannot set ping_group_range, ICMP requests might fail"); + if (write_file("/proc/sys/net/ipv4/ping_group_range", "0 0")) + warn("Cannot set ping_group_range, ICMP requests might fail"); execvp(a->exe, a->argv); diff --git a/util.c b/util.c index 6b86ead..93b93b1 100644 --- a/util.c +++ b/util.c @@ -547,3 +547,36 @@ int fls(unsigned long x) return y; } + +/** + * write_file() - Replace contents of file with a string + * @path: File to write + * @buf: String to write + * + * Return: 0 on success, -1 on any error + */ +int write_file(const char *path, const char *buf)We could also use this for the PID file by optionally taking a file number, but I haven't tried how it looks like.+{ + int fd = open(path, O_WRONLY | O_TRUNC | O_CLOEXEC); + size_t len = strlen(buf); + + if (fd < 0) { + warn("Could not open %s: %s", path, strerror(errno)); + return -1; + } + + while (len) { + ssize_t rc = write(fd, buf, len); + + if (rc < 0) {I would change this to <= 0. Not that it matters with write(), but should we ever change that another function, we run the (absolutely not critical) risk of forgetting to adjust this and get stuck in a loop here.+ warn("Couldn't write to %s: %s", path, strerror(errno)); + break; + } + + buf += rc; + len -= rc; + } + + close(fd); + return len == 0 ? 0 : -1; +} diff --git a/util.h b/util.h index 0c06e34..f957522 100644 --- a/util.h +++ b/util.h @@ -58,18 +58,6 @@ void trace_init(int enable); #define TMPDIR "/tmp" #endif -#define FWRITE(path, buf, str) \ - do { \ - int flags = O_WRONLY | O_CLOEXEC; \ - int fd = open(path, flags); \ - \ - if (fd < 0 || \ - write(fd, buf, strlen(buf)) != (int)strlen(buf)) \ - warn(str); \ - if (fd >= 0) \ - close(fd); \ - } while (0) - #define V4 0 #define V6 1 #define IP_VERSIONS 2 @@ -215,5 +203,6 @@ int ns_enter(const struct ctx *c); void write_pidfile(int fd, pid_t pid); int __daemon(int pidfile_fd, int devnull_fd); int fls(unsigned long x); +int write_file(const char *path, const char *buf); #endif /* UTIL_H */-- Stefano
On Thu, Oct 13, 2022 at 04:17:09AM +0200, Stefano Brivio wrote:On Tue, 11 Oct 2022 16:40:14 +1100 David Gibson <david(a)gibson.dropbear.id.au> wrote:Right, but there is also a message in write_file() itself.In a few places we use the FWRITE() macro to open a file, replace it's contents with a given string and close it again. There's no real reason this needs to be a macro rather than just a function though.Well, there's a bit of a reason: it gives more descriptive messages in isolate_user() with the same LoCs. On the other hand I would also prefer a function here, probably better than the alternative I'm not sure why this series needs this by the way.Turn it into a function 'write_file()' and make some ancillary cleanups while we're there: - Add a return code so the caller can handle giving a useful error message - Handle the case of short write()s (unlikely, but possible) - Add O_TRUNC, to make sure we replace the existing contents entirely Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- isolation.c | 17 +++++++++-------- pasta.c | 4 ++-- util.c | 33 +++++++++++++++++++++++++++++++++ util.h | 13 +------------ 4 files changed, 45 insertions(+), 22 deletions(-) diff --git a/isolation.c b/isolation.c index 10cef05..4aa75e6 100644 --- a/isolation.c +++ b/isolation.c @@ -129,7 +129,8 @@ void isolate_initial(void) */ void isolate_user(uid_t uid, gid_t gid, bool use_userns, const char *userns) { - char nsmap[BUFSIZ]; + char uidmap[BUFSIZ]; + char gidmap[BUFSIZ]; /* First set our UID & GID in the original namespace */ if (setgroups(0, NULL)) { @@ -184,14 +185,14 @@ void isolate_user(uid_t uid, gid_t gid, bool use_userns, const char *userns) } /* Configure user and group mappings */ - snprintf(nsmap, BUFSIZ, "0 %u 1", uid); - FWRITE("/proc/self/uid_map", nsmap, "Cannot set uid_map in namespace"); + snprintf(uidmap, BUFSIZ, "0 %u 1", uid); + snprintf(gidmap, BUFSIZ, "0 %u 1", gid); - FWRITE("/proc/self/setgroups", "deny", - "Cannot write to setgroups in namespace"); - - snprintf(nsmap, BUFSIZ, "0 %u 1", gid); - FWRITE("/proc/self/gid_map", nsmap, "Cannot set gid_map in namespace"); + if (write_file("/proc/self/uid_map", uidmap) || + write_file("/proc/self/setgroups", "deny") || + write_file("/proc/self/gid_map", gidmap)) { + warn("Couldn't configure user namespace");It's unlikely that we can write to uid_map but not to setgroups. Still, having separate messages, as we had them, could help investigating some issues.Yeah, maybe later.+ } } /** diff --git a/pasta.c b/pasta.c index 4ff322c..0ab2fe4 100644 --- a/pasta.c +++ b/pasta.c @@ -167,8 +167,8 @@ static int pasta_setup_ns(void *arg) const struct pasta_setup_ns_arg *a = (const struct pasta_setup_ns_arg *)arg; - FWRITE("/proc/sys/net/ipv4/ping_group_range", "0 0", - "Cannot set ping_group_range, ICMP requests might fail"); + if (write_file("/proc/sys/net/ipv4/ping_group_range", "0 0")) + warn("Cannot set ping_group_range, ICMP requests might fail"); execvp(a->exe, a->argv); diff --git a/util.c b/util.c index 6b86ead..93b93b1 100644 --- a/util.c +++ b/util.c @@ -547,3 +547,36 @@ int fls(unsigned long x) return y; } + +/** + * write_file() - Replace contents of file with a string + * @path: File to write + * @buf: String to write + * + * Return: 0 on success, -1 on any error + */ +int write_file(const char *path, const char *buf)We could also use this for the PID file by optionally taking a file number, but I haven't tried how it looks like.Fair enough, done.+{ + int fd = open(path, O_WRONLY | O_TRUNC | O_CLOEXEC); + size_t len = strlen(buf); + + if (fd < 0) { + warn("Could not open %s: %s", path, strerror(errno)); + return -1; + } + + while (len) { + ssize_t rc = write(fd, buf, len); + + if (rc < 0) {I would change this to <= 0. Not that it matters with write(), but should we ever change that another function, we run the (absolutely not critical) risk of forgetting to adjust this and get stuck in a loop here.-- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson+ warn("Couldn't write to %s: %s", path, strerror(errno)); + break; + } + + buf += rc; + len -= rc; + } + + close(fd); + return len == 0 ? 0 : -1; +} diff --git a/util.h b/util.h index 0c06e34..f957522 100644 --- a/util.h +++ b/util.h @@ -58,18 +58,6 @@ void trace_init(int enable); #define TMPDIR "/tmp" #endif -#define FWRITE(path, buf, str) \ - do { \ - int flags = O_WRONLY | O_CLOEXEC; \ - int fd = open(path, flags); \ - \ - if (fd < 0 || \ - write(fd, buf, strlen(buf)) != (int)strlen(buf)) \ - warn(str); \ - if (fd >= 0) \ - close(fd); \ - } while (0) - #define V4 0 #define V6 1 #define IP_VERSIONS 2 @@ -215,5 +203,6 @@ int ns_enter(const struct ctx *c); void write_pidfile(int fd, pid_t pid); int __daemon(int pidfile_fd, int devnull_fd); int fls(unsigned long x); +int write_file(const char *path, const char *buf); #endif /* UTIL_H */
The current implementation of drop_caps() doesn't really work because it attempts to drop capabilities from the bounding set. hat's not the set that really matters: the bounding set is about limiting the abilities of otherwise things we might later exec() rather than our own capabilities. In addition altering the bounding set requires CAP_SETPCAP which we won't usually have. Replace it with a new version which uses setcap(2) to drop capabilities from the effective and permitted sets, which is what actually matters for most purposes. For now we leave the inheritable set alone, since we don't want to preclude the user from passing inheritable capabilities to the command spawed by pasta. Correctly dropping caps reveals that we actually need CAP_SYS_ADMIN within the userns we create/join in pasta mode, so that we can later setns() to the netns within it. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- isolation.c | 52 ++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 44 insertions(+), 8 deletions(-) diff --git a/isolation.c b/isolation.c index 4aa75e6..2468f84 100644 --- a/isolation.c +++ b/isolation.c @@ -86,18 +86,37 @@ #include "passt.h" #include "isolation.h" +#define CAP_VERSION _LINUX_CAPABILITY_VERSION_3 +#define CAP_WORDS _LINUX_CAPABILITY_U32S_3 + /** - * drop_caps() - Drop capabilities we might have except for CAP_NET_BIND_SERVICE + * drop_caps_ep_except() - Drop capabilities from effective & permitted sets + * @keep: Capabilities to keep */ -static void drop_caps(void) +static void drop_caps_ep_except(uint64_t keep) { + struct __user_cap_header_struct hdr = { + .version = CAP_VERSION, + .pid = 0, + }; + struct __user_cap_data_struct data[CAP_WORDS]; int i; - for (i = 0; i < 64; i++) { - if (i == CAP_NET_BIND_SERVICE) - continue; + if (syscall(SYS_capget, &hdr, data)) { + err("Couldn't get current capabilities: %s", strerror(errno)); + exit(EXIT_FAILURE); + } + + for (i = 0; i < CAP_WORDS; i++) { + uint32_t mask = keep >> (32 * i); + + data[i].effective &= mask; + data[i].permitted &= mask; + } - prctl(PR_CAPBSET_DROP, i, 0, 0, 0); + if (syscall(SYS_capset, &hdr, data)) { + err("Couldn't drop capabilities: %s", strerror(errno)); + exit(EXIT_FAILURE); } } @@ -111,7 +130,11 @@ static void drop_caps(void) */ void isolate_initial(void) { - drop_caps(); + /* We want to keep CAP_NET_BIND_SERVICE in the initial + * namespace if we have it, so that we can forward low ports + * into the guest/namespace + */ + drop_caps_ep_except((1UL << CAP_NET_BIND_SERVICE)); } /** @@ -211,6 +234,7 @@ void isolate_user(uid_t uid, gid_t gid, bool use_userns, const char *userns) int isolate_prefork(struct ctx *c) { int flags = CLONE_NEWIPC | CLONE_NEWNS | CLONE_NEWUTS; + uint64_t ns_caps = 0; /* If we run in foreground, we have no chance to actually move to a new * PID namespace. For passt, use CLONE_NEWPID anyway, in case somebody @@ -251,7 +275,19 @@ int isolate_prefork(struct ctx *c) return -errno; } - drop_caps(); /* Relative to the new user namespace this time. */ + /* Drop capabilites in our new userns */ + if (c->mode == MODE_PASTA) { + /* Keep CAP_SYS_ADMIN, so that we can setns() to the + * netns when we need to act upon it + */ + ns_caps |= 1UL << CAP_SYS_ADMIN; + /* Keep CAP_NET_BIND_SERVICE, so we can splice + * outbound connections to low port numbers + */ + ns_caps |= 1UL << CAP_NET_BIND_SERVICE; + } + + drop_caps_ep_except(ns_caps); return 0; } -- 2.37.3
Well, this drop_caps() is pretty much the same as patch 8/10, so it actually did something. :) On Tue, 11 Oct 2022 16:40:15 +1100 David Gibson <david(a)gibson.dropbear.id.au> wrote:The current implementation of drop_caps() doesn't really work because it attempts to drop capabilities from the bounding set. hat's not the set that really matters: the bounding set is about limiting the abilities of otherwise things we might later exec() rather than our own capabilities. In addition altering the bounding set requires CAP_SETPCAP which we won't usually have. Replace it with a new version which uses setcap(2) to drop capabilities from the effective and permitted sets, which is what actually matters for most purposes. For now we leave the inheritable set alone, since we don't want to preclude the user from passing inheritable capabilities to the command spawed by pasta. Correctly dropping caps reveals that we actually need CAP_SYS_ADMIN within the userns we create/join in pasta mode, so that we can later setns() to the netns within it. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- isolation.c | 52 ++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 44 insertions(+), 8 deletions(-) diff --git a/isolation.c b/isolation.c index 4aa75e6..2468f84 100644 --- a/isolation.c +++ b/isolation.c @@ -86,18 +86,37 @@ #include "passt.h" #include "isolation.h" +#define CAP_VERSION _LINUX_CAPABILITY_VERSION_3 +#define CAP_WORDS _LINUX_CAPABILITY_U32S_3 + /** - * drop_caps() - Drop capabilities we might have except for CAP_NET_BIND_SERVICE + * drop_caps_ep_except() - Drop capabilities from effective & permitted sets + * @keep: Capabilities to keep */ -static void drop_caps(void) +static void drop_caps_ep_except(uint64_t keep) { + struct __user_cap_header_struct hdr = { + .version = CAP_VERSION, + .pid = 0, + }; + struct __user_cap_data_struct data[CAP_WORDS]; int i; - for (i = 0; i < 64; i++) { - if (i == CAP_NET_BIND_SERVICE) - continue; + if (syscall(SYS_capget, &hdr, data)) { + err("Couldn't get current capabilities: %s", strerror(errno)); + exit(EXIT_FAILURE); + } + + for (i = 0; i < CAP_WORDS; i++) { + uint32_t mask = keep >> (32 * i); + + data[i].effective &= mask; + data[i].permitted &= mask; + } - prctl(PR_CAPBSET_DROP, i, 0, 0, 0); + if (syscall(SYS_capset, &hdr, data)) { + err("Couldn't drop capabilities: %s", strerror(errno)); + exit(EXIT_FAILURE); } } @@ -111,7 +130,11 @@ static void drop_caps(void) */ void isolate_initial(void) { - drop_caps(); + /* We want to keep CAP_NET_BIND_SERVICE in the initial + * namespace if we have it, so that we can forward low ports + * into the guest/namespace + */ + drop_caps_ep_except((1UL << CAP_NET_BIND_SERVICE));You could use BIT() (util.h) here,} /** @@ -211,6 +234,7 @@ void isolate_user(uid_t uid, gid_t gid, bool use_userns, const char *userns) int isolate_prefork(struct ctx *c) { int flags = CLONE_NEWIPC | CLONE_NEWNS | CLONE_NEWUTS; + uint64_t ns_caps = 0; /* If we run in foreground, we have no chance to actually move to a new * PID namespace. For passt, use CLONE_NEWPID anyway, in case somebody @@ -251,7 +275,19 @@ int isolate_prefork(struct ctx *c) return -errno; } - drop_caps(); /* Relative to the new user namespace this time. */ + /* Drop capabilites in our new userns */ + if (c->mode == MODE_PASTA) { + /* Keep CAP_SYS_ADMIN, so that we can setns() to the + * netns when we need to act upon it + */ + ns_caps |= 1UL << CAP_SYS_ADMIN;here,+ /* Keep CAP_NET_BIND_SERVICE, so we can splice + * outbound connections to low port numbers + */ + ns_caps |= 1UL << CAP_NET_BIND_SERVICE;and here.+ } + + drop_caps_ep_except(ns_caps); return 0; }-- Stefano
On Thu, Oct 13, 2022 at 04:18:24AM +0200, Stefano Brivio wrote:Well, this drop_caps() is pretty much the same as patch 8/10, so it actually did something. :)Yes, but not what we wanted :).On Tue, 11 Oct 2022 16:40:15 +1100 David Gibson <david(a)gibson.dropbear.id.au> wrote:Ah, yes, done.The current implementation of drop_caps() doesn't really work because it attempts to drop capabilities from the bounding set. hat's not the set that really matters: the bounding set is about limiting the abilities of otherwise things we might later exec() rather than our own capabilities. In addition altering the bounding set requires CAP_SETPCAP which we won't usually have. Replace it with a new version which uses setcap(2) to drop capabilities from the effective and permitted sets, which is what actually matters for most purposes. For now we leave the inheritable set alone, since we don't want to preclude the user from passing inheritable capabilities to the command spawed by pasta. Correctly dropping caps reveals that we actually need CAP_SYS_ADMIN within the userns we create/join in pasta mode, so that we can later setns() to the netns within it. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- isolation.c | 52 ++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 44 insertions(+), 8 deletions(-) diff --git a/isolation.c b/isolation.c index 4aa75e6..2468f84 100644 --- a/isolation.c +++ b/isolation.c @@ -86,18 +86,37 @@ #include "passt.h" #include "isolation.h" +#define CAP_VERSION _LINUX_CAPABILITY_VERSION_3 +#define CAP_WORDS _LINUX_CAPABILITY_U32S_3 + /** - * drop_caps() - Drop capabilities we might have except for CAP_NET_BIND_SERVICE + * drop_caps_ep_except() - Drop capabilities from effective & permitted sets + * @keep: Capabilities to keep */ -static void drop_caps(void) +static void drop_caps_ep_except(uint64_t keep) { + struct __user_cap_header_struct hdr = { + .version = CAP_VERSION, + .pid = 0, + }; + struct __user_cap_data_struct data[CAP_WORDS]; int i; - for (i = 0; i < 64; i++) { - if (i == CAP_NET_BIND_SERVICE) - continue; + if (syscall(SYS_capget, &hdr, data)) { + err("Couldn't get current capabilities: %s", strerror(errno)); + exit(EXIT_FAILURE); + } + + for (i = 0; i < CAP_WORDS; i++) { + uint32_t mask = keep >> (32 * i); + + data[i].effective &= mask; + data[i].permitted &= mask; + } - prctl(PR_CAPBSET_DROP, i, 0, 0, 0); + if (syscall(SYS_capset, &hdr, data)) { + err("Couldn't drop capabilities: %s", strerror(errno)); + exit(EXIT_FAILURE); } } @@ -111,7 +130,11 @@ static void drop_caps(void) */ void isolate_initial(void) { - drop_caps(); + /* We want to keep CAP_NET_BIND_SERVICE in the initial + * namespace if we have it, so that we can forward low ports + * into the guest/namespace + */ + drop_caps_ep_except((1UL << CAP_NET_BIND_SERVICE));You could use BIT() (util.h) here,-- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson} /** @@ -211,6 +234,7 @@ void isolate_user(uid_t uid, gid_t gid, bool use_userns, const char *userns) int isolate_prefork(struct ctx *c) { int flags = CLONE_NEWIPC | CLONE_NEWNS | CLONE_NEWUTS; + uint64_t ns_caps = 0; /* If we run in foreground, we have no chance to actually move to a new * PID namespace. For passt, use CLONE_NEWPID anyway, in case somebody @@ -251,7 +275,19 @@ int isolate_prefork(struct ctx *c) return -errno; } - drop_caps(); /* Relative to the new user namespace this time. */ + /* Drop capabilites in our new userns */ + if (c->mode == MODE_PASTA) { + /* Keep CAP_SYS_ADMIN, so that we can setns() to the + * netns when we need to act upon it + */ + ns_caps |= 1UL << CAP_SYS_ADMIN;here,+ /* Keep CAP_NET_BIND_SERVICE, so we can splice + * outbound connections to low port numbers + */ + ns_caps |= 1UL << CAP_NET_BIND_SERVICE;and here.+ } + + drop_caps_ep_except(ns_caps); return 0; }
On Tue, 11 Oct 2022 16:40:15 +1100 David Gibson <david(a)gibson.dropbear.id.au> wrote:@@ -251,7 +275,19 @@ int isolate_prefork(struct ctx *c) return -errno; } - drop_caps(); /* Relative to the new user namespace this time. */ + /* Drop capabilites in our new userns */ + if (c->mode == MODE_PASTA) { + /* Keep CAP_SYS_ADMIN, so that we can setns() to the + * netns when we need to act upon it + */ + ns_caps |= 1UL << CAP_SYS_ADMIN; + /* Keep CAP_NET_BIND_SERVICE, so we can splice + * outbound connections to low port numbers + */ + ns_caps |= 1UL << CAP_NET_BIND_SERVICE; + } + + drop_caps_ep_except(ns_caps);Hmm, I didn't really look into this yet, but there seems to be an issue with filesystem-bound network namespaces now. Running something like: pasta --config-net --netns /run/user/1000/netns/netns-6466ff4b-1efc-2b58-685b-cbc12feb9ccc (from Podman), this happens: readlink("/proc/self/exe", "/usr/local/bin/passt.avx2", 4095) = 25 capget({version=_LINUX_CAPABILITY_VERSION_3, pid=0}, {effective=1<<CAP_CHOWN|1<<CAP_DAC_OVERRIDE|1<<CAP_DAC_READ_SEARCH|1<<CAP_FOWNER|1<<CAP_FSETID|1<<CAP_KILL|1<<CAP_SETGID|1<<CAP_SETUID|1<<CAP_SETPCAP|1<<CAP_LINUX_IMMUTABLE|1<<CAP_NET_BIND_SERVICE|1<<CAP_NET_BROADCAST|1<<CAP_NET_ADMIN|1<<CAP_NET_RAW|1<<CAP_IPC_LOCK|1<<CAP_IPC_OWNER|1<<CAP_SYS_MODULE|1<<CAP_SYS_RAWIO|1<<CAP_SYS_CHROOT|1<<CAP_SYS_PTRACE|1<<CAP_SYS_PACCT|1<<CAP_SYS_ADMIN|1<<CAP_SYS_BOOT|1<<CAP_SYS_NICE|1<<CAP_SYS_RESOURCE|1<<CAP_SYS_TIME|1<<CAP_SYS_TTY_CONFIG|1<<CAP_MKNOD|1<<CAP_LEASE|1<<CAP_AUDIT_WRITE|1<<CAP_AUDIT_CONTROL|1<<CAP_SETFCAP|1<<CAP_MAC_OVERRIDE|1<<CAP_MAC_ADMIN|1<<CAP_SYSLOG|1<<CAP_WAKE_ALARM|1<<CAP_BLOCK_SUSPEND|1<<CAP_AUDIT_READ|1<<CAP_PERFMON|1<<CAP_BPF|1<<CAP_CHECKPOINT_RESTORE, permitted=1<<CAP_CHOWN|1<<CAP_DAC_OVERRIDE|1<<CAP_DAC_READ_SEARCH|1<<CAP_FOWNER|1<<CAP_FSETID|1<<CAP_KILL|1<<CAP_SETGID|1<<CAP_SETUID|1<<CAP_SETPCAP|1<<CAP_LINUX_IMMUTABLE|1<<CAP_NET_BIND_SERVICE|1<<CAP_NET_BROADCAST|1<<CAP_N ET_ADMIN|1<<CAP_NET_RAW|1<<CAP_IPC_LOCK|1<<CAP_IPC_OWNER|1<<CAP_SYS_MODULE|1<<CAP_SYS_RAWIO|1<<CAP_SYS_CHROOT|1<<CAP_SYS_PTRACE|1<<CAP_SYS_PACCT|1<<CAP_SYS_ADMIN|1<<CAP_SYS_BOOT|1<<CAP_SYS_NICE|1<<CAP_SYS_RESOURCE|1<<CAP_SYS_TIME|1<<CAP_SYS_TTY_CONFIG|1<<CAP_MKNOD|1<<CAP_LEASE|1<<CAP_AUDIT_WRITE|1<<CAP_AUDIT_CONTROL|1<<CAP_SETFCAP|1<<CAP_MAC_OVERRIDE|1<<CAP_MAC_ADMIN|1<<CAP_SYSLOG|1<<CAP_WAKE_ALARM|1<<CAP_BLOCK_SUSPEND|1<<CAP_AUDIT_READ|1<<CAP_PERFMON|1<<CAP_BPF|1<<CAP_CHECKPOINT_RESTORE, inheritable=0}) = 0 capset({version=_LINUX_CAPABILITY_VERSION_3, pid=0}, {effective=1<<CAP_NET_BIND_SERVICE, permitted=1<<CAP_NET_BIND_SERVICE, inheritable=0}) = 0 [...] getegid() = 0 openat(AT_FDCWD, "/proc/self/uid_map", O_RDONLY|O_CLOEXEC) = 7 read(7, " 0 1000 1"..., 8192) = 66 close(7) = 0 setgroups(0, NULL) = -1 EPERM (Operation not permitted) setgid(0) = 0 setuid(0) = 0 openat(AT_FDCWD, "/run/user/1000/netns/netns-6466ff4b-1efc-2b58-685b-cbc12feb9ccc", O_RDONLY|O_CLOEXEC) = 7 clone(child_stack=0x7ffef5a229a0, flags=CLONE_VM|CLONE_FILES|CLONE_VFORK|SIGCHLDstrace: Process 1763223 attached <unfinished ...> [pid 1763223] setns(7, CLONE_NEWNET) = -1 EPERM (Operation not permitted) [pid 1763223] exit(0) = ? [pid 1763222] <... clone resumed>) = 1763223 [pid 1763223] +++ exited with 0 +++ --- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_EXITED, si_pid=1763223, si_uid=0, si_status=0, si_utime=0, si_stime=0} --- waitid(P_ALL, 0, NULL, WNOHANG|WEXITED, NULL) = 0 waitid(P_ALL, 0, NULL, WNOHANG|WEXITED, NULL) = -1 ECHILD (No child processes) rt_sigreturn({mask=[]}) = 1763223 sendto(5, "<3> Couldn't switch to pasta nam"..., 40, 0, NULL, 0) = 40 write(2, "Couldn't switch to pasta namespa"..., 35Couldn't switch to pasta namespaces) = 35 write(2, "\n", 1 -- Stefano
On Thu, 13 Oct 2022 06:01:19 +0200 Stefano Brivio <sbrivio(a)redhat.com> wrote:On Tue, 11 Oct 2022 16:40:15 +1100 David Gibson <david(a)gibson.dropbear.id.au> wrote:Ah, "of course". Podman calls us with UID 0 in the user namespace it just created, so if we drop CAP_SYS_ADMIN in isolate_initial() we can't join the network namespace, and if we drop CAP_NET_ADMIN we can't configure it. So for that case (and only for that, I suppose), we need something like (tested): diff --git a/isolation.c b/isolation.c index 1769180..fee6dbd 100644 --- a/isolation.c +++ b/isolation.c @@ -190,7 +190,7 @@ void isolate_initial(void) * namespace if we have it, so that we can forward low ports * into the guest/namespace */ - drop_caps_ep_except((1UL << CAP_NET_BIND_SERVICE)); + drop_caps_ep_except(BIT(CAP_SYS_ADMIN) | BIT(CAP_NET_ADMIN)); } ...which is a bit pointless. Better than *any* capability, but not by far. So, if we make this totally independent from configuration, we need those two capabilities. We could add a "postconf" stage and cover a tiny bit more of conf.c. Or we could have a special path in isolate_initial() for the case we know we're not in the init namespace. I'm not sure. If you have a specific preference/strong opinion I would actually be happier. :) -- Stefano@@ -251,7 +275,19 @@ int isolate_prefork(struct ctx *c) return -errno; } - drop_caps(); /* Relative to the new user namespace this time. */ + /* Drop capabilites in our new userns */ + if (c->mode == MODE_PASTA) { + /* Keep CAP_SYS_ADMIN, so that we can setns() to the + * netns when we need to act upon it + */ + ns_caps |= 1UL << CAP_SYS_ADMIN; + /* Keep CAP_NET_BIND_SERVICE, so we can splice + * outbound connections to low port numbers + */ + ns_caps |= 1UL << CAP_NET_BIND_SERVICE; + } + + drop_caps_ep_except(ns_caps);Hmm, I didn't really look into this yet, but there seems to be an issue with filesystem-bound network namespaces now. Running something like: pasta --config-net --netns /run/user/1000/netns/netns-6466ff4b-1efc-2b58-685b-cbc12feb9ccc (from Podman), this happens: [...] [pid 1763223] setns(7, CLONE_NEWNET) = -1 EPERM (Operation not permitted)
On Thu, 13 Oct 2022 15:08:02 +0200 Stefano Brivio <sbrivio(a)redhat.com> wrote:On Thu, 13 Oct 2022 06:01:19 +0200 Stefano Brivio <sbrivio(a)redhat.com> wrote:Further on, if we are started as root, we'll fail to drop to 'nobody' or any other user, if we lose CAP_SETUID and CAP_SETGID here. I have tested this version of isolate_initial(): drop_caps_ep_except(BIT(CAP_SYS_ADMIN) | BIT(CAP_NET_ADMIN) | BIT(CAP_SETGID) | BIT(CAP_SETUID) | BIT(CAP_NET_BIND_SERVICE)); for any use case I can reasonably think of. Yes, it's a lot -- we should make it really clear that those are not the capabilities we actually use at "run time". -- StefanoOn Tue, 11 Oct 2022 16:40:15 +1100 David Gibson <david(a)gibson.dropbear.id.au> wrote:Ah, "of course". Podman calls us with UID 0 in the user namespace it just created, so if we drop CAP_SYS_ADMIN in isolate_initial() we can't join the network namespace, and if we drop CAP_NET_ADMIN we can't configure it. So for that case (and only for that, I suppose), we need something like (tested): diff --git a/isolation.c b/isolation.c index 1769180..fee6dbd 100644 --- a/isolation.c +++ b/isolation.c @@ -190,7 +190,7 @@ void isolate_initial(void) * namespace if we have it, so that we can forward low ports * into the guest/namespace */ - drop_caps_ep_except((1UL << CAP_NET_BIND_SERVICE)); + drop_caps_ep_except(BIT(CAP_SYS_ADMIN) | BIT(CAP_NET_ADMIN)); } ...which is a bit pointless. Better than *any* capability, but not by far. So, if we make this totally independent from configuration, we need those two capabilities. We could add a "postconf" stage and cover a tiny bit more of conf.c. Or we could have a special path in isolate_initial() for the case we know we're not in the init namespace. I'm not sure. If you have a specific preference/strong opinion I would actually be happier. :)@@ -251,7 +275,19 @@ int isolate_prefork(struct ctx *c) return -errno; } - drop_caps(); /* Relative to the new user namespace this time. */ + /* Drop capabilites in our new userns */ + if (c->mode == MODE_PASTA) { + /* Keep CAP_SYS_ADMIN, so that we can setns() to the + * netns when we need to act upon it + */ + ns_caps |= 1UL << CAP_SYS_ADMIN; + /* Keep CAP_NET_BIND_SERVICE, so we can splice + * outbound connections to low port numbers + */ + ns_caps |= 1UL << CAP_NET_BIND_SERVICE; + } + + drop_caps_ep_except(ns_caps);Hmm, I didn't really look into this yet, but there seems to be an issue with filesystem-bound network namespaces now. Running something like: pasta --config-net --netns /run/user/1000/netns/netns-6466ff4b-1efc-2b58-685b-cbc12feb9ccc (from Podman), this happens: [...] [pid 1763223] setns(7, CLONE_NEWNET) = -1 EPERM (Operation not permitted)
On Thu, Oct 13, 2022 at 06:37:05PM +0200, Stefano Brivio wrote:On Thu, 13 Oct 2022 15:08:02 +0200 Stefano Brivio <sbrivio(a)redhat.com> wrote:Ah, right. I didn't think through the --netns-only case. I think what we need to do in the short term at least is: * Add all those caps and a comment in isolate_initial() (or even just remove the drop_caps there) * In isolate user drop the caps we can at that point (SETGID and SETUID) - whether or not we've joined a new userns * Remove the rest of the "runtime" caps in isolate_prefork() like we do now. I'll rework accordingly. -- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibsonOn Thu, 13 Oct 2022 06:01:19 +0200 Stefano Brivio <sbrivio(a)redhat.com> wrote:Further on, if we are started as root, we'll fail to drop to 'nobody' or any other user, if we lose CAP_SETUID and CAP_SETGID here. I have tested this version of isolate_initial(): drop_caps_ep_except(BIT(CAP_SYS_ADMIN) | BIT(CAP_NET_ADMIN) | BIT(CAP_SETGID) | BIT(CAP_SETUID) | BIT(CAP_NET_BIND_SERVICE)); for any use case I can reasonably think of. Yes, it's a lot -- we should make it really clear that those are not the capabilities we actually use at "run time".On Tue, 11 Oct 2022 16:40:15 +1100 David Gibson <david(a)gibson.dropbear.id.au> wrote:Ah, "of course". Podman calls us with UID 0 in the user namespace it just created, so if we drop CAP_SYS_ADMIN in isolate_initial() we can't join the network namespace, and if we drop CAP_NET_ADMIN we can't configure it. So for that case (and only for that, I suppose), we need something like (tested): diff --git a/isolation.c b/isolation.c index 1769180..fee6dbd 100644 --- a/isolation.c +++ b/isolation.c @@ -190,7 +190,7 @@ void isolate_initial(void) * namespace if we have it, so that we can forward low ports * into the guest/namespace */ - drop_caps_ep_except((1UL << CAP_NET_BIND_SERVICE)); + drop_caps_ep_except(BIT(CAP_SYS_ADMIN) | BIT(CAP_NET_ADMIN)); } ...which is a bit pointless. Better than *any* capability, but not by far. So, if we make this totally independent from configuration, we need those two capabilities. We could add a "postconf" stage and cover a tiny bit more of conf.c. Or we could have a special path in isolate_initial() for the case we know we're not in the init namespace. I'm not sure. If you have a specific preference/strong opinion I would actually be happier. :)@@ -251,7 +275,19 @@ int isolate_prefork(struct ctx *c) return -errno; } - drop_caps(); /* Relative to the new user namespace this time. */ + /* Drop capabilites in our new userns */ + if (c->mode == MODE_PASTA) { + /* Keep CAP_SYS_ADMIN, so that we can setns() to the + * netns when we need to act upon it + */ + ns_caps |= 1UL << CAP_SYS_ADMIN; + /* Keep CAP_NET_BIND_SERVICE, so we can splice + * outbound connections to low port numbers + */ + ns_caps |= 1UL << CAP_NET_BIND_SERVICE; + } + + drop_caps_ep_except(ns_caps);Hmm, I didn't really look into this yet, but there seems to be an issue with filesystem-bound network namespaces now. Running something like: pasta --config-net --netns /run/user/1000/netns/netns-6466ff4b-1efc-2b58-685b-cbc12feb9ccc (from Podman), this happens: [...] [pid 1763223] setns(7, CLONE_NEWNET) = -1 EPERM (Operation not permitted)
We drop our own capabilities, but it's possible that processes we exec() could gain extra privilege via file capabilities. It shouldn't be possible for us to exec() anyway due to seccomp() and our filesystem isolation. But just in case, zero the bounding and inheritable capability sets to prevent any such child from gainin privilege. Note that we do this *after* spawning the pasta shell/command (if any), because we do want the user to be able to give that privilege if they want. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- isolation.c | 56 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/isolation.c b/isolation.c index 2468f84..e1a024d 100644 --- a/isolation.c +++ b/isolation.c @@ -120,6 +120,61 @@ static void drop_caps_ep_except(uint64_t keep) } } +/** + * clamp_caps() - Prevent any children from gaining caps + * + * This drops all capabilities from both the inheritable and the + * bounding set. This means that any exec()ed processes can't gain + * capabilities, even if they have file capabilities which would grant + * them. We shouldn't ever exec() in any case, but this provides an + * additional layer of protection. Executing this requires + * CAP_SETPCAP, which we will have within our userns. + * + * Note that dropping capabilites from the bounding set limits + * exec()ed processes, but does not remove them from the effective or + * permitted sets, so it doesn't reduce our own capabilities. + */ +static void clamp_caps(void) +{ + struct __user_cap_header_struct hdr = { + .version = CAP_VERSION, + .pid = 0, + }; + struct __user_cap_data_struct data[CAP_WORDS]; + int i; + + for (i = 0; i < 64; i++) { + /* Some errors can be ignored: + * - EINVAL, we'll get this for all values in 0..63 + * that are not actually allocated caps + * - EPERM, we'll get this if we don't have + * CAP_SETPCAP, which can happen if using + * --netns-only. We don't need CAP_SETPCAP for + * normal operation, so carry on without it. + */ + if (prctl(PR_CAPBSET_DROP, i, 0, 0, 0) && + errno != EINVAL && errno != EPERM) { + err("Couldn't drop cap %i from bounding set: %s", + i, strerror(errno)); + exit(EXIT_FAILURE); + } + } + + if (syscall(SYS_capget, &hdr, data)) { + err("Couldn't get current capabilities: %s", strerror(errno)); + exit(EXIT_FAILURE); + } + + for (i = 0; i < CAP_WORDS; i++) + data[i].inheritable = 0; + + if (syscall(SYS_capset, &hdr, data)) { + err("Couldn't drop inheritable capabilities: %s", + strerror(errno)); + exit(EXIT_FAILURE); + } +} + /** * isolate_initial() - Early, config independent self isolation * @@ -287,6 +342,7 @@ int isolate_prefork(struct ctx *c) ns_caps |= 1UL << CAP_NET_BIND_SERVICE; } + clamp_caps(); drop_caps_ep_except(ns_caps); return 0; -- 2.37.3
On Tue, 11 Oct 2022 16:40:16 +1100 David Gibson <david(a)gibson.dropbear.id.au> wrote:We drop our own capabilities, but it's possible that processes we exec() could gain extra privilege via file capabilities. It shouldn't be possible for us to exec() anyway due to seccomp() and our filesystem isolation. But just in case, zero the bounding and inheritable capability sets to prevent any such child from gainin privilege. Note that we do this *after* spawning the pasta shell/command (if any), because we do want the user to be able to give that privilege if they want. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- isolation.c | 56 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/isolation.c b/isolation.c index 2468f84..e1a024d 100644 --- a/isolation.c +++ b/isolation.c @@ -120,6 +120,61 @@ static void drop_caps_ep_except(uint64_t keep) } } +/** + * clamp_caps() - Prevent any children from gaining caps"clamp" doesn't sound very specific or clear. caps_drop_inherit_bound() would actually tell me what the function does, but it's a bit of a mouthful in comparison. I'm fine with both, really, but if you have a better idea...+ * + * This drops all capabilities from both the inheritable and the + * bounding set. This means that any exec()ed processes can't gain + * capabilities, even if they have file capabilities which would grant + * them. We shouldn't ever exec() in any case, but this provides an + * additional layer of protection. Executing this requires + * CAP_SETPCAP, which we will have within our userns. + * + * Note that dropping capabilites from the bounding set limits + * exec()ed processes, but does not remove them from the effective or + * permitted sets, so it doesn't reduce our own capabilities. + */ +static void clamp_caps(void) +{ + struct __user_cap_header_struct hdr = { + .version = CAP_VERSION, + .pid = 0, + }; + struct __user_cap_data_struct data[CAP_WORDS];For consistency, I'd move this before hdr.+ int i; + + for (i = 0; i < 64; i++) { + /* Some errors can be ignored: + * - EINVAL, we'll get this for all values in 0..63 + * that are not actually allocated caps + * - EPERM, we'll get this if we don't have + * CAP_SETPCAP, which can happen if using + * --netns-only. We don't need CAP_SETPCAP for + * normal operation, so carry on without it. + */ + if (prctl(PR_CAPBSET_DROP, i, 0, 0, 0) && + errno != EINVAL && errno != EPERM) { + err("Couldn't drop cap %i from bounding set: %s", + i, strerror(errno)); + exit(EXIT_FAILURE); + } + } + + if (syscall(SYS_capget, &hdr, data)) { + err("Couldn't get current capabilities: %s", strerror(errno)); + exit(EXIT_FAILURE); + } + + for (i = 0; i < CAP_WORDS; i++) + data[i].inheritable = 0;Any specific reason why? Initialisers can have variable sizes to some extent, but if there's a reason why it can't be done, perhaps that would warrant a comment here.+ + if (syscall(SYS_capset, &hdr, data)) { + err("Couldn't drop inheritable capabilities: %s", + strerror(errno)); + exit(EXIT_FAILURE); + } +} + /** * isolate_initial() - Early, config independent self isolation * @@ -287,6 +342,7 @@ int isolate_prefork(struct ctx *c) ns_caps |= 1UL << CAP_NET_BIND_SERVICE; } + clamp_caps(); drop_caps_ep_except(ns_caps); return 0;-- Stefano
On Thu, Oct 13, 2022 at 04:17:30AM +0200, Stefano Brivio wrote:On Tue, 11 Oct 2022 16:40:16 +1100 David Gibson <david(a)gibson.dropbear.id.au> wrote:Yeah, I couldn't think of something that was both brief and specific, so I went with brief.We drop our own capabilities, but it's possible that processes we exec() could gain extra privilege via file capabilities. It shouldn't be possible for us to exec() anyway due to seccomp() and our filesystem isolation. But just in case, zero the bounding and inheritable capability sets to prevent any such child from gainin privilege. Note that we do this *after* spawning the pasta shell/command (if any), because we do want the user to be able to give that privilege if they want. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- isolation.c | 56 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/isolation.c b/isolation.c index 2468f84..e1a024d 100644 --- a/isolation.c +++ b/isolation.c @@ -120,6 +120,61 @@ static void drop_caps_ep_except(uint64_t keep) } } +/** + * clamp_caps() - Prevent any children from gaining caps"clamp" doesn't sound very specific or clear. caps_drop_inherit_bound() would actually tell me what the function does, but it's a bit of a mouthful in comparison. I'm fine with both, really, but if you have a better idea...Ok.+ * + * This drops all capabilities from both the inheritable and the + * bounding set. This means that any exec()ed processes can't gain + * capabilities, even if they have file capabilities which would grant + * them. We shouldn't ever exec() in any case, but this provides an + * additional layer of protection. Executing this requires + * CAP_SETPCAP, which we will have within our userns. + * + * Note that dropping capabilites from the bounding set limits + * exec()ed processes, but does not remove them from the effective or + * permitted sets, so it doesn't reduce our own capabilities. + */ +static void clamp_caps(void) +{ + struct __user_cap_header_struct hdr = { + .version = CAP_VERSION, + .pid = 0, + }; + struct __user_cap_data_struct data[CAP_WORDS];For consistency, I'd move this before hdr.Why what? We're not trying to alter the permitted or effective sets here, so we're doing a capget() first, zeroing the inheritable field, then setting it back again.+ int i; + + for (i = 0; i < 64; i++) { + /* Some errors can be ignored: + * - EINVAL, we'll get this for all values in 0..63 + * that are not actually allocated caps + * - EPERM, we'll get this if we don't have + * CAP_SETPCAP, which can happen if using + * --netns-only. We don't need CAP_SETPCAP for + * normal operation, so carry on without it. + */ + if (prctl(PR_CAPBSET_DROP, i, 0, 0, 0) && + errno != EINVAL && errno != EPERM) { + err("Couldn't drop cap %i from bounding set: %s", + i, strerror(errno)); + exit(EXIT_FAILURE); + } + } + + if (syscall(SYS_capget, &hdr, data)) { + err("Couldn't get current capabilities: %s", strerror(errno)); + exit(EXIT_FAILURE); + } + + for (i = 0; i < CAP_WORDS; i++) + data[i].inheritable = 0;Any specific reason why? Initialisers can have variable sizes to some extent, but if there's a reason why it can't be done, perhaps that would warrant a comment here.-- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson+ + if (syscall(SYS_capset, &hdr, data)) { + err("Couldn't drop inheritable capabilities: %s", + strerror(errno)); + exit(EXIT_FAILURE); + } +} + /** * isolate_initial() - Early, config independent self isolation * @@ -287,6 +342,7 @@ int isolate_prefork(struct ctx *c) ns_caps |= 1UL << CAP_NET_BIND_SERVICE; } + clamp_caps(); drop_caps_ep_except(ns_caps); return 0;
On Thu, 13 Oct 2022 20:33:34 +1100 David Gibson <david(a)gibson.dropbear.id.au> wrote:On Thu, Oct 13, 2022 at 04:17:30AM +0200, Stefano Brivio wrote:Oops, never mind, of course, I missed the capget() for a moment. -- StefanoOn Tue, 11 Oct 2022 16:40:16 +1100 David Gibson <david(a)gibson.dropbear.id.au> wrote:Yeah, I couldn't think of something that was both brief and specific, so I went with brief.We drop our own capabilities, but it's possible that processes we exec() could gain extra privilege via file capabilities. It shouldn't be possible for us to exec() anyway due to seccomp() and our filesystem isolation. But just in case, zero the bounding and inheritable capability sets to prevent any such child from gainin privilege. Note that we do this *after* spawning the pasta shell/command (if any), because we do want the user to be able to give that privilege if they want. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- isolation.c | 56 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/isolation.c b/isolation.c index 2468f84..e1a024d 100644 --- a/isolation.c +++ b/isolation.c @@ -120,6 +120,61 @@ static void drop_caps_ep_except(uint64_t keep) } } +/** + * clamp_caps() - Prevent any children from gaining caps"clamp" doesn't sound very specific or clear. caps_drop_inherit_bound() would actually tell me what the function does, but it's a bit of a mouthful in comparison. I'm fine with both, really, but if you have a better idea...Ok.+ * + * This drops all capabilities from both the inheritable and the + * bounding set. This means that any exec()ed processes can't gain + * capabilities, even if they have file capabilities which would grant + * them. We shouldn't ever exec() in any case, but this provides an + * additional layer of protection. Executing this requires + * CAP_SETPCAP, which we will have within our userns. + * + * Note that dropping capabilites from the bounding set limits + * exec()ed processes, but does not remove them from the effective or + * permitted sets, so it doesn't reduce our own capabilities. + */ +static void clamp_caps(void) +{ + struct __user_cap_header_struct hdr = { + .version = CAP_VERSION, + .pid = 0, + }; + struct __user_cap_data_struct data[CAP_WORDS];For consistency, I'd move this before hdr.Why what? We're not trying to alter the permitted or effective sets here, so we're doing a capget() first, zeroing the inheritable field, then setting it back again.+ int i; + + for (i = 0; i < 64; i++) { + /* Some errors can be ignored: + * - EINVAL, we'll get this for all values in 0..63 + * that are not actually allocated caps + * - EPERM, we'll get this if we don't have + * CAP_SETPCAP, which can happen if using + * --netns-only. We don't need CAP_SETPCAP for + * normal operation, so carry on without it. + */ + if (prctl(PR_CAPBSET_DROP, i, 0, 0, 0) && + errno != EINVAL && errno != EPERM) { + err("Couldn't drop cap %i from bounding set: %s", + i, strerror(errno)); + exit(EXIT_FAILURE); + } + } + + if (syscall(SYS_capget, &hdr, data)) { + err("Couldn't get current capabilities: %s", strerror(errno)); + exit(EXIT_FAILURE); + } + + for (i = 0; i < CAP_WORDS; i++) + data[i].inheritable = 0;Any specific reason why? Initialisers can have variable sizes to some extent, but if there's a reason why it can't be done, perhaps that would warrant a comment here.
When in passt mode, or pasta mode spawning a command, we create a userns for ourselves. This is used both to isolate the pasta/passt process itself and to run the spawned command, if any. Since eed17a47 "Handle userns isolation and dropping root at the same time" we've handled both cases the same, configuring the UID and GID mappings in the new userns to map whichever UID we're running as to root within the userns. This mapping is desirable when spawning a shell or other command, so that the user gets a root shell with reasonably clear abilities within the userns and netns. It's not necessarily essential, though. When not spawning a shell, it doesn't really have any purpose: passt itself doesn't need to be root and can operate fine with an unmapped user (using some of the capabilities we get when entering the userns instead). Configuring the uid_map can cause problems if passt is running with any capabilities in the initial namespace, such as CAP_NET_BIND_SERVICE to allow it to forward low ports. In this case the kernel makes files in /proc/pid owned by root rather than the starting user to prevent the user from interfering with the operation of the capability-enhanced process. This includes uid_map meaning we are not able to write to it. Whether this behaviour is correct in the kernel is debatable, but in any case we might as well avoid problems by only initializing the user mappings when we really want them. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- conf.c | 3 ++- isolation.c | 13 ------------- pasta.c | 16 ++++++++++++++-- pasta.h | 3 ++- 4 files changed, 18 insertions(+), 17 deletions(-) diff --git a/conf.c b/conf.c index 1537dbf..b7661b6 100644 --- a/conf.c +++ b/conf.c @@ -1478,7 +1478,8 @@ void conf(struct ctx *c, int argc, char **argv) if (*netns) { pasta_open_ns(c, netns); } else { - pasta_start_ns(c, argc - optind, argv + optind); + pasta_start_ns(c, uid, gid, + argc - optind, argv + optind); } } diff --git a/isolation.c b/isolation.c index e1a024d..b94226d 100644 --- a/isolation.c +++ b/isolation.c @@ -207,9 +207,6 @@ void isolate_initial(void) */ void isolate_user(uid_t uid, gid_t gid, bool use_userns, const char *userns) { - char uidmap[BUFSIZ]; - char gidmap[BUFSIZ]; - /* First set our UID & GID in the original namespace */ if (setgroups(0, NULL)) { /* If we don't have CAP_SETGID, this will EPERM */ @@ -261,16 +258,6 @@ void isolate_user(uid_t uid, gid_t gid, bool use_userns, const char *userns) err("Couldn't create user namespace: %s", strerror(errno)); exit(EXIT_FAILURE); } - - /* Configure user and group mappings */ - snprintf(uidmap, BUFSIZ, "0 %u 1", uid); - snprintf(gidmap, BUFSIZ, "0 %u 1", gid); - - if (write_file("/proc/self/uid_map", uidmap) || - write_file("/proc/self/setgroups", "deny") || - write_file("/proc/self/gid_map", gidmap)) { - warn("Couldn't configure user namespace"); - } } /** diff --git a/pasta.c b/pasta.c index 0ab2fe4..9666fed 100644 --- a/pasta.c +++ b/pasta.c @@ -166,7 +166,6 @@ static int pasta_setup_ns(void *arg) { const struct pasta_setup_ns_arg *a = (const struct pasta_setup_ns_arg *)arg; - if (write_file("/proc/sys/net/ipv4/ping_group_range", "0 0")) warn("Cannot set ping_group_range, ICMP requests might fail"); @@ -179,16 +178,20 @@ static int pasta_setup_ns(void *arg) /** * pasta_start_ns() - Fork command in new namespace if target ns is not given * @c: Execution context + * @uid: UID we're running as in the init namespace + * @gid: GID we're running as in the init namespace * @argc: Number of arguments for spawned command * @argv: Command to spawn and arguments */ -void pasta_start_ns(struct ctx *c, int argc, char *argv[]) +void pasta_start_ns(struct ctx *c, uid_t uid, gid_t gid, + int argc, char *argv[]) { struct pasta_setup_ns_arg arg = { .exe = argv[0], .argv = argv, }; char *sh_argv[] = { NULL, NULL }; + char uidmap[BUFSIZ], gidmap[BUFSIZ]; char ns_fn_stack[NS_FN_STACK_SIZE]; char sh_arg0[PATH_MAX + 1]; @@ -196,7 +199,16 @@ void pasta_start_ns(struct ctx *c, int argc, char *argv[]) if (!c->debug) c->quiet = 1; + /* Configure user and group mappings */ + snprintf(uidmap, BUFSIZ, "0 %u 1", uid); + snprintf(gidmap, BUFSIZ, "0 %u 1", gid); + if (write_file("/proc/self/uid_map", uidmap) || + write_file("/proc/self/setgroups", "deny") || + write_file("/proc/self/gid_map", gidmap)) { + warn("Couldn't configure user mappings"); + } + if (argc == 0) { arg.exe = getenv("SHELL"); if (!arg.exe) diff --git a/pasta.h b/pasta.h index 02df1f6..a8b9893 100644 --- a/pasta.h +++ b/pasta.h @@ -7,7 +7,8 @@ #define PASTA_H void pasta_open_ns(struct ctx *c, const char *netns); -void pasta_start_ns(struct ctx *c, int argc, char *argv[]); +void pasta_start_ns(struct ctx *c, uid_t uid, gid_t gid, + int argc, char *argv[]); void pasta_ns_conf(struct ctx *c); void pasta_child_handler(int signal); int pasta_netns_quit_init(struct ctx *c); -- 2.37.3
On Tue, 11 Oct 2022 16:40:17 +1100 David Gibson <david(a)gibson.dropbear.id.au> wrote:When in passt mode, or pasta mode spawning a command, we create a userns for ourselves. This is used both to isolate the pasta/passt process itself and to run the spawned command, if any. Since eed17a47 "Handle userns isolation and dropping root at the same time" we've handled both cases the same, configuring the UID and GID mappings in the new userns to map whichever UID we're running as to root within the userns. This mapping is desirable when spawning a shell or other command, so that the user gets a root shell with reasonably clear abilities within the userns and netns. It's not necessarily essential, though. When not spawning a shell, it doesn't really have any purpose: passt itself doesn't need to be root and can operate fine with an unmapped user (using some of the capabilities we get when entering the userns instead). Configuring the uid_map can cause problems if passt is running with any capabilities in the initial namespace, such as CAP_NET_BIND_SERVICE to allow it to forward low ports. In this case the kernel makes files in /proc/pid owned by root rather than the starting user to prevent the user from interfering with the operation of the capability-enhanced process. This includes uid_map meaning we are not able to write to it. Whether this behaviour is correct in the kernel is debatable, but in any case we might as well avoid problems by only initializing the user mappings when we really want them. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- conf.c | 3 ++- isolation.c | 13 ------------- pasta.c | 16 ++++++++++++++-- pasta.h | 3 ++- 4 files changed, 18 insertions(+), 17 deletions(-) diff --git a/conf.c b/conf.c index 1537dbf..b7661b6 100644 --- a/conf.c +++ b/conf.c @@ -1478,7 +1478,8 @@ void conf(struct ctx *c, int argc, char **argv) if (*netns) { pasta_open_ns(c, netns); } else { - pasta_start_ns(c, argc - optind, argv + optind); + pasta_start_ns(c, uid, gid, + argc - optind, argv + optind); } } diff --git a/isolation.c b/isolation.c index e1a024d..b94226d 100644 --- a/isolation.c +++ b/isolation.c @@ -207,9 +207,6 @@ void isolate_initial(void) */ void isolate_user(uid_t uid, gid_t gid, bool use_userns, const char *userns) { - char uidmap[BUFSIZ]; - char gidmap[BUFSIZ]; - /* First set our UID & GID in the original namespace */ if (setgroups(0, NULL)) { /* If we don't have CAP_SETGID, this will EPERM */ @@ -261,16 +258,6 @@ void isolate_user(uid_t uid, gid_t gid, bool use_userns, const char *userns) err("Couldn't create user namespace: %s", strerror(errno)); exit(EXIT_FAILURE); } - - /* Configure user and group mappings */ - snprintf(uidmap, BUFSIZ, "0 %u 1", uid); - snprintf(gidmap, BUFSIZ, "0 %u 1", gid); - - if (write_file("/proc/self/uid_map", uidmap) || - write_file("/proc/self/setgroups", "deny") || - write_file("/proc/self/gid_map", gidmap)) { - warn("Couldn't configure user namespace"); - } } /** diff --git a/pasta.c b/pasta.c index 0ab2fe4..9666fed 100644 --- a/pasta.c +++ b/pasta.c @@ -166,7 +166,6 @@ static int pasta_setup_ns(void *arg) { const struct pasta_setup_ns_arg *a = (const struct pasta_setup_ns_arg *)arg; -Unrelated.if (write_file("/proc/sys/net/ipv4/ping_group_range", "0 0")) warn("Cannot set ping_group_range, ICMP requests might fail"); @@ -179,16 +178,20 @@ static int pasta_setup_ns(void *arg) /** * pasta_start_ns() - Fork command in new namespace if target ns is not given * @c: Execution context + * @uid: UID we're running as in the init namespace + * @gid: GID we're running as in the init namespace * @argc: Number of arguments for spawned command * @argv: Command to spawn and arguments */ -void pasta_start_ns(struct ctx *c, int argc, char *argv[]) +void pasta_start_ns(struct ctx *c, uid_t uid, gid_t gid, + int argc, char *argv[]) { struct pasta_setup_ns_arg arg = { .exe = argv[0], .argv = argv, }; char *sh_argv[] = { NULL, NULL }; + char uidmap[BUFSIZ], gidmap[BUFSIZ];Should go at the beginning for the usual semi-silly reason.char ns_fn_stack[NS_FN_STACK_SIZE]; char sh_arg0[PATH_MAX + 1]; @@ -196,7 +199,16 @@ void pasta_start_ns(struct ctx *c, int argc, char *argv[]) if (!c->debug) c->quiet = 1; + /* Configure user and group mappings */ + snprintf(uidmap, BUFSIZ, "0 %u 1", uid); + snprintf(gidmap, BUFSIZ, "0 %u 1", gid); + if (write_file("/proc/self/uid_map", uidmap) || + write_file("/proc/self/setgroups", "deny") || + write_file("/proc/self/gid_map", gidmap)) { + warn("Couldn't configure user mappings"); + } +Excess whitespace.if (argc == 0) { arg.exe = getenv("SHELL"); if (!arg.exe) diff --git a/pasta.h b/pasta.h index 02df1f6..a8b9893 100644 --- a/pasta.h +++ b/pasta.h @@ -7,7 +7,8 @@ #define PASTA_H void pasta_open_ns(struct ctx *c, const char *netns); -void pasta_start_ns(struct ctx *c, int argc, char *argv[]); +void pasta_start_ns(struct ctx *c, uid_t uid, gid_t gid, + int argc, char *argv[]); void pasta_ns_conf(struct ctx *c); void pasta_child_handler(int signal); int pasta_netns_quit_init(struct ctx *c);-- Stefano
On Thu, Oct 13, 2022 at 04:18:29AM +0200, Stefano Brivio wrote:On Tue, 11 Oct 2022 16:40:17 +1100 David Gibson <david(a)gibson.dropbear.id.au> wrote:Removed.When in passt mode, or pasta mode spawning a command, we create a userns for ourselves. This is used both to isolate the pasta/passt process itself and to run the spawned command, if any. Since eed17a47 "Handle userns isolation and dropping root at the same time" we've handled both cases the same, configuring the UID and GID mappings in the new userns to map whichever UID we're running as to root within the userns. This mapping is desirable when spawning a shell or other command, so that the user gets a root shell with reasonably clear abilities within the userns and netns. It's not necessarily essential, though. When not spawning a shell, it doesn't really have any purpose: passt itself doesn't need to be root and can operate fine with an unmapped user (using some of the capabilities we get when entering the userns instead). Configuring the uid_map can cause problems if passt is running with any capabilities in the initial namespace, such as CAP_NET_BIND_SERVICE to allow it to forward low ports. In this case the kernel makes files in /proc/pid owned by root rather than the starting user to prevent the user from interfering with the operation of the capability-enhanced process. This includes uid_map meaning we are not able to write to it. Whether this behaviour is correct in the kernel is debatable, but in any case we might as well avoid problems by only initializing the user mappings when we really want them. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- conf.c | 3 ++- isolation.c | 13 ------------- pasta.c | 16 ++++++++++++++-- pasta.h | 3 ++- 4 files changed, 18 insertions(+), 17 deletions(-) diff --git a/conf.c b/conf.c index 1537dbf..b7661b6 100644 --- a/conf.c +++ b/conf.c @@ -1478,7 +1478,8 @@ void conf(struct ctx *c, int argc, char **argv) if (*netns) { pasta_open_ns(c, netns); } else { - pasta_start_ns(c, argc - optind, argv + optind); + pasta_start_ns(c, uid, gid, + argc - optind, argv + optind); } } diff --git a/isolation.c b/isolation.c index e1a024d..b94226d 100644 --- a/isolation.c +++ b/isolation.c @@ -207,9 +207,6 @@ void isolate_initial(void) */ void isolate_user(uid_t uid, gid_t gid, bool use_userns, const char *userns) { - char uidmap[BUFSIZ]; - char gidmap[BUFSIZ]; - /* First set our UID & GID in the original namespace */ if (setgroups(0, NULL)) { /* If we don't have CAP_SETGID, this will EPERM */ @@ -261,16 +258,6 @@ void isolate_user(uid_t uid, gid_t gid, bool use_userns, const char *userns) err("Couldn't create user namespace: %s", strerror(errno)); exit(EXIT_FAILURE); } - - /* Configure user and group mappings */ - snprintf(uidmap, BUFSIZ, "0 %u 1", uid); - snprintf(gidmap, BUFSIZ, "0 %u 1", gid); - - if (write_file("/proc/self/uid_map", uidmap) || - write_file("/proc/self/setgroups", "deny") || - write_file("/proc/self/gid_map", gidmap)) { - warn("Couldn't configure user namespace"); - } } /** diff --git a/pasta.c b/pasta.c index 0ab2fe4..9666fed 100644 --- a/pasta.c +++ b/pasta.c @@ -166,7 +166,6 @@ static int pasta_setup_ns(void *arg) { const struct pasta_setup_ns_arg *a = (const struct pasta_setup_ns_arg *)arg; -Unrelated.Done.if (write_file("/proc/sys/net/ipv4/ping_group_range", "0 0")) warn("Cannot set ping_group_range, ICMP requests might fail"); @@ -179,16 +178,20 @@ static int pasta_setup_ns(void *arg) /** * pasta_start_ns() - Fork command in new namespace if target ns is not given * @c: Execution context + * @uid: UID we're running as in the init namespace + * @gid: GID we're running as in the init namespace * @argc: Number of arguments for spawned command * @argv: Command to spawn and arguments */ -void pasta_start_ns(struct ctx *c, int argc, char *argv[]) +void pasta_start_ns(struct ctx *c, uid_t uid, gid_t gid, + int argc, char *argv[]) { struct pasta_setup_ns_arg arg = { .exe = argv[0], .argv = argv, }; char *sh_argv[] = { NULL, NULL }; + char uidmap[BUFSIZ], gidmap[BUFSIZ];Should go at the beginning for the usual semi-silly reason.Fixed.char ns_fn_stack[NS_FN_STACK_SIZE]; char sh_arg0[PATH_MAX + 1]; @@ -196,7 +199,16 @@ void pasta_start_ns(struct ctx *c, int argc, char *argv[]) if (!c->debug) c->quiet = 1; + /* Configure user and group mappings */ + snprintf(uidmap, BUFSIZ, "0 %u 1", uid); + snprintf(gidmap, BUFSIZ, "0 %u 1", gid); + if (write_file("/proc/self/uid_map", uidmap) || + write_file("/proc/self/setgroups", "deny") || + write_file("/proc/self/gid_map", gidmap)) { + warn("Couldn't configure user mappings"); + } +Excess whitespace.-- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibsonif (argc == 0) { arg.exe = getenv("SHELL"); if (!arg.exe) diff --git a/pasta.h b/pasta.h index 02df1f6..a8b9893 100644 --- a/pasta.h +++ b/pasta.h @@ -7,7 +7,8 @@ #define PASTA_H void pasta_open_ns(struct ctx *c, const char *netns); -void pasta_start_ns(struct ctx *c, int argc, char *argv[]); +void pasta_start_ns(struct ctx *c, uid_t uid, gid_t gid, + int argc, char *argv[]); void pasta_ns_conf(struct ctx *c); void pasta_child_handler(int signal); int pasta_netns_quit_init(struct ctx *c);
pasta_setup_ns() no longer has much to do with setting up a namespace. Instead it's really about starting the shell or other command we want to run with pasta connectivity. Rename it and its argument structure to be less misleading. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- pasta.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/pasta.c b/pasta.c index 9666fed..23dc6d6 100644 --- a/pasta.c +++ b/pasta.c @@ -147,25 +147,26 @@ void pasta_open_ns(struct ctx *c, const char *netns) } /** - * struct pasta_setup_ns_arg - Argument for pasta_setup_ns() + * struct pasta_spawn_cmd_arg - Argument for pasta_spawn_cmd() * @exe: Executable to run * @argv: Command and arguments to run */ -struct pasta_setup_ns_arg { +struct pasta_spawn_cmd_arg { const char *exe; char *const *argv; }; /** - * pasta_setup_ns() - Map credentials, enable access to ping sockets, run shell - * @arg: See @pasta_setup_ns_arg + * pasta_spawn_cmd() - Prepare new netns, start command or shell + * @arg: See @pasta_spawn_cmd_arg * * Return: this function never returns */ -static int pasta_setup_ns(void *arg) +static int pasta_spawn_cmd(void *arg) { - const struct pasta_setup_ns_arg *a - = (const struct pasta_setup_ns_arg *)arg; + const struct pasta_spawn_cmd_arg *a + = (const struct pasta_spawn_cmd_arg *)arg; + if (write_file("/proc/sys/net/ipv4/ping_group_range", "0 0")) warn("Cannot set ping_group_range, ICMP requests might fail"); @@ -186,7 +187,7 @@ static int pasta_setup_ns(void *arg) void pasta_start_ns(struct ctx *c, uid_t uid, gid_t gid, int argc, char *argv[]) { - struct pasta_setup_ns_arg arg = { + struct pasta_spawn_cmd_arg arg = { .exe = argv[0], .argv = argv, }; @@ -224,7 +225,7 @@ void pasta_start_ns(struct ctx *c, uid_t uid, gid_t gid, arg.argv = sh_argv; } - pasta_child_pid = clone(pasta_setup_ns, + pasta_child_pid = clone(pasta_spawn_cmd, ns_fn_stack + sizeof(ns_fn_stack) / 2, CLONE_NEWIPC | CLONE_NEWPID | CLONE_NEWNET | CLONE_NEWUTS, -- 2.37.3
On Tue, 11 Oct 2022 16:40:08 +1100 David Gibson <david(a)gibson.dropbear.id.au> wrote:Our current handling of capabilities isn't quite right. In particular, drop_caps() attempts to remove capabilities from the bounding set, which usually won't work, and even if it does won't have the effect we want. This series corrects that, as well as making some other fixes and cleanups in adjacent code.Other than those entirely formal details I reported (...and I tried hard), this looks great to me, and feels like a relief. :) -- Stefano