The current implementation of drop_caps() doesn't really work because it attempts to drop capabilities from the bounding set. That's not the set that really matters; the bounding set is about limiting the abilities of things we might later exec() rather than our own capabilities. It also requires CAP_SETPCAP which we won't usually have. Replace it with a new version which uses capset(2) to drop capabilities from the effective and permitted sets. For now we leave the inheritable set as is, since we don't want to preclude the user from passing inheritable capabilities to the command spawned by pasta. Correctly dropping caps reveals that we were relying on some capabilities we'd supposedly dropped. Re-divide the dropping of capabilities between isolate_initial(), isolate_user() and isolate_prefork() to make this work. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- conf.c | 2 +- isolation.c | 98 ++++++++++++++++++++++++++++++++++++++++++++++++----- isolation.h | 3 +- 3 files changed, 92 insertions(+), 11 deletions(-) diff --git a/conf.c b/conf.c index 1537dbf..0be887e 100644 --- a/conf.c +++ b/conf.c @@ -1469,7 +1469,7 @@ void conf(struct ctx *c, int argc, char **argv) usage(argv[0]); } - isolate_user(uid, gid, !netns_only, userns); + isolate_user(uid, gid, !netns_only, userns, c->mode); if (c->pasta_conf_ns) c->no_ra = 1; diff --git a/isolation.c b/isolation.c index 211c26f..6d87dec 100644 --- a/isolation.c +++ b/isolation.c @@ -86,18 +86,37 @@ #include "passt.h" #include "isolation.h" +#define CAP_VERSION _LINUX_CAPABILITY_VERSION_3 +#define CAP_WORDS _LINUX_CAPABILITY_U32S_3 + /** - * drop_caps() - Drop capabilities we might have except for CAP_NET_BIND_SERVICE + * drop_caps_ep_except() - Drop capabilities from effective & permitted sets + * @keep: Capabilities to keep */ -static void drop_caps(void) +static void drop_caps_ep_except(uint64_t keep) { + struct __user_cap_header_struct hdr = { + .version = CAP_VERSION, + .pid = 0, + }; + struct __user_cap_data_struct 
data[CAP_WORDS]; int i; - for (i = 0; i < 64; i++) { - if (i == CAP_NET_BIND_SERVICE) - continue; + if (syscall(SYS_capget, &hdr, data)) { + err("Couldn't get current capabilities: %s", strerror(errno)); + exit(EXIT_FAILURE); + } - prctl(PR_CAPBSET_DROP, i, 0, 0, 0); + for (i = 0; i < CAP_WORDS; i++) { + uint32_t mask = keep >> (32 * i); + + data[i].effective &= mask; + data[i].permitted &= mask; + } + + if (syscall(SYS_capset, &hdr, data)) { + err("Couldn't drop capabilities: %s", strerror(errno)); + exit(EXIT_FAILURE); } } @@ -111,7 +130,25 @@ static void drop_caps(void) */ void isolate_initial(void) { - drop_caps(); + /* We want to keep CAP_NET_BIND_SERVICE in the initial + * namespace if we have it, so that we can forward low ports + * into the guest/namespace + * + * We have to keep CAP_SETUID and CAP_SETGID at this stage, so + * that we can switch user away from root. + * + * We have to keep some capabilities for the --netns-only case: + * - CAP_SYS_ADMIN, so that we can setns() to the netns. + * - Keep CAP_NET_ADMIN, so that we can configure interfaces + * + * It's debatable whether it's useful to drop caps when we + * retain SETUID and SYS_ADMIN, but we might as well. We drop + * further capabilities in isolate_user() and + * isolate_prefork(). 
+ */ + drop_caps_ep_except(BIT(CAP_NET_BIND_SERVICE) | + BIT(CAP_SETUID) | BIT(CAP_SETGID) | + BIT(CAP_SYS_ADMIN) | BIT(CAP_NET_ADMIN)); } /** @@ -120,6 +157,7 @@ void isolate_initial(void) * @gid: Group ID to run as (in original userns) * @use_userns: Whether to join or create a userns * @userns: userns path to enter, may be empty + * @mode: Mode (passt or pasta) * * Should: * - set our final UID and GID @@ -127,8 +165,11 @@ void isolate_initial(void) * Mustn't: * - remove filesystem access (we need that for further setup) */ -void isolate_user(uid_t uid, gid_t gid, bool use_userns, const char *userns) +void isolate_user(uid_t uid, gid_t gid, bool use_userns, const char *userns, + enum passt_modes mode) { + uint64_t ns_caps = 0; + /* First set our UID & GID in the original namespace */ if (setgroups(0, NULL)) { /* If we don't have CAP_SETGID, this will EPERM */ @@ -166,6 +207,7 @@ void isolate_user(uid_t uid, gid_t gid, bool use_userns, const char *userns) } close(ufd); + } else if (use_userns) { /* Create and join a new userns */ char uidmap[BUFSIZ]; char gidmap[BUFSIZ]; @@ -185,6 +227,31 @@ void isolate_user(uid_t uid, gid_t gid, bool use_userns, const char *userns) warn("Couldn't configure user namespace"); } } + + /* Joining a new userns gives us full capabilities; drop the + * ones we don't need. 
With --netns-only we haven't changed + * userns but we can drop more capabilities now than at + * isolate_initial() + */ + /* Keep CAP_SYS_ADMIN, so we can unshare() further in + * isolate_prefork(), pasta also needs it to setns() into the + * netns + */ + ns_caps |= BIT(CAP_SYS_ADMIN); + if (mode == MODE_PASTA) { + /* Keep CAP_NET_ADMIN, so we can configure the if */ + ns_caps |= BIT(CAP_NET_ADMIN); + /* Keep CAP_NET_BIND_SERVICE, so we can splice + * outbound connections to low port numbers + */ + ns_caps |= BIT(CAP_NET_BIND_SERVICE); + /* Keep CAP_SYS_PTRACE to join the netns of an + * existing process */ + if (*userns || !use_userns) + ns_caps |= BIT(CAP_SYS_PTRACE); + } + + drop_caps_ep_except(ns_caps); } /** @@ -203,6 +270,7 @@ void isolate_user(uid_t uid, gid_t gid, bool use_userns, const char *userns) int isolate_prefork(struct ctx *c) { int flags = CLONE_NEWIPC | CLONE_NEWNS | CLONE_NEWUTS; + uint64_t ns_caps = 0; /* If we run in foreground, we have no chance to actually move to a new * PID namespace. For passt, use CLONE_NEWPID anyway, in case somebody @@ -243,7 +311,19 @@ int isolate_prefork(struct ctx *c) return -errno; } - drop_caps(); /* Relative to the new user namespace this time. 
*/ + /* Now that initialization is more-or-less complete, we can + * drop further capabilities + */ + if (c->mode == MODE_PASTA) { + /* Keep CAP_SYS_ADMIN, so we can enter the netns */ + ns_caps |= BIT(CAP_SYS_ADMIN); + /* Keep CAP_NET_BIND_SERVICE, so we can splice + * outbound connections to low port numbers + */ + ns_caps |= BIT(CAP_NET_BIND_SERVICE); + } + + drop_caps_ep_except(ns_caps); return 0; } diff --git a/isolation.h b/isolation.h index 70a38f9..54c60f6 100644 --- a/isolation.h +++ b/isolation.h @@ -8,7 +8,8 @@ #define ISOLATION_H void isolate_initial(void); -void isolate_user(uid_t uid, gid_t gid, bool use_userns, const char *userns); +void isolate_user(uid_t uid, gid_t gid, bool use_userns, const char *userns, + enum passt_modes mode); int isolate_prefork(struct ctx *c); void isolate_postfork(const struct ctx *c); -- 2.37.3