This series adds a proof-of-concept for Kata Containers using passt as an endpoint for user-mode networking, and related prerequisites. Stefano Brivio (4): passt, tap: Daemonise once socket is ready without waiting for connection qemu: Rebase patch for UNIX domain socket support to latest upstream libvirt, qemu: Move patches to new directory, contrib contrib: Introduce PoC for Kata Containers with user-mode networking ...gent-Add-passt-networking-model-and-.patch | 462 ++++++++++++++++++ contrib/kata-containers/README.md | 302 ++++++++++++ ...upport-for-UNIX-domain-socket-as-qem.patch | 0 ...NIX-domain-sockets-to-be-used-as-net.patch | 63 +-- ...e-EINVAL-on-netdev-socket-connection.patch | 0 passt.c | 6 +- tap.c | 62 ++- tap.h | 2 +- tcp.c | 2 +- 9 files changed, 833 insertions(+), 66 deletions(-) create mode 100644 contrib/kata-containers/0001-virtcontainers-agent-Add-passt-networking-model-and-.patch create mode 100644 contrib/kata-containers/README.md rename {libvirt => contrib/libvirt}/0001-conf-Introduce-support-for-UNIX-domain-socket-as-qem.patch (100%) rename {qemu => contrib/qemu}/0001-net-Allow-also-UNIX-domain-sockets-to-be-used-as-net.patch (71%) rename {qemu => contrib/qemu}/0002-net-Don-t-ignore-EINVAL-on-netdev-socket-connection.patch (100%) -- 2.33.0
The existing behaviour is not really practical: an automated agent in charge of starting both qemu and passt would need to fork itself to start passt, because passt won't fork to background until qemu connects, and the agent needs to unblock to start qemu. Instead of waiting for a connection to daemonise, do it right away as soon as a socket is available: that can be considered an initialised state already. Signed-off-by: Stefano Brivio <sbrivio(a)redhat.com> --- passt.c | 6 ++++-- tap.c | 62 +++++++++++++++++++++++++++++++++++++++++---------------- tap.h | 2 +- tcp.c | 2 +- 4 files changed, 51 insertions(+), 21 deletions(-) diff --git a/passt.c b/passt.c index 3c9fb90..a8bb88e 100644 --- a/passt.c +++ b/passt.c @@ -345,6 +345,7 @@ int main(int argc, char **argv) } sock_probe_mem(&c); + c.fd_tap = c.fd_tap_listen = -1; tap_sock_init(&c); clock_gettime(CLOCK_MONOTONIC, &now); @@ -387,9 +388,10 @@ loop: for (i = 0; i < nfds; i++) { union epoll_ref ref = *((union epoll_ref *)&events[i].data.u64); + int fd = events[i].data.fd; - if (events[i].data.fd == c.fd_tap) - tap_handler(&c, events[i].events, &now); + if (fd == c.fd_tap || fd == c.fd_tap_listen) + tap_handler(&c, fd, events[i].events, &now); else sock_handler(&c, ref, events[i].events, &now); } diff --git a/tap.c b/tap.c index 2bf6f71..34a5705 100644 --- a/tap.c +++ b/tap.c @@ -769,7 +769,7 @@ restart: } /** - * tap_sock_init_unix() - Create and bind AF_UNIX socket, wait for connection + * tap_sock_init_unix() - Create and bind AF_UNIX socket, listen for connection * @c: Execution context * * #syscalls:passt unlink|unlinkat @@ -777,19 +777,21 @@ restart: static void tap_sock_init_unix(struct ctx *c) { int fd = socket(AF_UNIX, SOCK_STREAM, 0), ex; + struct epoll_event ev = { 0 }; struct sockaddr_un addr = { .sun_family = AF_UNIX, }; - int i, ret, v = INT_MAX / 2; + int i, ret; - if (c->fd_tap_listen) + if (c->fd_tap_listen != -1) { + epoll_ctl(c->epollfd, EPOLL_CTL_DEL, c->fd_tap_listen, &ev); 
close(c->fd_tap_listen); + } if (fd < 0) { perror("UNIX socket"); exit(EXIT_FAILURE); } - c->fd_tap_listen = fd; for (i = 1; i < UNIX_SOCK_MAX; i++) { char *path = addr.sun_path; @@ -836,6 +838,10 @@ static void tap_sock_init_unix(struct ctx *c) listen(fd, 0); + ev.data.fd = c->fd_tap_listen = fd; + ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP; + epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap_listen, &ev); + info("You can now start qrap:"); info(" ./qrap 5 kvm ... -net socket,fd=5 -net nic,model=virtio"); info("or directly qemu, patched with:"); @@ -843,14 +849,32 @@ static void tap_sock_init_unix(struct ctx *c) info("as follows:"); info(" kvm ... -net socket,connect=%s -net nic,model=virtio", addr.sun_path); +} - c->fd_tap = accept(fd, NULL, NULL); +/** + * tap_sock_accept_unix() - Accept connection on listening socket + * @c: Execution context + */ +static void tap_sock_accept_unix(struct ctx *c) +{ + struct epoll_event ev = { 0 }; + int v = INT_MAX / 2; + + c->fd_tap = accept(c->fd_tap_listen, NULL, NULL); + + epoll_ctl(c->epollfd, EPOLL_CTL_DEL, c->fd_tap_listen, &ev); + close(c->fd_tap_listen); + c->fd_tap_listen = -1; if (!c->low_rmem) setsockopt(c->fd_tap, SOL_SOCKET, SO_RCVBUF, &v, sizeof(v)); if (!c->low_wmem) setsockopt(c->fd_tap, SOL_SOCKET, SO_SNDBUF, &v, sizeof(v)); + + ev.data.fd = c->fd_tap; + ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP; + epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap, &ev); } static int tun_ns_fd = -1; @@ -885,6 +909,8 @@ static int tap_ns_tun(void *arg) */ static void tap_sock_init_tun(struct ctx *c) { + struct epoll_event ev = { 0 }; + NS_CALL(tap_ns_tun, c); if (tun_ns_fd == -1) { err("Failed to open tun socket in namespace"); @@ -896,6 +922,10 @@ static void tap_sock_init_tun(struct ctx *c) pcap_init(c, c->pasta_netns_fd); c->fd_tap = tun_ns_fd; + + ev.data.fd = c->fd_tap; + ev.events = EPOLLIN | EPOLLRDHUP; + epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap, &ev); } /** @@ -904,33 +934,31 @@ static void tap_sock_init_tun(struct ctx 
*c) */ void tap_sock_init(struct ctx *c) { - struct epoll_event ev = { 0 }; - - if (c->fd_tap) { + if (c->fd_tap != -1) { epoll_ctl(c->epollfd, EPOLL_CTL_DEL, c->fd_tap, NULL); close(c->fd_tap); } - if (c->mode == MODE_PASST) { + if (c->mode == MODE_PASST) tap_sock_init_unix(c); - ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP; - } else { + else tap_sock_init_tun(c); - ev.events = EPOLLIN | EPOLLRDHUP; - } - - ev.data.fd = c->fd_tap; - epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap, &ev); } /** * tap_handler() - Packet handler for AF_UNIX or tuntap file descriptor * @c: Execution context + * @fd: File descriptor where event occurred * @events: epoll events * @now: Current timestamp */ -void tap_handler(struct ctx *c, uint32_t events, struct timespec *now) +void tap_handler(struct ctx *c, int fd, uint32_t events, struct timespec *now) { + if (fd == c->fd_tap_listen && events == EPOLLIN) { + tap_sock_accept_unix(c); + return; + } + if (events & (EPOLLRDHUP | EPOLLHUP | EPOLLERR)) goto fail; diff --git a/tap.h b/tap.h index c437f5f..8942fcf 100644 --- a/tap.h +++ b/tap.h @@ -6,5 +6,5 @@ void tap_ip_send(struct ctx *c, struct in6_addr *src, uint8_t proto, char *in, size_t len, uint32_t flow); int tap_send(struct ctx *c, void *data, size_t len, int vnet_pre); -void tap_handler(struct ctx *c, uint32_t events, struct timespec *now); +void tap_handler(struct ctx *c, int fd, uint32_t events, struct timespec *now); void tap_sock_init(struct ctx *c); diff --git a/tcp.c b/tcp.c index 01f09e9..7122898 100644 --- a/tcp.c +++ b/tcp.c @@ -2319,7 +2319,7 @@ recvmsg: if (errno == EAGAIN || errno == EWOULDBLOCK) return 0; - tap_handler(c, EPOLLERR, now); + tap_handler(c, c->fd_tap, EPOLLERR, now); } i--; -- 2.33.0
Signed-off-by: Stefano Brivio <sbrivio(a)redhat.com> --- ...NIX-domain-sockets-to-be-used-as-net.patch | 63 ++++++------------- 1 file changed, 18 insertions(+), 45 deletions(-) diff --git a/qemu/0001-net-Allow-also-UNIX-domain-sockets-to-be-used-as-net.patch b/qemu/0001-net-Allow-also-UNIX-domain-sockets-to-be-used-as-net.patch index 23cda71..9e71f88 100644 --- a/qemu/0001-net-Allow-also-UNIX-domain-sockets-to-be-used-as-net.patch +++ b/qemu/0001-net-Allow-also-UNIX-domain-sockets-to-be-used-as-net.patch @@ -1,7 +1,6 @@ -From ba51349d353f11e05c6341a7e065f2ade3874c68 Mon Sep 17 00:00:00 2001 -Message-Id: <ba51349d353f11e05c6341a7e065f2ade3874c68.1619091389.git.sbrivio(a)redhat.com> +From 83c3f76b8fe6b4a6bb45dcf5cfad65ec6f98a10e Mon Sep 17 00:00:00 2001 From: Stefano Brivio <sbrivio(a)redhat.com> -Date: Wed, 21 Apr 2021 18:51:18 +0200 +Date: Wed, 26 Jan 2022 16:45:15 +0100 Subject: [PATCH 1/2] net: Allow also UNIX domain sockets to be used as -netdev socket @@ -10,18 +9,17 @@ and the adaptation is trivial. 
Signed-off-by: Stefano Brivio <sbrivio(a)redhat.com> --- -SPDX-FileCopyrightText: 2020-2021 Red Hat GmbH <sbrivio(a)redhat.com> +SPDX-FileCopyrightText: 2020-2022 Red Hat GmbH <sbrivio(a)redhat.com> SPDX-License-Identifier: AGPL-3.0-or-later - net/socket.c | 106 ++++++++++++++++++++++++++++++++++++++++-------- - qemu-options.hx | 12 +++--- - 2 files changed, 94 insertions(+), 24 deletions(-) + net/socket.c | 106 ++++++++++++++++++++++++++++++++++++++++++--------- + 1 file changed, 87 insertions(+), 19 deletions(-) diff --git a/net/socket.c b/net/socket.c -index 15b410e8d825..aadd11dae2b3 100644 +index 2e5f3ac923..b901e22836 100644 --- a/net/socket.c +++ b/net/socket.c -@@ -511,26 +511,60 @@ static int net_socket_listen_init(NetClientState *peer, +@@ -511,26 +511,59 @@ static int net_socket_listen_init(NetClientState *peer, { NetClientState *nc; NetSocketState *s; @@ -41,6 +39,7 @@ index 15b410e8d825..aadd11dae2b3 100644 - if (parse_host_port(&saddr, host_str, errp) < 0) { - return -1; +- } + pf = PF_INET; + saddr_size = sizeof(*saddr_in); +#ifndef WIN32 @@ -56,16 +55,15 @@ index 15b410e8d825..aadd11dae2b3 100644 + error_setg_errno(errp, errno, "path provided is not a socket"); + return -1; + } -+ + +- fd = qemu_socket(PF_INET, SOCK_STREAM, 0); + saddr_un->sun_family = PF_UNIX; + strncpy(saddr_un->sun_path, host_str, sizeof(saddr_un->sun_path)); + + pf = PF_UNIX; + saddr_size = sizeof(*saddr_un); - } ++ } +#endif /* !WIN32 */ - -- fd = qemu_socket(PF_INET, SOCK_STREAM, 0); + fd = qemu_socket(pf, SOCK_STREAM, 0); if (fd < 0) { error_setg_errno(errp, errno, "can't create stream socket"); @@ -91,7 +89,7 @@ index 15b410e8d825..aadd11dae2b3 100644 closesocket(fd); return -1; } -@@ -559,14 +593,44 @@ static int net_socket_connect_init(NetClientState *peer, +@@ -559,14 +592,43 @@ static int net_socket_connect_init(NetClientState *peer, Error **errp) { NetSocketState *s; @@ -103,15 +101,14 @@ index 15b410e8d825..aadd11dae2b3 100644 + struct sockaddr_in *saddr_in = (struct 
sockaddr_in *)&saddr; +#ifndef WIN32 + struct sockaddr_un *saddr_un = (struct sockaddr_un *)&saddr; - -- if (parse_host_port(&saddr, host_str, errp) < 0) { -- return -1; + + if (strchr(host_str, ':')) { +#endif + if (parse_host_port(saddr_in, host_str, errp) < 0) + return -1; -+ + +- if (parse_host_port(&saddr, host_str, errp) < 0) { +- return -1; + pf = PF_INET; + saddr_size = sizeof(*saddr_in); +#ifndef WIN32 @@ -141,7 +138,7 @@ index 15b410e8d825..aadd11dae2b3 100644 if (fd < 0) { error_setg_errno(errp, errno, "can't create stream socket"); return -1; -@@ -575,7 +639,7 @@ static int net_socket_connect_init(NetClientState *peer, +@@ -575,7 +637,7 @@ static int net_socket_connect_init(NetClientState *peer, connected = 0; for(;;) { @@ -150,7 +147,7 @@ index 15b410e8d825..aadd11dae2b3 100644 if (ret < 0) { if (errno == EINTR || errno == EWOULDBLOCK) { /* continue */ -@@ -598,9 +662,15 @@ static int net_socket_connect_init(NetClientState *peer, +@@ -597,9 +659,15 @@ static int net_socket_connect_init(NetClientState *peer, return -1; } @@ -169,30 +166,6 @@ index 15b410e8d825..aadd11dae2b3 100644 return 0; } -diff --git a/qemu-options.hx b/qemu-options.hx -index fd21002bd61d..625a31dcdbc8 100644 ---- a/qemu-options.hx -+++ b/qemu-options.hx -@@ -2847,13 +2847,13 @@ SRST - #connect a TAP device to bridge qemubr0 - |qemu_system| linux.img -netdev bridge,br=qemubr0,id=n1 -device virtio-net,netdev=n1 - --``-netdev socket,id=id[,fd=h][,listen=[host]:port][,connect=host:port]`` -+``-netdev socket,id=id[,fd=h][,listen=[host]:port|path][,connect=host:port|path]`` - This host network backend can be used to connect the guest's network -- to another QEMU virtual machine using a TCP socket connection. If -- ``listen`` is specified, QEMU waits for incoming connections on port -- (host is optional). ``connect`` is used to connect to another QEMU -- instance using the ``listen`` option. ``fd``\ =h specifies an -- already opened TCP socket. 
-+ to another QEMU virtual machine using a TCP or a UNIX domain socket -+ connection. If ``listen`` is specified, QEMU waits for incoming -+ connections on port (host is optional), or on path. ``connect`` is used -+ to connect to another QEMU instance using the ``listen`` option. -+ ``fd``\ =h specifies an already opened TCP or UNIX domain socket. - - Example: - -- -2.29.2 +2.28.0 -- 2.33.0
I'm about to add a new adaptation carrying out-of-tree patches for a Kata Containers PoC -- move the existing out-of-tree patches to their own directory to keep things easy to find in the main one. Signed-off-by: Stefano Brivio <sbrivio(a)redhat.com> --- ...001-conf-Introduce-support-for-UNIX-domain-socket-as-qem.patch | 0 ...001-net-Allow-also-UNIX-domain-sockets-to-be-used-as-net.patch | 0 ...0002-net-Don-t-ignore-EINVAL-on-netdev-socket-connection.patch | 0 3 files changed, 0 insertions(+), 0 deletions(-) rename {libvirt => contrib/libvirt}/0001-conf-Introduce-support-for-UNIX-domain-socket-as-qem.patch (100%) rename {qemu => contrib/qemu}/0001-net-Allow-also-UNIX-domain-sockets-to-be-used-as-net.patch (100%) rename {qemu => contrib/qemu}/0002-net-Don-t-ignore-EINVAL-on-netdev-socket-connection.patch (100%) diff --git a/libvirt/0001-conf-Introduce-support-for-UNIX-domain-socket-as-qem.patch b/contrib/libvirt/0001-conf-Introduce-support-for-UNIX-domain-socket-as-qem.patch similarity index 100% rename from libvirt/0001-conf-Introduce-support-for-UNIX-domain-socket-as-qem.patch rename to contrib/libvirt/0001-conf-Introduce-support-for-UNIX-domain-socket-as-qem.patch diff --git a/qemu/0001-net-Allow-also-UNIX-domain-sockets-to-be-used-as-net.patch b/contrib/qemu/0001-net-Allow-also-UNIX-domain-sockets-to-be-used-as-net.patch similarity index 100% rename from qemu/0001-net-Allow-also-UNIX-domain-sockets-to-be-used-as-net.patch rename to contrib/qemu/0001-net-Allow-also-UNIX-domain-sockets-to-be-used-as-net.patch diff --git a/qemu/0002-net-Don-t-ignore-EINVAL-on-netdev-socket-connection.patch b/contrib/qemu/0002-net-Don-t-ignore-EINVAL-on-netdev-socket-connection.patch similarity index 100% rename from qemu/0002-net-Don-t-ignore-EINVAL-on-netdev-socket-connection.patch rename to contrib/qemu/0002-net-Don-t-ignore-EINVAL-on-netdev-socket-connection.patch -- 2.33.0
passt can be used to implement user-mode networking for the Kata Containers runtime, so that networking setup doesn't need elevated privileges or capabilities. This commit adds the patch for Kata Containers runtime and agent to support passt as networking model and endpoint, and some basic documentation. See contrib/kata-containers/README.md for more details and setup steps. Signed-off-by: Stefano Brivio <sbrivio(a)redhat.com> --- ...gent-Add-passt-networking-model-and-.patch | 462 ++++++++++++++++++ contrib/kata-containers/README.md | 302 ++++++++++++ 2 files changed, 764 insertions(+) create mode 100644 contrib/kata-containers/0001-virtcontainers-agent-Add-passt-networking-model-and-.patch create mode 100644 contrib/kata-containers/README.md diff --git a/contrib/kata-containers/0001-virtcontainers-agent-Add-passt-networking-model-and-.patch b/contrib/kata-containers/0001-virtcontainers-agent-Add-passt-networking-model-and-.patch new file mode 100644 index 0000000..e0dffa5 --- /dev/null +++ b/contrib/kata-containers/0001-virtcontainers-agent-Add-passt-networking-model-and-.patch @@ -0,0 +1,462 @@ +From e1b250fc0b5e377285db5d90476fdd2d63501191 Mon Sep 17 00:00:00 2001 +From: Stefano Brivio <sbrivio(a)redhat.com> +Date: Fri, 28 Jan 2022 01:09:23 +0100 +Subject: [PATCH] virtcontainers, agent: Add passt networking model and + endpoint + +This implements a draft support for user-mode networking using +passt (https://passt.top), the corresponding networking model +can be enabled via: + + internetworking_model=passt + +in the [runtime] section of the TOML configuration file. + +The networking endpoint does essentially nothing, other than +starting and stopping passt as needed: no interfaces are configured, +qemu connects to passt via UNIX domain socket, the corresponding +command line option is appended if this networking model is +selected. 
+ +The passt instance started by the endpoint takes care of forwarding +traffic back and forth, translating between the L2 frames qemu-side +and native L4 sockets on the host. + +This network setup doesn't need elevated privileges or any kind of +capability. However, this patch doesn't implement privileges drop +as the containerd interface allows only runtimes running as the +same user to connect to its own UNIX domain socket interface, +typically root (at least in the case of CRI-O), and root privileges +might anyway be needed for other purposes (block devices, etc.) + +Signed-off-by: Stefano Brivio <sbrivio(a)redhat.com> +--- +SPDX-FileCopyrightText: 2021-2022 Red Hat GmbH <sbrivio(a)redhat.com> +SPDX-License-Identifier: Apache-2.0 + + src/agent/src/netlink.rs | 3 +- + .../kata-containers/govmm/qemu/qemu.go | 23 ++- + src/runtime/virtcontainers/endpoint.go | 7 + + src/runtime/virtcontainers/network.go | 24 +++ + src/runtime/virtcontainers/passt_endpoint.go | 156 ++++++++++++++++++ + .../virtcontainers/persist/api/network.go | 5 + + src/runtime/virtcontainers/qemu_arch_base.go | 11 ++ + 7 files changed, 226 insertions(+), 3 deletions(-) + create mode 100644 src/runtime/virtcontainers/passt_endpoint.go + +diff --git a/src/agent/src/netlink.rs b/src/agent/src/netlink.rs +index ed071b60..34c6df96 100644 +--- a/src/agent/src/netlink.rs ++++ b/src/agent/src/netlink.rs +@@ -312,7 +312,8 @@ impl Handle { + let list = a.iter().chain(&b); + + for route in list { +- let link = self.find_link(LinkFilter::Name(&route.device)).await?; ++ // TODO: "eth0" hardcoded for passt networking model ++ let link = self.find_link(LinkFilter::Name("eth0")).await?; + + const MAIN_TABLE: u8 = packet::constants::RT_TABLE_MAIN; + const UNICAST: u8 = packet::constants::RTN_UNICAST; +diff --git a/src/runtime/vendor/github.com/kata-containers/govmm/qemu/qemu.go b/src/runtime/vendor/github.com/kata-containers/govmm/qemu/qemu.go +index e57a4b26..1756bdfd 100644 +--- 
a/src/runtime/vendor/github.com/kata-containers/govmm/qemu/qemu.go ++++ b/src/runtime/vendor/github.com/kata-containers/govmm/qemu/qemu.go +@@ -682,6 +682,8 @@ const ( + + // VHOSTUSER is a vhost-user port (socket) + VHOSTUSER NetDeviceType = "vhostuser" ++ ++ PASST NetDeviceType = "passt" + ) + + // QemuNetdevParam converts to the QEMU -netdev parameter notation +@@ -709,6 +711,8 @@ func (n NetDeviceType) QemuNetdevParam(netdev *NetDevice, config *Config) string + log.Fatal("vhost-user devices are not supported on IBM Z") + } + return "vhost-user" // -netdev type=vhost-user (no device) ++ case PASST: ++ return "socket" // -netdev type=socket,connect=... + default: + return "" + +@@ -742,6 +746,8 @@ func (n NetDeviceType) QemuDeviceParam(netdev *NetDevice, config *Config) Device + log.Fatal("vhost-user devices are not supported on IBM Z") + } + return "" // -netdev type=vhost-user (no device) ++ case PASST: ++ device = "virtio-net" + default: + return "" + } +@@ -806,6 +812,8 @@ type NetDevice struct { + + // Transport is the virtio transport for this device. + Transport VirtioTransport ++ ++ SocketPath string + } + + // VirtioNetTransport is a map of the virtio-net device name that corresponds +@@ -818,6 +826,10 @@ var VirtioNetTransport = map[VirtioTransport]string{ + + // Valid returns true if the NetDevice structure is valid and complete. 
+ func (netdev NetDevice) Valid() bool { ++ if netdev.Type == PASST { ++ return true ++ } ++ + if netdev.ID == "" || netdev.IFName == "" { + return false + } +@@ -867,7 +879,9 @@ func (netdev NetDevice) QemuDeviceParams(config *Config) []string { + + deviceParams = append(deviceParams, fmt.Sprintf("driver=%s", driver)) + deviceParams = append(deviceParams, fmt.Sprintf("netdev=%s", netdev.ID)) +- deviceParams = append(deviceParams, fmt.Sprintf("mac=%s", netdev.MACAddress)) ++ if netdev.MACAddress != "" { ++ deviceParams = append(deviceParams, fmt.Sprintf("mac=%s", netdev.MACAddress)) ++ } + + if netdev.Bus != "" { + deviceParams = append(deviceParams, fmt.Sprintf("bus=%s", netdev.Bus)) +@@ -937,7 +951,12 @@ func (netdev NetDevice) QemuNetdevParams(config *Config) []string { + netdevParams = append(netdevParams, fmt.Sprintf("fds=%s", strings.Join(fdParams, ":"))) + + } else { +- netdevParams = append(netdevParams, fmt.Sprintf("ifname=%s", netdev.IFName)) ++ if netdev.IFName != "" { ++ netdevParams = append(netdevParams, fmt.Sprintf("ifname=%s", netdev.IFName)) ++ } ++ if netdev.SocketPath != "" { ++ netdevParams = append(netdevParams, fmt.Sprintf("connect=%s", netdev.SocketPath)) ++ } + if netdev.DownScript != "" { + netdevParams = append(netdevParams, fmt.Sprintf("downscript=%s", netdev.DownScript)) + } +diff --git a/src/runtime/virtcontainers/endpoint.go b/src/runtime/virtcontainers/endpoint.go +index 7786bb3e..e167304a 100644 +--- a/src/runtime/virtcontainers/endpoint.go ++++ b/src/runtime/virtcontainers/endpoint.go +@@ -65,6 +65,8 @@ const ( + + // IPVlanEndpointType is ipvlan network interface. + IPVlanEndpointType EndpointType = "ipvlan" ++ ++ PasstEndpointType EndpointType = "passt" + ) + + // Set sets an endpoint type based on the input string. 
+@@ -94,6 +96,9 @@ func (endpointType *EndpointType) Set(value string) error { + case "ipvlan": + *endpointType = IPVlanEndpointType + return nil ++ case "passt": ++ *endpointType = PasstEndpointType ++ return nil + default: + return fmt.Errorf("Unknown endpoint type %s", value) + } +@@ -118,6 +123,8 @@ func (endpointType *EndpointType) String() string { + return string(TuntapEndpointType) + case IPVlanEndpointType: + return string(IPVlanEndpointType) ++ case PasstEndpointType: ++ return string(PasstEndpointType) + default: + return "" + } +diff --git a/src/runtime/virtcontainers/network.go b/src/runtime/virtcontainers/network.go +index e6c681da..2de692fe 100644 +--- a/src/runtime/virtcontainers/network.go ++++ b/src/runtime/virtcontainers/network.go +@@ -57,6 +57,9 @@ const ( + // NetXConnectNoneModel can be used when the VM is in the host network namespace + NetXConnectNoneModel + ++ // passt in namespace connecting hypervisor via host sockets ++ NetXConnectPasstModel ++ + // NetXConnectInvalidModel is the last item to Check valid values by IsValid() + NetXConnectInvalidModel + ) +@@ -73,6 +76,8 @@ const ( + + tcFilterNetModelStr = "tcfilter" + ++ passtNetModelStr = "passt" ++ + noneNetModelStr = "none" + ) + +@@ -85,6 +90,8 @@ func (n *NetInterworkingModel) GetModel() string { + return macvtapNetModelStr + case NetXConnectTCFilterModel: + return tcFilterNetModelStr ++ case NetXConnectPasstModel: ++ return passtNetModelStr + case NetXConnectNoneModel: + return noneNetModelStr + } +@@ -103,6 +110,9 @@ func (n *NetInterworkingModel) SetModel(modelName string) error { + case tcFilterNetModelStr: + *n = NetXConnectTCFilterModel + return nil ++ case passtNetModelStr: ++ *n = NetXConnectPasstModel ++ return nil + case noneNetModelStr: + *n = NetXConnectNoneModel + return nil +@@ -254,6 +264,8 @@ func getLinkForEndpoint(endpoint Endpoint, netHandle *netlink.Handle) (netlink.L + link = &netlink.IPVlan{} + case *TuntapEndpoint: + link = &netlink.Tuntap{} ++ case 
*PasstEndpoint: ++ return nil, nil + default: + return nil, fmt.Errorf("Unexpected endpointType %s", ep.Type()) + } +@@ -302,6 +314,11 @@ func xConnectVMNetwork(ctx context.Context, endpoint Endpoint, h Hypervisor) err + span, ctx := networkTrace(ctx, "xConnectVMNetwork", endpoint) + defer closeSpan(span, err) + ++ if endpoint.Type() == PasstEndpointType { ++ networkLogger().Info("VM network via passt user-mode networking") ++ return nil ++ } ++ + netPair := endpoint.NetworkPair() + + queues := 0 +@@ -347,6 +364,7 @@ func xDisconnectVMNetwork(ctx context.Context, endpoint Endpoint) error { + err = untapNetworkPair(ctx, endpoint) + case NetXConnectTCFilterModel: + err = removeTCFiltering(ctx, endpoint) ++ case NetXConnectPasstModel: + default: + err = fmt.Errorf("Invalid internetworking model") + } +@@ -1095,6 +1113,12 @@ func createEndpoint(netInfo NetworkInfo, idx int, model NetInterworkingModel, li + // an appropriate EndPoint based on interface type + // This should be a switch + ++ if model == NetXConnectPasstModel { ++ networkLogger().Info("creating passt endpoint") ++ endpoint, err := createPasstNetworkEndpoint(idx) ++ return endpoint, err ++ } ++ + // Check if interface is a physical interface. Do not create + // tap interface/bridge if it is. 
+ isPhysical, err := isPhysicalIface(netInfo.Iface.Name) +diff --git a/src/runtime/virtcontainers/passt_endpoint.go b/src/runtime/virtcontainers/passt_endpoint.go +new file mode 100644 +index 00000000..7f40135a +--- /dev/null ++++ b/src/runtime/virtcontainers/passt_endpoint.go +@@ -0,0 +1,156 @@ ++// SPDX-License-Identifier: Apache-2.0 ++// ++// passt_endpoint.go - passt endpoint for Kata Containers: start and stop passt ++// ++// Copyright (c) 2021-2022 Red Hat GmbH ++// Author: Stefano Brivio <sbrivio(a)redhat.com> ++ ++package virtcontainers ++ ++import ( ++ "context" ++ "fmt" ++ "os" ++ "os/exec" ++ "syscall" ++ ++ persistapi "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/persist/api" ++ vcTypes "github.com/kata-containers/kata-containers/src/runtime/virtcontainers/types" ++) ++ ++type PasstEndpoint struct { ++ EndpointType EndpointType ++ EndpointProperties NetworkInfo ++ PCIPath vcTypes.PciPath ++ PasstPID int ++} ++ ++func createPasstNetworkEndpoint(idx int) (*PasstEndpoint, error) { ++ if idx < 0 { ++ return &PasstEndpoint{}, fmt.Errorf("invalid network endpoint index: %d", idx) ++ } ++ ++ cmd := exec.Command("passt", ++ "-P", fmt.Sprintf("/tmp/kata-passt-%d.pid", idx), ++ "-s", fmt.Sprintf("/tmp/kata-passt-%d.socket", idx)) ++ err := cmd.Run() ++ if err != nil { ++ return &PasstEndpoint{}, fmt.Errorf("passt failed to start: %v", err) ++ } ++ ++ in, err := os.Open(fmt.Sprintf("/tmp/kata-passt-%d.pid", idx)) ++ if err != nil { ++ return &PasstEndpoint{}, fmt.Errorf("Failed to read passt PID: %v", err) ++ } ++ defer in.Close() ++ ++ var pid int ++ _, err = fmt.Fscanf(in, "%d", &pid) ++ if err != nil { ++ return &PasstEndpoint{}, fmt.Errorf("Failed to read passt pid: %v", err) ++ } ++ ++ endpoint := &PasstEndpoint{ ++ EndpointType: PasstEndpointType, ++ PasstPID: pid, ++ } ++ ++ return endpoint, nil ++} ++ ++func (endpoint *PasstEndpoint) Properties() NetworkInfo { ++ return endpoint.EndpointProperties ++} ++ ++func (endpoint 
*PasstEndpoint) Type() EndpointType { ++ return endpoint.EndpointType ++} ++ ++// unsupported ++func (endpoint *PasstEndpoint) HardwareAddr() string { ++ return "00:11:22:33:44:55" ++} ++ ++// unsupported ++func (endpoint *PasstEndpoint) Name() string { ++ return "" ++} ++ ++// unsupported ++func (endpoint *PasstEndpoint) NetworkPair() *NetworkInterfacePair { ++ return nil ++} ++ ++// PciPath returns the PCI path of the endpoint. ++func (endpoint *PasstEndpoint) PciPath() vcTypes.PciPath { ++ return endpoint.PCIPath ++} ++ ++// useless ++func (endpoint *PasstEndpoint) SetPciPath(pciPath vcTypes.PciPath) { ++ endpoint.PCIPath = pciPath ++} ++ ++func (endpoint *PasstEndpoint) SetProperties(properties NetworkInfo) { ++ endpoint.EndpointProperties = properties ++} ++ ++func (endpoint *PasstEndpoint) Attach(ctx context.Context, s *Sandbox) error { ++ h := s.hypervisor ++ if err := xConnectVMNetwork(ctx, endpoint, h); err != nil { ++ networkLogger().WithError(err).Error("Error attaching passt endpoint") ++ return err ++ } ++ ++ return h.AddDevice(ctx, endpoint, NetDev) ++} ++ ++func (endpoint *PasstEndpoint) Detach(ctx context.Context, netNsCreated bool, netNsPath string) error { ++ syscall.Kill(endpoint.PasstPID, syscall.SIGQUIT) ++ ++ return nil ++} ++ ++func (endpoint *PasstEndpoint) HotAttach(ctx context.Context, h Hypervisor) error { ++ return fmt.Errorf("HotAttach not supported by PasstEndpoint") ++} ++ ++func (endpoint *PasstEndpoint) HotDetach(ctx context.Context, h Hypervisor, netNsCreated bool, netNsPath string) error { ++ return fmt.Errorf("HotDetatch not supported by PasstEndpoint") ++} ++ ++func (endpoint *PasstEndpoint) save() persistapi.NetworkEndpoint { ++ return persistapi.NetworkEndpoint{ ++ Type: string(endpoint.Type()), ++ ++ Passt: &persistapi.PasstEndpoint{ ++ PasstPID: endpoint.PasstPID, ++ }, ++ } ++} ++ ++func (endpoint *PasstEndpoint) load(s persistapi.NetworkEndpoint) { ++ endpoint.EndpointType = PasstEndpointType ++ ++ if s.Passt != nil { ++ 
endpoint.PasstPID = s.Passt.PasstPID ++ } ++} ++ ++// unsupported ++func (endpoint *PasstEndpoint) GetRxRateLimiter() bool { ++ return false ++} ++ ++func (endpoint *PasstEndpoint) SetRxRateLimiter() error { ++ return fmt.Errorf("rx rate limiter is unsupported for physical endpoint") ++} ++ ++// unsupported ++func (endpoint *PasstEndpoint) GetTxRateLimiter() bool { ++ return false ++} ++ ++func (endpoint *PasstEndpoint) SetTxRateLimiter() error { ++ return fmt.Errorf("tx rate limiter is unsupported for physical endpoint") ++} +diff --git a/src/runtime/virtcontainers/persist/api/network.go b/src/runtime/virtcontainers/persist/api/network.go +index 51c3aac6..79d77cd9 100644 +--- a/src/runtime/virtcontainers/persist/api/network.go ++++ b/src/runtime/virtcontainers/persist/api/network.go +@@ -79,6 +79,10 @@ type VhostUserEndpoint struct { + PCIPath vcTypes.PciPath + } + ++type PasstEndpoint struct { ++ PasstPID int ++} ++ + // NetworkEndpoint contains network interface information + type NetworkEndpoint struct { + // One and only one of these below are not nil according to Type. 
+@@ -90,6 +94,7 @@ type NetworkEndpoint struct { + Tap *TapEndpoint `json:",omitempty"` + IPVlan *IPVlanEndpoint `json:",omitempty"` + Tuntap *TuntapEndpoint `json:",omitempty"` ++ Passt *PasstEndpoint `json:",omitempty"` + + Type string + } +diff --git a/src/runtime/virtcontainers/qemu_arch_base.go b/src/runtime/virtcontainers/qemu_arch_base.go +index 97cd6eb8..9ace0ace 100644 +--- a/src/runtime/virtcontainers/qemu_arch_base.go ++++ b/src/runtime/virtcontainers/qemu_arch_base.go +@@ -615,6 +615,17 @@ func genericNetwork(endpoint Endpoint, vhost, nestedRun bool, index int) (govmmQ + FDs: netPair.VMFds, + VhostFDs: netPair.VhostFds, + } ++ case *PasstEndpoint: ++ d = govmmQemu.NetDevice{ ++ Type: govmmQemu.PASST, ++ Driver: govmmQemu.VirtioNet, ++ ID: fmt.Sprintf("network-%d", index), ++ // TODO: Drop hardcoded MAC address, passt endpoint ++ // doesn't need to know it ++ MACAddress: "00:11:22:33:44:55", ++ DisableModern: nestedRun, ++ SocketPath: fmt.Sprintf("/tmp/kata-passt-%d.socket", index), ++ } + default: + return govmmQemu.NetDevice{}, fmt.Errorf("Unknown type for endpoint") + } +-- +2.28.0 + diff --git a/contrib/kata-containers/README.md b/contrib/kata-containers/README.md new file mode 100644 index 0000000..96acd5f --- /dev/null +++ b/contrib/kata-containers/README.md @@ -0,0 +1,302 @@ +This document shows how to set up a Kata Containers environment using passt to +implement user-mode networking: contrary to other networking models currently +implemented, this kind of setup requires no elevated privileges or capabilities +as far as networking is concerned. + +This proof-of-concept uses CRI-O as container runtime implementation, which is +controlled directly without resorting to a full Kubernetes environment. + +# Pre-requisites + +* Go and Rust toolchains, typically provided by distribution packages +* the usual tools, such as git, make, etc. 
+* a 4.x qemu version, or more recent, with a working virtiofsd executable + (provided at least by Debian, Ubuntu, Fedora packages) + +# Fetch and prepare components + +## CRI-O + +CRI-O is the container runtime. It implements the Kubernetes CRI (Container +Runtime Interface) on one side -- and we'll handle that part manually with +`crictl` here, and on the other side it supports OCI (Open Container Initiative) +runtimes -- Kata Containers is one of them. + +### Fetch + + git clone https://github.com/cri-o/cri-o.git + +### Build + + cd cri-o + make + +### Install + +As root: + + make install + +### Configure + +Configuration is now at `/etc/crio/crio.conf`. This would also be the case for +distribution packages. Some specific configuration items for Kata Containers +are: + + # Cgroup management implementation used for the runtime. + cgroup_manager = "cgroupfs" + + # manage_ns_lifecycle determines whether we pin and remove namespaces + # and manage their lifecycle + manage_ns_lifecycle = true + +and the following section, that can be added at the end, defines a special type +of runtime, the `vm` type. This is needed to run the Kata Containers runtime +instead of the default `crun` choice: + + [crio.runtime.runtimes.kata] + runtime_path = "/usr/local/bin/containerd-shim-kata-v2" + runtime_type = "vm" + runtime_root = "/run/vc" + +Note that we don't have a containerd-shim-kata-v2 binary yet, we'll deal with +that in the next steps. + +## CNI plugins + +CNI plugins are actually binaries, run by CRI-O, used to configure networking on +the host as well as on the pod side. A few network topologies are offered, with +very limited capabilities. 
+ +### Fetch + + git clone https://github.com/containernetworking/plugins + +### Build + + cd plugins + ./build_linux.sh + +### Install + +As root: + + mkdir -p /opt/cni/bin + cp bin/* /opt/cni/bin/ + + +### Configure + +The path where CNI configurations are located is configurable in +`/etc/crio/crio.conf`, see the `network_dir` parameter there. Assuming the +default value, we need to provide at least one configuration under +`/etc/cni/net.d/`. For example: + + # cat /etc/cni/net.d/50-kata-sandbox.conf + { + "cniVersion": "0.3.0", + "name": "crio-bridge", + "type": "bridge", + "bridge": "cni0", + "isGateway": true, + "ipMasq": true, + "ipam": { + "type": "host-local", + "subnet": "10.88.0.0/16", + "routes": [ + { "dst": "0.0.0.0/0" } + ] + } + } + +## crictl + +`crictl` is needed to control CRI-O in lieu of Kubernetes. + +### Fetch + + git clone https://github.com/kubernetes-sigs/cri-tools.git + +### Build + + cd cri-tools + make + +### Install + +As root: + + make install + +## mbuto + +We'll use `mbuto` to build a minimal virtual machine image for usage with the +Kata Containers runtime. + +### Fetch + + git clone https://mbuto.lameexcu.se/mbuto + +## Kata Containers + +### Fetch + + git clone https://github.com/kata-containers/kata-containers + +### Patch + +The current upstream version doesn't support the _passt_ networking model yet, +use the patch from this directory to add it: + + patch -p1 < 0001-virtcontainers-agent-Add-passt-networking-model-and-.patch + +### Build + + make -C src/runtime + make -C src/agent LIBC=gnu + +### Install + +As root: + + make -C src/runtime install + cp src/agent/target/x86_64-unknown-linux-gnu/release/kata-agent /usr/libexec/ + chmod 755 /usr/libexec/kata-agent + +### Build the Virtual Machine image + + cd mbuto + ./mbuto -f /tmp/kata.img + +See `mbuto -h` for additional parameters, such as choice of kernel version, +kernel modules, program add-ons, etc. 
`mbuto` will print some configuration +parameters to be used in the configuration of the Kata Containers runtime below. +For example: + + $ ./mbuto -c lz4 -f /tmp/kata.img + Not running as root, won't keep cpio mounted + Size: bin 12M lib 59M kmod 1.4M total 70M compressed 33M + Kata Containers [hypervisor.qemu] configuration: + + kernel = "/boot/vmlinuz-5.10.0-6-amd64" + initrd = "/tmp/kata.img" + +### Configure + +The configuration file at this point is located at +`/usr/share/defaults/kata-containers/configuration-qemu.toml`. Some parameters of general interest are: + + [hypervisor.qemu] + kernel = "/boot/vmlinuz-5.10.0-6-amd64" + initrd = "/tmp/kata.img" + +where we can use the values indicated earlier by `mbuto`. Currently, the default +path for the `virtiofsd` daemon doesn't work for all distributions: ensure that +it matches. For example, on Debian: + + virtio_fs_daemon = "/usr/lib/qemu/virtiofsd" + +we'll then need to enable the `passt` networking model for the runtime. In the +`[runtime]` section: + + internetworking_model = "passt" + +# Run an example container + +## Fetch + +We'll now need an image of a container to run as example. 
With `podman` +installed via distribution package, we can import one: + + podman pull docker.io/i386/busybox + +## Configure + +Now we can define configuration files for the pod and container we want to create +and start: + + $ cat pod-config.json + { + "metadata": { + "name": "kata-sandbox", + "namespace": "default", + "attempt": 1, + "uid": "hdishd83djaidwnduwk28bcsb" + }, + "logDirectory": "/tmp", + "linux": { + } + } + + $ cat container-busybox.json + { + "metadata": { + "name": "kata-busybox" + }, + "image": { + "image": "docker.io/i386/busybox" + }, + "command": [ + "sleep", "6000" + ], + "log_path":"kata-busybox.log", + "linux": { + } + } + +## Run the container workload + +Assuming we have `pod-config.json` and `container-busybox.json` defined above, +we can now: + +### start CRI-O + + crio -l debug + +### create the pod and run a container inside it + + c=$(crictl start $(crictl create $(crictl runp --runtime=kata pod-config.json) container-busybox.json pod-config.json)) + +### verify that addresses are properly configured + + crictl exec $c ip ad sh + +## Enable support for ICMP/ICMPv6 Echo Request + +_passt_ can replicate ICMP Echo Requests sent by the workload, and propagate the +replies back. However, as it's not running as root, we need to enable so-called +_ping_ sockets for unprivileged users. From the namespace created by CRI-O for +this container: + + sysctl -w net.ipv4.ping_group_range="0 2147483647" + +# Troubleshooting + +## Redirect qemu's console output to file + +Agent errors and kernel messages should be accessible via named UNIX domain +socket at `/run/vc/vm/*/console.sock`, provided `agent.debug_console` is enabled +in `kernel_params` of `configuration.toml`, but this won't work if the agent +doesn't start. 
In order to get those, we can wrap `qemu` and get, additionally, +all the output piped to a file: + + $ cat /usr/local/bin/qemu.sh + #!/bin/sh + + /usr/bin/qemu-system-x86_64 "$@" -serial file:/tmp/qemu.log 2>/tmp/qemu_err.log + +now, use this as path for `qemu` in `configuration.toml`: + + [hypervisor.qemu] + path = "/usr/local/bin/qemu.sh" + +and don't forget to add `console=ttyS0` to the kernel parameters, so that kernel +messages will also be included: + + kernel_params = "... console=ttyS0" + +## Debug console + +See the `kata-console` script in the +[kata-vfio-tools repository](https://github.com/dgibson/kata-vfio-tools) for a +convenient helper to access the debug console provided by the agent. -- 2.33.0