On Thu, Oct 09, 2025 at 03:04:05PM +0200, Laurent Vivier wrote:
Centralize epoll_add() and epoll_del() helper functions into new epoll_ctl.c/h files.
This also moves the union epoll_ref definition from passt.h to epoll_ctl.h where it's more logically placed.
The new epoll_add() helper simplifies adding file descriptors to epoll by taking an epoll_ref and events, handling error reporting consistently across all call sites.
Signed-off-by: Laurent Vivier
Nice cleanup. One nit noted below.
--- Makefile | 22 +++++++++++----------- epoll_ctl.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ epoll_ctl.h | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++ icmp.c | 4 +--- passt.c | 2 +- passt.h | 34 ---------------------------------- pasta.c | 7 +++---- repair.c | 14 +++++--------- tap.c | 13 ++++--------- tcp.c | 2 +- tcp_splice.c | 2 +- udp.c | 2 +- udp_flow.c | 1 + util.c | 22 +++------------------- util.h | 4 +++- vhost_user.c | 8 ++------ vu_common.c | 2 +- 17 files changed, 134 insertions(+), 101 deletions(-) create mode 100644 epoll_ctl.c create mode 100644 epoll_ctl.h
diff --git a/Makefile b/Makefile index 3328f8324140..91e037b8fd3c 100644 --- a/Makefile +++ b/Makefile @@ -37,23 +37,23 @@ FLAGS += -DPAGE_SIZE=$(shell getconf PAGE_SIZE) FLAGS += -DVERSION=\"$(VERSION)\" FLAGS += -DDUAL_STACK_SOCKETS=$(DUAL_STACK_SOCKETS)
-PASST_SRCS = arch.c arp.c checksum.c conf.c dhcp.c dhcpv6.c flow.c fwd.c \ - icmp.c igmp.c inany.c iov.c ip.c isolation.c lineread.c log.c mld.c \ - ndp.c netlink.c migrate.c packet.c passt.c pasta.c pcap.c pif.c \ - repair.c tap.c tcp.c tcp_buf.c tcp_splice.c tcp_vu.c udp.c udp_flow.c \ - udp_vu.c util.c vhost_user.c virtio.c vu_common.c +PASST_SRCS = arch.c arp.c checksum.c conf.c dhcp.c dhcpv6.c epoll_ctl.c \ + flow.c fwd.c icmp.c igmp.c inany.c iov.c ip.c isolation.c lineread.c \ + log.c mld.c ndp.c netlink.c migrate.c packet.c passt.c pasta.c pcap.c \ + pif.c repair.c tap.c tcp.c tcp_buf.c tcp_splice.c tcp_vu.c udp.c \ + udp_flow.c udp_vu.c util.c vhost_user.c virtio.c vu_common.c QRAP_SRCS = qrap.c PASST_REPAIR_SRCS = passt-repair.c SRCS = $(PASST_SRCS) $(QRAP_SRCS) $(PASST_REPAIR_SRCS)
MANPAGES = passt.1 pasta.1 qrap.1 passt-repair.1
-PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h flow.h fwd.h \ - flow_table.h icmp.h icmp_flow.h inany.h iov.h ip.h isolation.h \ - lineread.h log.h migrate.h ndp.h netlink.h packet.h passt.h pasta.h \ - pcap.h pif.h repair.h siphash.h tap.h tcp.h tcp_buf.h tcp_conn.h \ - tcp_internal.h tcp_splice.h tcp_vu.h udp.h udp_flow.h udp_internal.h \ - udp_vu.h util.h vhost_user.h virtio.h vu_common.h +PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h epoll_ctl.h \ + flow.h fwd.h flow_table.h icmp.h icmp_flow.h inany.h iov.h ip.h \ + isolation.h lineread.h log.h migrate.h ndp.h netlink.h packet.h \ + passt.h pasta.h pcap.h pif.h repair.h siphash.h tap.h tcp.h tcp_buf.h \ + tcp_conn.h tcp_internal.h tcp_splice.h tcp_vu.h udp.h udp_flow.h \ + udp_internal.h udp_vu.h util.h vhost_user.h virtio.h vu_common.h HEADERS = $(PASST_HEADERS) seccomp.h
C := \#include
\nint main(){int a=getrandom(0, 0, 0);} diff --git a/epoll_ctl.c b/epoll_ctl.c new file mode 100644 index 000000000000..0a06350f87a5 --- /dev/null +++ b/epoll_ctl.c @@ -0,0 +1,45 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* epoll_ctl.c - epoll manipulation helpers + * + * Copyright Red Hat + * Author: Laurent Vivier + */ + +#include + +#include "epoll_ctl.h" + +/** + * epoll_add() - Add a file descriptor to an epollfd + * @epollfd: epoll file descriptor to add to + * @events: epoll events + * @ref: epoll reference for the file descriptor (includes fd and metadata) + * + * Return: 0 on success, negative errno on failure + */ +int epoll_add(int epollfd, uint32_t events, union epoll_ref *ref) +{ + struct epoll_event ev; + int ret; + + ev.events = events; + ev.data.u64 = ref->u64; + + ret = epoll_ctl(epollfd, EPOLL_CTL_ADD, ref->fd, &ev); + if (ret == -1) { + ret = -errno; + warn("Failed to add fd to epoll: %s", strerror_(-ret));
I think this should be err(), not warn(). If we're unable to add an fd to epoll, things are almost certainly going to go badly wrong.
+ } + + return ret; +} + +/** + * epoll_del() - Remove a file descriptor from an epollfd + * @epollfd: epoll file descriptor to remove from + * @fd: File descriptor to remove + */ +void epoll_del(int epollfd, int fd) +{ + epoll_ctl(epollfd, EPOLL_CTL_DEL, fd, NULL); +} diff --git a/epoll_ctl.h b/epoll_ctl.h new file mode 100644 index 000000000000..cf92b0f63f26 --- /dev/null +++ b/epoll_ctl.h @@ -0,0 +1,51 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later + * Copyright Red Hat + * Author: Laurent Vivier
+ */ + +#ifndef EPOLL_CTL_H +#define EPOLL_CTL_H + +#include + +#include "util.h" +#include "passt.h" +#include "epoll_type.h" +#include "flow.h" +#include "tcp.h" +#include "udp.h" + +/** + * union epoll_ref - Breakdown of reference for epoll fd bookkeeping + * @type: Type of fd (tells us what to do with events) + * @fd: File descriptor number (implies < 2^24 total descriptors) + * @flow: Index of the flow this fd is linked to + * @tcp_listen: TCP-specific reference part for listening sockets + * @udp: UDP-specific reference part + * @data: Data handled by protocol handlers + * @nsdir_fd: netns dirfd for fallback timer checking if namespace is gone + * @queue: vhost-user queue index for this fd + * @u64: Opaque reference for epoll_ctl() and epoll_wait() + */ +union epoll_ref { + struct { + enum epoll_type type:8; + int32_t fd:FD_REF_BITS; + union { + uint32_t flow; + flow_sidx_t flowside; + union tcp_listen_epoll_ref tcp_listen; + union udp_listen_epoll_ref udp; + uint32_t data; + int nsdir_fd; + int queue; + }; + }; + uint64_t u64; +}; +static_assert(sizeof(union epoll_ref) <= sizeof(union epoll_data), + "epoll_ref must have same size as epoll_data"); + +int epoll_add(int epollfd, uint32_t events, union epoll_ref *ref); +void epoll_del(int epollfd, int fd); +#endif /* EPOLL_CTL_H */ diff --git a/icmp.c b/icmp.c index bd3108a21675..c26561da80bf 100644 --- a/icmp.c +++ b/icmp.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include #include @@ -23,10 +22,8 @@ #include #include #include -#include #include #include -#include #include #include
@@ -41,6 +38,7 @@ #include "inany.h" #include "icmp.h" #include "flow_table.h" +#include "epoll_ctl.h" #define ICMP_ECHO_TIMEOUT 60 /* s, timeout for ICMP socket activity */ #define ICMP_NUM_IDS (1U << 16) diff --git a/passt.c b/passt.c index 31fbb75b1b12..e595f13a56f7 100644 --- a/passt.c +++ b/passt.c @@ -19,7 +19,6 @@ * created in a separate network namespace). */
-#include
#include #include #include @@ -53,6 +52,7 @@ #include "vu_common.h" #include "migrate.h" #include "repair.h" +#include "epoll_ctl.h" #define EPOLL_EVENTS 8
diff --git a/passt.h b/passt.h index 0075eb4b3b16..befe56bb167b 100644 --- a/passt.h +++ b/passt.h @@ -35,40 +35,6 @@ union epoll_ref; #define MAC_OUR_LAA \ ((uint8_t [ETH_ALEN]){0x9a, 0x55, 0x9a, 0x55, 0x9a, 0x55})
-/** - * union epoll_ref - Breakdown of reference for epoll fd bookkeeping - * @type: Type of fd (tells us what to do with events) - * @fd: File descriptor number (implies < 2^24 total descriptors) - * @flow: Index of the flow this fd is linked to - * @tcp_listen: TCP-specific reference part for listening sockets - * @udp: UDP-specific reference part - * @icmp: ICMP-specific reference part - * @data: Data handled by protocol handlers - * @nsdir_fd: netns dirfd for fallback timer checking if namespace is gone - * @queue: vhost-user queue index for this fd - * @u64: Opaque reference for epoll_ctl() and epoll_wait() - */ -union epoll_ref { - struct { - enum epoll_type type:8; -#define FD_REF_BITS 24 -#define FD_REF_MAX ((int)MAX_FROM_BITS(FD_REF_BITS)) - int32_t fd:FD_REF_BITS; - union { - uint32_t flow; - flow_sidx_t flowside; - union tcp_listen_epoll_ref tcp_listen; - union udp_listen_epoll_ref udp; - uint32_t data; - int nsdir_fd; - int queue; - }; - }; - uint64_t u64; -}; -static_assert(sizeof(union epoll_ref) <= sizeof(union epoll_data), - "epoll_ref must have same size as epoll_data"); - /* Large enough for ~128 maximum size frames */ #define PKT_BUF_BYTES (8UL << 20)
diff --git a/pasta.c b/pasta.c index 687406b6e736..e905f6d33b95 100644 --- a/pasta.c +++ b/pasta.c @@ -27,7 +27,6 @@ #include
#include #include -#include #include #include #include @@ -49,6 +48,7 @@ #include "isolation.h" #include "netlink.h" #include "log.h" +#include "epoll_ctl.h" #define HOSTNAME_PREFIX "pasta-"
@@ -444,7 +444,6 @@ static int pasta_netns_quit_timer(void) */ void pasta_netns_quit_init(const struct ctx *c) { - struct epoll_event ev = { .events = EPOLLIN }; int flags = O_NONBLOCK | O_CLOEXEC; struct statfs s = { 0 }; bool try_inotify = true; @@ -487,8 +486,8 @@ void pasta_netns_quit_init(const struct ctx *c) die("netns monitor file number %i too big, exiting", fd);
ref.fd = fd; - ev.data.u64 = ref.u64; - epoll_ctl(c->epollfd, EPOLL_CTL_ADD, fd, &ev); + + epoll_add(c->epollfd, EPOLLIN, &ref); }
/** diff --git a/repair.c b/repair.c index f6b1bf36479c..c8f4737fa62a 100644 --- a/repair.c +++ b/repair.c @@ -22,6 +22,7 @@ #include "inany.h" #include "flow.h" #include "flow_table.h" +#include "epoll_ctl.h"
#include "repair.h"
@@ -47,7 +48,6 @@ static int repair_nfds; void repair_sock_init(const struct ctx *c) { union epoll_ref ref = { .type = EPOLL_TYPE_REPAIR_LISTEN }; - struct epoll_event ev = { 0 };
if (c->fd_repair_listen == -1) return; @@ -58,9 +58,7 @@ void repair_sock_init(const struct ctx *c) }
ref.fd = c->fd_repair_listen; - ev.events = EPOLLIN | EPOLLHUP | EPOLLET; - ev.data.u64 = ref.u64; - if (epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_repair_listen, &ev)) + if (epoll_add(c->epollfd, EPOLLIN | EPOLLHUP | EPOLLET, &ref)) err_perror("repair helper socket epoll_ctl(), won't migrate"); }
@@ -74,7 +72,6 @@ void repair_sock_init(const struct ctx *c) int repair_listen_handler(struct ctx *c, uint32_t events) { union epoll_ref ref = { .type = EPOLL_TYPE_REPAIR }; - struct epoll_event ev = { 0 }; struct ucred ucred; socklen_t len; int rc; @@ -112,10 +109,9 @@ int repair_listen_handler(struct ctx *c, uint32_t events) info("Accepted TCP_REPAIR helper, PID %i", ucred.pid);
ref.fd = c->fd_repair; - ev.events = EPOLLHUP | EPOLLET; - ev.data.u64 = ref.u64; - if (epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_repair, &ev)) { - rc = errno; + + rc = epoll_add(c->epollfd, EPOLLHUP | EPOLLET, &ref); + if (rc < 0) { debug_perror("epoll_ctl() on TCP_REPAIR helper socket"); close(c->fd_repair); c->fd_repair = -1; diff --git a/tap.c b/tap.c index 134c37a72979..873874796f79 100644 --- a/tap.c +++ b/tap.c @@ -26,7 +26,6 @@ #include
#include #include -#include #include #include #include @@ -61,6 +60,7 @@ #include "log.h" #include "vhost_user.h" #include "vu_common.h" +#include "epoll_ctl.h" /* Maximum allowed frame lengths (including L2 header) */
@@ -1327,14 +1327,12 @@ static void tap_backend_show_hints(struct ctx *c) static void tap_sock_unix_init(const struct ctx *c) { union epoll_ref ref = { .type = EPOLL_TYPE_TAP_LISTEN }; - struct epoll_event ev = { 0 };
listen(c->fd_tap_listen, 0);
ref.fd = c->fd_tap_listen; - ev.events = EPOLLIN | EPOLLET; - ev.data.u64 = ref.u64; - epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap_listen, &ev); + + epoll_add(c->epollfd, EPOLLIN | EPOLLET, &ref); }
/** @@ -1343,7 +1341,6 @@ static void tap_sock_unix_init(const struct ctx *c) */ static void tap_start_connection(const struct ctx *c) { - struct epoll_event ev = { 0 }; union epoll_ref ref = { 0 };
ref.fd = c->fd_tap; @@ -1359,9 +1356,7 @@ static void tap_start_connection(const struct ctx *c) break; }
- ev.events = EPOLLIN | EPOLLRDHUP; - ev.data.u64 = ref.u64; - epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap, &ev); + epoll_add(c->epollfd, EPOLLIN | EPOLLRDHUP, &ref);
if (c->ifi4) arp_send_init_req(c); diff --git a/tcp.c b/tcp.c index 9100c67e57e0..c2d842bbdf4f 100644 --- a/tcp.c +++ b/tcp.c @@ -279,7 +279,6 @@ #include
#include #include -#include #include #include #include @@ -309,6 +308,7 @@ #include "tcp_internal.h" #include "tcp_buf.h" #include "tcp_vu.h" +#include "epoll_ctl.h" /* * The size of TCP header (including options) is given by doff (Data Offset) diff --git a/tcp_splice.c b/tcp_splice.c index 666ee62b738f..6f21184bdc55 100644 --- a/tcp_splice.c +++ b/tcp_splice.c @@ -44,7 +44,6 @@ #include
#include #include -#include #include #include @@ -56,6 +55,7 @@ #include "siphash.h" #include "inany.h" #include "flow.h" +#include "epoll_ctl.h"
#include "flow_table.h"
diff --git a/udp.c b/udp.c index 86585b7e0942..3812d5c2336f 100644 --- a/udp.c +++ b/udp.c @@ -94,7 +94,6 @@ #include
#include #include -#include #include #include #include @@ -115,6 +114,7 @@ #include "flow_table.h" #include "udp_internal.h" #include "udp_vu.h" +#include "epoll_ctl.h" #define UDP_MAX_FRAMES 32 /* max # of frames to receive at once */
diff --git a/udp_flow.c b/udp_flow.c index 84973f807167..d9c75f1bb1d8 100644 --- a/udp_flow.c +++ b/udp_flow.c @@ -15,6 +15,7 @@ #include "passt.h" #include "flow_table.h" #include "udp_internal.h" +#include "epoll_ctl.h"
#define UDP_CONN_TIMEOUT 180 /* s, timeout for ephemeral or local bind */
diff --git a/util.c b/util.c index 88a91b1100f5..b2490123590a 100644 --- a/util.c +++ b/util.c @@ -18,7 +18,6 @@ #include
#include #include -#include #include #include #include @@ -35,6 +34,7 @@ #include "packet.h" #include "log.h" #include "pcap.h" +#include "epoll_ctl.h" #ifdef HAS_GETRANDOM #include #endif @@ -58,7 +58,6 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type, sa_family_t af = ((const struct sockaddr *)sa)->sa_family; union epoll_ref ref = { .type = type, .data = data }; bool freebind = false; - struct epoll_event ev; int fd, y = 1, ret; uint8_t proto; int socktype; @@ -172,13 +171,9 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type, return ret; } - ev.events = EPOLLIN; - ev.data.u64 = ref.u64; - if (epoll_ctl(c->epollfd, EPOLL_CTL_ADD, fd, &ev) == -1) { - ret = -errno; - warn("L4 epoll_ctl: %s", strerror_(-ret)); + ret = epoll_add(c->epollfd, EPOLLIN, &ref); + if (ret < 0) return ret; - }
return fd; } @@ -994,17 +989,6 @@ void raw_random(void *buf, size_t buflen) die("Unexpected EOF on random data source"); }
-/** - * epoll_del() - Remove a file descriptor from our passt epoll - * @epollfd: epoll file descriptor to add to - * @fd: File descriptor to remove - */ -void epoll_del(int epollfd, int fd) -{ - epoll_ctl(epollfd, EPOLL_CTL_DEL, fd, NULL); - -} - /** * encode_domain_name() - Encode domain name according to RFC 1035, section 3.1 * @buf: Buffer to fill in with encoded domain name diff --git a/util.h b/util.h index c61cbef357aa..8e4b4c5c6032 100644 --- a/util.h +++ b/util.h @@ -193,6 +193,9 @@ int do_clone(int (*fn)(void *), char *stack_area, size_t stack_size, int flags, #define SNDBUF_BIG (4ULL * 1024 * 1024) #define SNDBUF_SMALL (128ULL * 1024)
+#define FD_REF_BITS 24 +#define FD_REF_MAX ((int)MAX_FROM_BITS(FD_REF_BITS)) + #include
#include #include @@ -300,7 +303,6 @@ static inline bool mod_between(unsigned x, unsigned i, unsigned j, unsigned m) #define FPRINTF(f, ...) (void)fprintf(f, __VA_ARGS__) void raw_random(void *buf, size_t buflen); -void epoll_del(int epollfd, int fd);
/* * Starting from glibc 2.40.9000 and commit 25a5eb4010df ("string: strerror, diff --git a/vhost_user.c b/vhost_user.c index f8324c59cc6c..aea1e2cbcea5 100644 --- a/vhost_user.c +++ b/vhost_user.c @@ -32,8 +32,6 @@ #include
#include #include -#include -#include #include #include #include @@ -45,6 +43,7 @@ #include "vhost_user.h" #include "pcap.h" #include "migrate.h" +#include "epoll_ctl.h" /* vhost-user version we are compatible with */ #define VHOST_USER_VERSION 1 @@ -753,11 +752,8 @@ static void vu_set_watch(const struct vu_dev *vdev, int idx) .fd = vdev->vq[idx].kick_fd, .queue = idx }; - struct epoll_event ev = { 0 };
- ev.data.u64 = ref.u64; - ev.events = EPOLLIN; - epoll_ctl(vdev->context->epollfd, EPOLL_CTL_ADD, ref.fd, &ev); + epoll_add(vdev->context->epollfd, EPOLLIN, &ref); }
/** diff --git a/vu_common.c b/vu_common.c index b716070ea3c3..b13b7c308fd8 100644 --- a/vu_common.c +++ b/vu_common.c @@ -6,7 +6,6 @@ */
#include
-#include #include #include #include @@ -19,6 +18,7 @@ #include "pcap.h" #include "vu_common.h" #include "migrate.h" +#include "epoll_ctl.h" #define VU_MAX_TX_BUFFER_NB 2
-- 2.50.1
-- 
David Gibson (he or they)	| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you, not the other way
				| around.
http://www.ozlabs.org/~dgibson