On Fri, Dec 19, 2025 at 05:45:15PM +0100, Laurent Vivier wrote:
Currently, each flow type (TCP, TCP_SPLICE, PING, UDP) has its own code to add or modify file descriptors in epoll. This leads to duplicated boilerplate code across icmp.c, tcp.c, tcp_splice.c, and udp_flow.c, each setting up epoll_ref unions and calling epoll_ctl() with flow-type-specific details.
Introduce flow_epoll_set() in flow.c to handle epoll operations for all flow types in a unified way. The function takes an explicit epoll type parameter, allowing it to handle not only flow socket types but also the TCP timer (EPOLL_TYPE_TCP_TIMER).
This will be needed to migrate queue pair from an epollfd to another.
Signed-off-by: Laurent Vivier
Love the concept. Couple of warts in execution.
--- flow.c | 70 ++++++++++++++++++++++++++++++++++++++++++++++++++++ flow.h | 3 +++ icmp.c | 9 ++----- tcp.c | 39 +++++++++-------------------- tcp_splice.c | 27 +++++++------------- udp_flow.c | 12 +-------- 6 files changed, 97 insertions(+), 63 deletions(-)
diff --git a/flow.c b/flow.c index 4f53486586cd..9c98102e3616 100644 --- a/flow.c +++ b/flow.c @@ -20,6 +20,7 @@ #include "flow.h" #include "flow_table.h" #include "repair.h" +#include "epoll_ctl.h"
const char *flow_state_str[] = { [FLOW_STATE_FREE] = "FREE", @@ -390,6 +391,75 @@ void flow_epollid_clear(struct flow_common *f) f->epollid = EPOLLFD_ID_INVALID; }
+/** + * flow_epoll_set() - Add or modify epoll registration for a flow socket + * @c: Execution context + * @type: epoll type + * @f: Flow to register socket for + * @events: epoll events to watch for + * @fd: File descriptor to register + * @sidei: Side index of the flow + * + * Return: 0 on success, -1 on error (from epoll_ctl()) + */ +int flow_epoll_set(const struct ctx *c, enum epoll_type type, + const struct flow_common *f, uint32_t events, int fd, + unsigned int sidei) +{ + struct epoll_event ev; + union epoll_ref ref; + int epollfd; + int m; + + ref.fd = fd; + ref.type = type; + + switch (type) { + case EPOLL_TYPE_TCP_SPLICE: + case EPOLL_TYPE_TCP: + ref.flowside = flow_sidx(f, sidei); + if (flow_in_epoll(f)) { + m = EPOLL_CTL_MOD; + epollfd = flow_epollfd(f); + } else { + m = EPOLL_CTL_ADD; + epollfd = c->epollfd; + }
It looks like the handling of flow_in_epoll() / flow_epoll() should be able to be common between all these epoll types. I'm guessing that doesn't quite work, because flow_in_epoll() isn't yet updated properly for all the types. Might this be cleaner if that was fixed first?
+ break; + case EPOLL_TYPE_TCP_TIMER: { + const struct tcp_tap_conn *conn = &((union flow *)f)->tcp; + + ref.flow = flow_idx(f); + m = conn->timer == -1 ? EPOLL_CTL_ADD : EPOLL_CTL_MOD; + epollfd = flow_epollfd(f); + break; + } + case EPOLL_TYPE_PING: + ref.flowside = flow_sidx(f, sidei); + m = EPOLL_CTL_ADD; + epollfd = flow_epollfd(f); + break;
That would allow this case to be the same as the TCP ones.
+ case EPOLL_TYPE_UDP: { + union { + flow_sidx_t sidx; + uint32_t data; + } fref = { .sidx = flow_sidx(f, sidei) };
I'm not sure quite what this union is about. EPOLL_TYPE_UDP uses epoll_ref.flowside, same as the non-timer TCP types. epoll_ref.udp is only for udp EPOLL_TYPE_UDP_LISTEN. Arguably we should change the name to reflect that, but I'm working towards unifying the epoll_ref variants for the various listening sockets anyway.
+ + ref.data = fref.data; + m = EPOLL_CTL_ADD; + epollfd = flow_epollfd(f);
Given the above, this should be identical to the EPOLL_TYPE_PING case already, and apart from the EPOLL_CTL_ADD handling, it's identical to the non-timer TCP cases.
+ break; + } + default: + ASSERT(0); + } + + ev.events = events; + ev.data.u64 = ref.u64; + + return epoll_ctl(epollfd, m, fd, &ev); +} + /** * flow_epollid_register() - Initialize the epoll id -> fd mapping * @epollid: epoll id to associate to diff --git a/flow.h b/flow.h index b43b0b1dd7f2..388fab124238 100644 --- a/flow.h +++ b/flow.h @@ -265,6 +265,9 @@ bool flow_in_epoll(const struct flow_common *f); int flow_epollfd(const struct flow_common *f); void flow_epollid_set(struct flow_common *f, int epollid); void flow_epollid_clear(struct flow_common *f); +int flow_epoll_set(const struct ctx *c, enum epoll_type type, + const struct flow_common *f, uint32_t events, int fd, + unsigned int sidei); void flow_epollid_register(int epollid, int epollfd); void flow_defer_handler(const struct ctx *c, const struct timespec *now); int flow_migrate_source_early(struct ctx *c, const struct migrate_stage *stage, diff --git a/icmp.c b/icmp.c index 9564c4963f7b..235759567060 100644 --- a/icmp.c +++ b/icmp.c @@ -177,7 +177,6 @@ static struct icmp_ping_flow *icmp_ping_new(const struct ctx *c, union flow *flow = flow_alloc(); struct icmp_ping_flow *pingf; const struct flowside *tgt; - union epoll_ref ref;
if (!flow) return NULL; @@ -211,12 +210,8 @@ static struct icmp_ping_flow *icmp_ping_new(const struct ctx *c, goto cancel;
flow_epollid_set(&pingf->f, EPOLLFD_ID_DEFAULT); - - ref.type = EPOLL_TYPE_PING; - ref.flowside = FLOW_SIDX(flow, TGTSIDE); - ref.fd = pingf->sock; - - if (epoll_add(flow_epollfd(&pingf->f), EPOLLIN, ref) < 0) { + if (flow_epoll_set(c, EPOLL_TYPE_PING, &pingf->f, EPOLLIN, pingf->sock, + TGTSIDE) < 0) { close(pingf->sock); goto cancel; } diff --git a/tcp.c b/tcp.c index 6e6156098c12..2907af282551 100644 --- a/tcp.c +++ b/tcp.c @@ -530,14 +530,10 @@ static uint32_t tcp_conn_epoll_events(uint8_t events, uint8_t conn_flags) */ static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn) { - int m = flow_in_epoll(&conn->f) ? EPOLL_CTL_MOD : EPOLL_CTL_ADD; - union epoll_ref ref = { .type = EPOLL_TYPE_TCP, .fd = conn->sock, - .flowside = FLOW_SIDX(conn, !TAPSIDE(conn)), }; - struct epoll_event ev = { .data.u64 = ref.u64 }; - int epollfd = flow_in_epoll(&conn->f) ? flow_epollfd(&conn->f) - : c->epollfd; + uint32_t events;
if (conn->events == CLOSED) { + int epollfd = flow_in_epoll(&conn->f) ? flow_epollfd(&conn->f) : c->epollfd; if (flow_in_epoll(&conn->f)) epoll_del(epollfd, conn->sock); if (conn->timer != -1) @@ -545,22 +541,17 @@ static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn) return 0; }
- ev.events = tcp_conn_epoll_events(conn->events, conn->flags); + events = tcp_conn_epoll_events(conn->events, conn->flags);
- if (epoll_ctl(epollfd, m, conn->sock, &ev)) + if (flow_epoll_set(c, EPOLL_TYPE_TCP, &conn->f, events, conn->sock, + !TAPSIDE(conn))) return -errno;
flow_epollid_set(&conn->f, EPOLLFD_ID_DEFAULT);
if (conn->timer != -1) { - union epoll_ref ref_t = { .type = EPOLL_TYPE_TCP_TIMER, - .fd = conn->timer, - .flow = FLOW_IDX(conn) }; - struct epoll_event ev_t = { .data.u64 = ref_t.u64, - .events = EPOLLIN | EPOLLET }; - - if (epoll_ctl(flow_epollfd(&conn->f), EPOLL_CTL_MOD, - conn->timer, &ev_t)) + if (flow_epoll_set(c, EPOLL_TYPE_TCP_TIMER, &conn->f, + EPOLLIN | EPOLLET, conn->timer, 0)) return -errno; }
@@ -581,11 +572,6 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn) return;
if (conn->timer == -1) { - union epoll_ref ref = { .type = EPOLL_TYPE_TCP_TIMER, - .flow = FLOW_IDX(conn) }; - struct epoll_event ev = { .data.u64 = ref.u64, - .events = EPOLLIN | EPOLLET }; - int epollfd = flow_epollfd(&conn->f); int fd;
fd = timerfd_create(CLOCK_MONOTONIC, 0); @@ -593,18 +579,17 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn) flow_dbg_perror(conn, "failed to get timer"); if (fd > -1) close(fd); - conn->timer = -1; return; } - conn->timer = fd; - ref.fd = conn->timer;
- if (epoll_ctl(epollfd, EPOLL_CTL_ADD, conn->timer, &ev)) { + if (flow_epoll_set(c, EPOLL_TYPE_TCP_TIMER, &conn->f, + EPOLLIN | EPOLLET, fd, 0)) { flow_dbg_perror(conn, "failed to add timer"); - close(conn->timer); - conn->timer = -1; + close(fd); return; } + + conn->timer = fd; }
if (conn->flags & ACK_TO_TAP_DUE) { diff --git a/tcp_splice.c b/tcp_splice.c index bf4ff466de07..7367f435367b 100644 --- a/tcp_splice.c +++ b/tcp_splice.c @@ -143,24 +143,15 @@ static uint32_t tcp_splice_conn_epoll_events(uint16_t events, unsigned sidei) static int tcp_splice_epoll_ctl(const struct ctx *c, struct tcp_splice_conn *conn) { - int epollfd = flow_in_epoll(&conn->f) ? flow_epollfd(&conn->f) - : c->epollfd; - int m = flow_in_epoll(&conn->f) ? EPOLL_CTL_MOD : EPOLL_CTL_ADD; - const union epoll_ref ref[SIDES] = { - { .type = EPOLL_TYPE_TCP_SPLICE, .fd = conn->s[0], - .flowside = FLOW_SIDX(conn, 0) }, - { .type = EPOLL_TYPE_TCP_SPLICE, .fd = conn->s[1], - .flowside = FLOW_SIDX(conn, 1) } - }; - struct epoll_event ev[SIDES] = { { .data.u64 = ref[0].u64 }, - { .data.u64 = ref[1].u64 } }; - - ev[0].events = tcp_splice_conn_epoll_events(conn->events, 0); - ev[1].events = tcp_splice_conn_epoll_events(conn->events, 1); - - - if (epoll_ctl(epollfd, m, conn->s[0], &ev[0]) || - epoll_ctl(epollfd, m, conn->s[1], &ev[1])) { + uint32_t events[2]; + + events[0] = tcp_splice_conn_epoll_events(conn->events, 0); + events[1] = tcp_splice_conn_epoll_events(conn->events, 1); + + if (flow_epoll_set(c, EPOLL_TYPE_TCP_SPLICE, &conn->f, events[0], + conn->s[0], 0) || + flow_epoll_set(c, EPOLL_TYPE_TCP_SPLICE, &conn->f, events[1], + conn->s[1], 1)) { int ret = -errno; flow_perror(conn, "ERROR on epoll_ctl()"); return ret; diff --git a/udp_flow.c b/udp_flow.c index 33f29f21e69e..42f3622d621c 100644 --- a/udp_flow.c +++ b/udp_flow.c @@ -74,11 +74,6 @@ static int udp_flow_sock(const struct ctx *c, { const struct flowside *side = &uflow->f.side[sidei]; uint8_t pif = uflow->f.pif[sidei]; - union { - flow_sidx_t sidx; - uint32_t data; - } fref = { .sidx = FLOW_SIDX(uflow, sidei) }; - union epoll_ref ref; int rc; int s;
@@ -88,13 +83,8 @@ static int udp_flow_sock(const struct ctx *c, return s; }
- ref.type = EPOLL_TYPE_UDP; - ref.data = fref.data; - ref.fd = s; - flow_epollid_set(&uflow->f, EPOLLFD_ID_DEFAULT); - - rc = epoll_add(flow_epollfd(&uflow->f), EPOLLIN, ref); + rc = flow_epoll_set(c, EPOLL_TYPE_UDP, &uflow->f, EPOLLIN, s, sidei); if (rc < 0) { close(s); return rc; -- 2.51.1
-- David Gibson (he or they) | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you, not the other way | around. http://www.ozlabs.org/~dgibson