On Fri, Oct 03, 2025 at 05:27:15PM +0200, Laurent Vivier wrote:
The in_epoll boolean flag in tcp_tap_conn and tcp_splice_conn only tracked whether a connection was registered with epoll, not which epoll instance. This limited flexibility for future multi-epoll support.
Replace the boolean with an epollfd field in flow_common that serves dual purpose: zero indicates not registered (replacing in_epoll=false), non-zero
Don't use 0, since that's a valid fd.
stores the actual epoll fd (replacing in_epoll=true).
I am a bit nervous about adding 31 bits to every flow, since I think we're fairly close to a cacheline threshold. I'm not sure we really can add any less to flow_common, though, given alignment. Then again... we probably don't need 8 bits each for TYPE and STATE, so those could be packed tighter. Then we could use a limited-bits index into a table of epollfds, rather than a raw fd. Much uglier, but maybe worth it?
This change also simplifies tcp_timer_ctl() by removing the need to pass the context 'c', since the epoll fd is now directly accessible from the connection structure.
Signed-off-by: Laurent Vivier
--- flow.c | 2 +- flow.h | 2 ++ tcp.c | 36 ++++++++++++++++++------------------ tcp_conn.h | 8 +------- tcp_splice.c | 23 +++++++++++------------ 5 files changed, 33 insertions(+), 38 deletions(-) diff --git a/flow.c b/flow.c index b14e9d8b63ff..7c61ee87ae9d 100644 --- a/flow.c +++ b/flow.c @@ -827,7 +827,7 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now) case FLOW_TCP_SPLICE: closed = tcp_splice_flow_defer(&flow->tcp_splice); if (!closed && timer) - tcp_splice_timer(c, &flow->tcp_splice); + tcp_splice_timer(&flow->tcp_splice); break; case FLOW_PING4: case FLOW_PING6: diff --git a/flow.h b/flow.h index ef138b83add8..592d9e3792f6 100644 --- a/flow.h +++ b/flow.h @@ -175,6 +175,7 @@ int flowside_connect(const struct ctx *c, int s, * struct flow_common - Common fields for packet flows * @state: State of the flow table entry * @type: Type of packet flow + * @epollfd: epoll instance flow is registered with (0 if not registered) * @pif[]: Interface for each side of the flow * @side[]: Information for each side of the flow */ @@ -190,6 +191,7 @@ struct flow_common { static_assert(sizeof(uint8_t) * 8 >= FLOW_TYPE_BITS, "Not enough bits for type field"); #endif + int epollfd;
This should go after pif[] - it's a less logical order, but it will save 2 bytes of alignment padding.
uint8_t pif[SIDES]; struct flowside side[SIDES]; }; diff --git a/tcp.c b/tcp.c index 04725deabb65..c995b40f38f8 100644 --- a/tcp.c +++ b/tcp.c @@ -504,25 +504,26 @@ static uint32_t tcp_conn_epoll_events(uint8_t events, uint8_t conn_flags) */ static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn) { - int m = conn->in_epoll ? EPOLL_CTL_MOD : EPOLL_CTL_ADD; + int m = conn->f.epollfd ? EPOLL_CTL_MOD : EPOLL_CTL_ADD; union epoll_ref ref = { .type = EPOLL_TYPE_TCP, .fd = conn->sock, .flowside = FLOW_SIDX(conn, !TAPSIDE(conn)), }; struct epoll_event ev = { .data.u64 = ref.u64 }; + int epollfd = conn->f.epollfd ? conn->f.epollfd : c->epollfd;
if (conn->events == CLOSED) { - if (conn->in_epoll) - epoll_del(c->epollfd, conn->sock); + if (conn->f.epollfd) + epoll_del(epollfd, conn->sock); if (conn->timer != -1) - epoll_del(c->epollfd, conn->timer); + epoll_del(epollfd, conn->timer); return 0; }
ev.events = tcp_conn_epoll_events(conn->events, conn->flags);
- if (epoll_ctl(c->epollfd, m, conn->sock, &ev)) + if (epoll_ctl(epollfd, m, conn->sock, &ev)) return -errno;
- conn->in_epoll = true; + conn->f.epollfd = epollfd;
if (conn->timer != -1) { union epoll_ref ref_t = { .type = EPOLL_TYPE_TCP_TIMER, @@ -531,7 +532,7 @@ static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn) struct epoll_event ev_t = { .data.u64 = ref_t.u64, .events = EPOLLIN | EPOLLET };
- if (epoll_ctl(c->epollfd, EPOLL_CTL_MOD, conn->timer, &ev_t)) + if (epoll_ctl(conn->f.epollfd, EPOLL_CTL_MOD, conn->timer, &ev_t)) return -errno; }
@@ -540,12 +541,11 @@ static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
/** * tcp_timer_ctl() - Set timerfd based on flags/events, create timerfd if needed - * @c: Execution context * @conn: Connection pointer * * #syscalls timerfd_create timerfd_settime */ -static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn) +static void tcp_timer_ctl(struct tcp_tap_conn *conn) { struct itimerspec it = { { 0 }, { 0 } };
@@ -570,7 +570,7 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn) } conn->timer = fd;
- if (epoll_ctl(c->epollfd, EPOLL_CTL_ADD, conn->timer, &ev)) { + if (epoll_ctl(conn->f.epollfd, EPOLL_CTL_ADD, conn->timer, &ev)) { flow_dbg_perror(conn, "failed to add timer"); close(conn->timer); conn->timer = -1; @@ -628,7 +628,7 @@ void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn, * flags and factor this into the logic below. */ if (flag == ACK_FROM_TAP_DUE) - tcp_timer_ctl(c, conn); + tcp_timer_ctl(conn);
return; } @@ -644,7 +644,7 @@ void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn, if (flag == ACK_FROM_TAP_DUE || flag == ACK_TO_TAP_DUE || (flag == ~ACK_FROM_TAP_DUE && (conn->flags & ACK_TO_TAP_DUE)) || (flag == ~ACK_TO_TAP_DUE && (conn->flags & ACK_FROM_TAP_DUE))) - tcp_timer_ctl(c, conn); + tcp_timer_ctl(conn); }
/** @@ -699,7 +699,7 @@ void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn, tcp_epoll_ctl(c, conn);
if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED)) - tcp_timer_ctl(c, conn); + tcp_timer_ctl(conn); }
/** @@ -1732,7 +1732,7 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn, seq, conn->seq_from_tap);
tcp_send_flag(c, conn, ACK); - tcp_timer_ctl(c, conn); + tcp_timer_ctl(conn);
if (p->count == 1) { tcp_tap_window_update(c, conn, @@ -2375,7 +2375,7 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref)
if (conn->flags & ACK_TO_TAP_DUE) { tcp_send_flag(c, conn, ACK_IF_NEEDED); - tcp_timer_ctl(c, conn); + tcp_timer_ctl(conn); } else if (conn->flags & ACK_FROM_TAP_DUE) { if (!(conn->events & ESTABLISHED)) { flow_dbg(conn, "handshake timeout"); @@ -2397,7 +2397,7 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref) return;
tcp_data_from_sock(c, conn); - tcp_timer_ctl(c, conn); + tcp_timer_ctl(conn); } } else { struct itimerspec new = { { 0 }, { ACT_TIMEOUT, 0 } }; @@ -3445,7 +3445,7 @@ int tcp_flow_migrate_source_ext(const struct ctx *c, if (c->migrate_no_linger) close(s); else - epoll_del(c->epollfd, s); + epoll_del(conn->f.epollfd, s);
/* Adjustments unrelated to FIN segments: sequence numbers we dumped are * based on the end of the queues. @@ -3594,7 +3594,7 @@ static int tcp_flow_repair_connect(const struct ctx *c, return rc; }
- conn->in_epoll = 0; + conn->f.epollfd = 0; conn->timer = -1; conn->listening_sock = -1;
diff --git a/tcp_conn.h b/tcp_conn.h index 38b5c541f003..81333122d531 100644 --- a/tcp_conn.h +++ b/tcp_conn.h @@ -12,7 +12,6 @@ /** * struct tcp_tap_conn - Descriptor for a TCP connection (not spliced) * @f: Generic flow information - * @in_epoll: Is the connection in the epoll set? * @retrans: Number of retransmissions occurred due to ACK_TIMEOUT * @ws_from_tap: Window scaling factor advertised from tap/guest * @ws_to_tap: Window scaling factor advertised to tap/guest @@ -36,8 +35,6 @@ struct tcp_tap_conn { /* Must be first element */ struct flow_common f;
- bool in_epoll :1; - #define TCP_RETRANS_BITS 3 unsigned int retrans :TCP_RETRANS_BITS; #define TCP_MAX_RETRANS MAX_FROM_BITS(TCP_RETRANS_BITS) @@ -196,7 +193,6 @@ struct tcp_tap_transfer_ext { * @written: Bytes written (not fully written from one other side read) * @events: Events observed/actions performed on connection * @flags: Connection flags (attributes, not events) - * @in_epoll: Is the connection in the epoll set? */ struct tcp_splice_conn { /* Must be first element */ @@ -220,8 +216,6 @@ struct tcp_splice_conn { #define RCVLOWAT_SET(sidei_) ((sidei_) ? BIT(1) : BIT(0)) #define RCVLOWAT_ACT(sidei_) ((sidei_) ? BIT(3) : BIT(2)) #define CLOSING BIT(4) - - bool in_epoll :1; };
/* Socket pools */ @@ -245,7 +239,7 @@ int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd bool tcp_flow_is_established(const struct tcp_tap_conn *conn);
bool tcp_splice_flow_defer(struct tcp_splice_conn *conn); -void tcp_splice_timer(const struct ctx *c, struct tcp_splice_conn *conn); +void tcp_splice_timer(struct tcp_splice_conn *conn); int tcp_conn_pool_sock(int pool[]); int tcp_conn_sock(sa_family_t af); int tcp_sock_refill_pool(int pool[], sa_family_t af); diff --git a/tcp_splice.c b/tcp_splice.c index 666ee62b738f..49fb43473de6 100644 --- a/tcp_splice.c +++ b/tcp_splice.c @@ -149,7 +149,7 @@ static void tcp_splice_conn_epoll_events(uint16_t events, static int tcp_splice_epoll_ctl(const struct ctx *c, struct tcp_splice_conn *conn) { - int m = conn->in_epoll ? EPOLL_CTL_MOD : EPOLL_CTL_ADD; + int m = conn->f.epollfd ? EPOLL_CTL_MOD : EPOLL_CTL_ADD; const union epoll_ref ref[SIDES] = { { .type = EPOLL_TYPE_TCP_SPLICE, .fd = conn->s[0], .flowside = FLOW_SIDX(conn, 0) }, @@ -158,28 +158,28 @@ static int tcp_splice_epoll_ctl(const struct ctx *c, }; struct epoll_event ev[SIDES] = { { .data.u64 = ref[0].u64 }, { .data.u64 = ref[1].u64 } }; + int epollfd = conn->f.epollfd ? conn->f.epollfd : c->epollfd;
tcp_splice_conn_epoll_events(conn->events, ev);
- if (epoll_ctl(c->epollfd, m, conn->s[0], &ev[0]) || - epoll_ctl(c->epollfd, m, conn->s[1], &ev[1])) { + + if (epoll_ctl(epollfd, m, conn->s[0], &ev[0]) || + epoll_ctl(epollfd, m, conn->s[1], &ev[1])) { int ret = -errno; flow_perror(conn, "ERROR on epoll_ctl()"); return ret; } - - conn->in_epoll = true; + conn->f.epollfd = epollfd;
return 0; }
/** * conn_flag_do() - Set/unset given flag, log, update epoll on CLOSING flag - * @c: Execution context * @conn: Connection pointer * @flag: Flag to set, or ~flag to unset */ -static void conn_flag_do(const struct ctx *c, struct tcp_splice_conn *conn, +static void conn_flag_do(struct tcp_splice_conn *conn, unsigned long flag) { if (flag & (flag - 1)) { @@ -204,15 +204,15 @@ static void conn_flag_do(const struct ctx *c, struct tcp_splice_conn *conn, }
if (flag == CLOSING) { - epoll_del(c->epollfd, conn->s[0]); - epoll_del(c->epollfd, conn->s[1]); + epoll_del(conn->f.epollfd, conn->s[0]); + epoll_del(conn->f.epollfd, conn->s[1]); } }
#define conn_flag(c, conn, flag) \ do { \ flow_trace(conn, "flag at %s:%i", __func__, __LINE__); \ - conn_flag_do(c, conn, flag); \ + conn_flag_do(conn, flag); \ } while (0)
/** @@ -751,10 +751,9 @@ void tcp_splice_init(struct ctx *c)
/** * tcp_splice_timer() - Timer for spliced connections - * @c: Execution context * @conn: Connection to handle */ -void tcp_splice_timer(const struct ctx *c, struct tcp_splice_conn *conn) +void tcp_splice_timer(struct tcp_splice_conn *conn) { unsigned sidei;
-- 2.50.1
-- David Gibson (he or they) | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you, not the other way | around. http://www.ozlabs.org/~dgibson