There are still some thing dispatched explicitly in tcp.c and tcp_splice.c that make more sense to dispatch at the flow level, even with the current "stub" version of the flow table. This should be independent of my other outstanding series. David Gibson (8): flow: Make flow_table.h #include the protocol specific headers it needs treewide: Standardise on 'now' for current timestamp variables tcp, tcp_splice: Remove redundant handling from tcp_timer() tcp, tcp_splice: Move per-type cleanup logic into per-type helpers flow, tcp: Add flow-centric dispatch for deferred flow handling flow, tcp: Add handling for per-flow timers epoll: Better handling of number of epoll types tcp, tcp_splice: Avoid double layered dispatch for connected TCP sockets flow.c | 33 +++++++++++++++++++- flow.h | 3 ++ flow_table.h | 2 ++ icmp.c | 12 ++++---- icmp.h | 2 +- log.c | 34 ++++++++++----------- passt.c | 18 ++++++++--- passt.h | 8 +++-- tcp.c | 86 +++++++++++----------------------------------------- tcp.h | 2 +- tcp_conn.h | 5 +-- tcp_splice.c | 33 ++++++++++---------- tcp_splice.h | 4 +-- udp.c | 16 +++++----- udp.h | 2 +- 15 files changed, 129 insertions(+), 131 deletions(-) -- 2.43.0
flow_table.h, the lower level flow header relies on having the struct definitions for every protocol specific flow type - so far that means tcp_conn.h. It doesn't include it itself, so tcp_conn.h must be included before flow_table.h. That's ok for now, but as we use the flow table for more things, flow_table.h will need the structs for all of them, which means the protocol specific .c files would need to include tcp_conn.h _and_ the equivalents for every other flow type before flow_table.h every time, which is weird. So, although we *mostly* lean towards the include style where .c files need to handle the include dependencies, in this case it makes more sense to have flow_table.h include all the protocol specific headers it needs. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- flow.c | 1 - flow_table.h | 2 ++ tcp.c | 1 - tcp_splice.c | 1 - 4 files changed, 2 insertions(+), 3 deletions(-) diff --git a/flow.c b/flow.c index d24726d..a1c0a34 100644 --- a/flow.c +++ b/flow.c @@ -14,7 +14,6 @@ #include "siphash.h" #include "inany.h" #include "flow.h" -#include "tcp_conn.h" #include "flow_table.h" const char *flow_type_str[] = { diff --git a/flow_table.h b/flow_table.h index 0dee66f..e805f10 100644 --- a/flow_table.h +++ b/flow_table.h @@ -7,6 +7,8 @@ #ifndef FLOW_TABLE_H #define FLOW_TABLE_H +#include "tcp_conn.h" + /** * union flow - Descriptor for a logical packet flow (e.g. connection) * @f: Fields common between all variants diff --git a/tcp.c b/tcp.c index f506cfd..ce53ee0 100644 --- a/tcp.c +++ b/tcp.c @@ -298,7 +298,6 @@ #include "inany.h" #include "flow.h" -#include "tcp_conn.h" #include "flow_table.h" /* Sides of a flow as we use them in "tap" connections */ diff --git a/tcp_splice.c b/tcp_splice.c index 69ea79d..052f989 100644 --- a/tcp_splice.c +++ b/tcp_splice.c @@ -56,7 +56,6 @@ #include "inany.h" #include "flow.h" -#include "tcp_conn.h" #include "flow_table.h" #define MAX_PIPE_SIZE (8UL * 1024 * 1024) -- 2.43.0
In a number of places we pass around a struct timespec representing the (more or less) current time. Sometimes we call it 'now', and sometimes we call it 'ts'. Standardise on the more informative 'now'. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- icmp.c | 12 ++++++------ icmp.h | 2 +- log.c | 34 +++++++++++++++++----------------- tcp.c | 6 +++--- tcp.h | 2 +- udp.c | 16 ++++++++-------- udp.h | 2 +- 7 files changed, 37 insertions(+), 37 deletions(-) diff --git a/icmp.c b/icmp.c index a1de8ae..a9fbcb2 100644 --- a/icmp.c +++ b/icmp.c @@ -290,14 +290,14 @@ fail_sock: * @c: Execution context * @v6: Set for IPv6 echo identifier bindings * @id: Echo identifier, host order - * @ts: Timestamp from caller + * @now: Current timestamp */ static void icmp_timer_one(const struct ctx *c, int v6, uint16_t id, - const struct timespec *ts) + const struct timespec *now) { struct icmp_id_sock *id_map = &icmp_id_map[v6 ? V6 : V4][id]; - if (ts->tv_sec - id_map->ts <= ICMP_ECHO_TIMEOUT) + if (now->tv_sec - id_map->ts <= ICMP_ECHO_TIMEOUT) return; bitmap_clear(icmp_act[v6 ? V6 : V4], id); @@ -311,9 +311,9 @@ static void icmp_timer_one(const struct ctx *c, int v6, uint16_t id, /** * icmp_timer() - Scan activity bitmap for identifiers with timed events * @c: Execution context - * @ts: Timestamp from caller + * @now: Current timestamp */ -void icmp_timer(const struct ctx *c, const struct timespec *ts) +void icmp_timer(const struct ctx *c, const struct timespec *now) { long *word, tmp; unsigned int i; @@ -325,7 +325,7 @@ v6: tmp = *word; while ((n = ffsl(tmp))) { tmp &= ~(1UL << (n - 1)); - icmp_timer_one(c, v6, i * 8 + n - 1, ts); + icmp_timer_one(c, v6, i * 8 + n - 1, now); } } diff --git a/icmp.h b/icmp.h index 44cc495..1a08594 100644 --- a/icmp.h +++ b/icmp.h @@ -15,7 +15,7 @@ void icmpv6_sock_handler(const struct ctx *c, union epoll_ref ref); int icmp_tap_handler(const struct ctx *c, uint8_t pif, int af, const void *saddr, const void *daddr, const struct pool *p, const struct timespec *now); -void icmp_timer(const struct ctx *c, const struct timespec *ts); +void icmp_timer(const struct ctx *c, const struct timespec *now); void icmp_init(void); /** diff --git a/log.c b/log.c index b206f72..d7f6d35 100644 --- a/log.c +++ b/log.c @@ -216,11 +216,11 @@ void logfile_init(const char *name, const char *path, size_t size) /** * logfile_rotate_fallocate() - Write header, set log_written after fallocate() * @fd: Log file descriptor - * @ts: Current timestamp + * @now: Current timestamp * * #syscalls lseek ppc64le:_llseek ppc64:_llseek armv6l:_llseek armv7l:_llseek */ -static void logfile_rotate_fallocate(int fd, const struct timespec *ts) +static void logfile_rotate_fallocate(int fd, const struct timespec *now) { char buf[BUFSIZ], *nl; int n; @@ -232,8 +232,8 @@ static void logfile_rotate_fallocate(int fd, const struct timespec *ts) n = snprintf(buf, BUFSIZ, "%s - log truncated at %lli.%04lli", log_header, - (long long int)(ts->tv_sec - log_start), - (long long int)(ts->tv_nsec / (100L * 1000))); + (long long int)(now->tv_sec - log_start), + (long long int)(now->tv_nsec / (100L * 1000))); /* Avoid partial lines by padding the header with spaces */ nl = memchr(buf + n + 1, '\n', BUFSIZ - n - 1); @@ -252,20 +252,20 @@ static void logfile_rotate_fallocate(int fd, const struct timespec *ts) /** * logfile_rotate_move() - Fallback: move recent entries toward start, then cut * @fd: Log file descriptor - * @ts: Current timestamp + * @now: Current timestamp * * #syscalls lseek ppc64le:_llseek ppc64:_llseek armv6l:_llseek armv7l:_llseek * #syscalls ftruncate */ -static void logfile_rotate_move(int fd, const struct timespec *ts) +static void logfile_rotate_move(int fd, const struct timespec *now) { int header_len, write_offset, end, discard, n; char buf[BUFSIZ], *nl; header_len = snprintf(buf, BUFSIZ, "%s - log truncated at %lli.%04lli\n", log_header, - (long long int)(ts->tv_sec - log_start), - (long long int)(ts->tv_nsec / (100L * 1000))); + (long long int)(now->tv_sec - log_start), + (long long int)(now->tv_nsec / (100L * 1000))); if (lseek(fd, 0, SEEK_SET) == -1) return; if (write(fd, buf, header_len) == -1) @@ -314,7 +314,7 @@ out: /** * logfile_rotate() - "Rotate" log file once it's full * @fd: Log file descriptor - * @ts: Current timestamp + * @now: Current timestamp * * Return: 0 on success, negative error code on failure * @@ -322,7 +322,7 @@ out: * * fallocate() passed as EXTRA_SYSCALL only if FALLOC_FL_COLLAPSE_RANGE is there */ -static int logfile_rotate(int fd, const struct timespec *ts) +static int logfile_rotate(int fd, const struct timespec *now) { if (fcntl(fd, F_SETFL, O_RDWR /* Drop O_APPEND: explicit lseek() */)) return -errno; @@ -330,10 +330,10 @@ static int logfile_rotate(int fd, const struct timespec *ts) #ifdef FALLOC_FL_COLLAPSE_RANGE /* Only for Linux >= 3.15, extent-based ext4 or XFS, glibc >= 2.18 */ if (!fallocate(fd, FALLOC_FL_COLLAPSE_RANGE, 0, log_cut_size)) - logfile_rotate_fallocate(fd, ts); + logfile_rotate_fallocate(fd, now); else #endif - logfile_rotate_move(fd, ts); + logfile_rotate_move(fd, now); if (fcntl(fd, F_SETFL, O_RDWR | O_APPEND)) return -errno; @@ -349,16 +349,16 @@ static int logfile_rotate(int fd, const struct timespec *ts) */ void logfile_write(int pri, const char *format, va_list ap) { - struct timespec ts; + struct timespec now; char buf[BUFSIZ]; int n; - if (clock_gettime(CLOCK_REALTIME, &ts)) + if (clock_gettime(CLOCK_REALTIME, &now)) return; n = snprintf(buf, BUFSIZ, "%lli.%04lli: %s", - (long long int)(ts.tv_sec - log_start), - (long long int)(ts.tv_nsec / (100L * 1000)), + (long long int)(now.tv_sec - log_start), + (long long int)(now.tv_nsec / (100L * 1000)), logfile_prefix[pri]); n += vsnprintf(buf + n, BUFSIZ - n, format, ap); @@ -366,7 +366,7 @@ void logfile_write(int pri, const char *format, va_list ap) if (format[strlen(format)] != '\n') n += snprintf(buf + n, BUFSIZ - n, "\n"); - if ((log_written + n >= log_size) && logfile_rotate(log_file, &ts)) + if ((log_written + n >= log_size) && logfile_rotate(log_file, &now)) return; if ((n = write(log_file, buf, n)) >= 0) diff --git a/tcp.c b/tcp.c index ce53ee0..a72a580 100644 --- a/tcp.c +++ b/tcp.c @@ -3173,13 +3173,13 @@ static int tcp_port_rebind_outbound(void *arg) /** * tcp_timer() - Periodic tasks: port detection, closed connections, pool refill * @c: Execution context - * @ts: Unused + * @now: Current timestamp */ -void tcp_timer(struct ctx *c, const struct timespec *ts) +void tcp_timer(struct ctx *c, const struct timespec *now) { union flow *flow; - (void)ts; + (void)now; if (c->mode == MODE_PASTA) { if (c->tcp.fwd_out.mode == FWD_AUTO) { diff --git a/tcp.h b/tcp.h index 27b1166..594d71a 100644 --- a/tcp.h +++ b/tcp.h @@ -20,7 +20,7 @@ int tcp_tap_handler(struct ctx *c, uint8_t pif, int af, int tcp_sock_init(const struct ctx *c, sa_family_t af, const void *addr, const char *ifname, in_port_t port); int tcp_init(struct ctx *c); -void tcp_timer(struct ctx *c, const struct timespec *ts); +void tcp_timer(struct ctx *c, const struct timespec *now); void tcp_defer_handler(struct ctx *c); void tcp_sock_set_bufsize(const struct ctx *c, int s); diff --git a/udp.c b/udp.c index 1f8c306..0b4582c 100644 --- a/udp.c +++ b/udp.c @@ -1140,10 +1140,10 @@ int udp_init(struct ctx *c) * @v6: Set for IPv6 connections * @type: Socket type * @port: Port number, host order - * @ts: Timestamp from caller + * @now: Current timestamp */ static void udp_timer_one(struct ctx *c, int v6, enum udp_act_type type, - in_port_t port, const struct timespec *ts) + in_port_t port, const struct timespec *now) { struct udp_splice_port *sp; struct udp_tap_port *tp; @@ -1153,7 +1153,7 @@ static void udp_timer_one(struct ctx *c, int v6, enum udp_act_type type, case UDP_ACT_TAP: tp = &udp_tap_map[v6 ? V6 : V4][port]; - if (ts->tv_sec - tp->ts > UDP_CONN_TIMEOUT) { + if (now->tv_sec - tp->ts > UDP_CONN_TIMEOUT) { sockp = &tp->sock; tp->flags = 0; } @@ -1162,14 +1162,14 @@ static void udp_timer_one(struct ctx *c, int v6, enum udp_act_type type, case UDP_ACT_SPLICE_INIT: sp = &udp_splice_init[v6 ? V6 : V4][port]; - if (ts->tv_sec - sp->ts > UDP_CONN_TIMEOUT) + if (now->tv_sec - sp->ts > UDP_CONN_TIMEOUT) sockp = &sp->sock; break; case UDP_ACT_SPLICE_NS: sp = &udp_splice_ns[v6 ? V6 : V4][port]; - if (ts->tv_sec - sp->ts > UDP_CONN_TIMEOUT) + if (now->tv_sec - sp->ts > UDP_CONN_TIMEOUT) sockp = &sp->sock; break; @@ -1249,9 +1249,9 @@ static int udp_port_rebind_outbound(void *arg) /** * udp_timer() - Scan activity bitmaps for ports with associated timed events * @c: Execution context - * @ts: Timestamp from caller + * @now: Current timestamp */ -void udp_timer(struct ctx *c, const struct timespec *ts) +void udp_timer(struct ctx *c, const struct timespec *now) { int n, t, v6 = 0; unsigned int i; @@ -1281,7 +1281,7 @@ v6: tmp = *word; while ((n = ffsl(tmp))) { tmp &= ~(1UL << (n - 1)); - udp_timer_one(c, v6, t, i * 8 + n - 1, ts); + udp_timer_one(c, v6, t, i * 8 + n - 1, now); } } } diff --git a/udp.h b/udp.h index 85ebaaa..087e482 100644 --- a/udp.h +++ b/udp.h @@ -17,7 +17,7 @@ int udp_tap_handler(struct ctx *c, uint8_t pif, int af, int udp_sock_init(const struct ctx *c, int ns, sa_family_t af, const void *addr, const char *ifname, in_port_t port); int udp_init(struct ctx *c); -void udp_timer(struct ctx *c, const struct timespec *ts); +void udp_timer(struct ctx *c, const struct timespec *now); void udp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s); /** -- 2.43.0
tcp_timer() scans the connection table, expiring "tap" connections and calling tcp_splice_timer() for "splice" connections. tcp_splice_timer() expires spliced connections and then does some other processing. However, tcp_timer() is always called shortly after tcp_defer_handler() (from post_handler()), which also scans the flow table expiring both tap and spliced connections. So remove the redundant handling, and only do the extra tcp_splice_timer() work from tcp_timer(). Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- tcp.c | 15 ++------------- tcp_conn.h | 2 +- tcp_splice.c | 7 ++----- 3 files changed, 5 insertions(+), 19 deletions(-) diff --git a/tcp.c b/tcp.c index a72a580..084100d 100644 --- a/tcp.c +++ b/tcp.c @@ -3193,20 +3193,9 @@ void tcp_timer(struct ctx *c, const struct timespec *now) } } - for (flow = flowtab + c->flow_count - 1; flow >= flowtab; flow--) { - switch (flow->f.type) { - case FLOW_TCP: - if (flow->tcp.events == CLOSED) - tcp_conn_destroy(c, flow); - break; - case FLOW_TCP_SPLICE: + for (flow = flowtab + c->flow_count - 1; flow >= flowtab; flow--) + if (flow->f.type == FLOW_TCP_SPLICE) tcp_splice_timer(c, flow); - break; - default: - die("Unexpected %s in tcp_timer()", - FLOW_TYPE(&flow->f)); - } - } tcp_sock_refill_init(c); if (c->mode == MODE_PASTA) diff --git a/tcp_conn.h b/tcp_conn.h index 3900305..07efe62 100644 --- a/tcp_conn.h +++ b/tcp_conn.h @@ -161,7 +161,7 @@ void tcp_tap_conn_update(const struct ctx *c, struct tcp_tap_conn *old, struct tcp_tap_conn *new); void tcp_splice_conn_update(const struct ctx *c, struct tcp_splice_conn *new); void tcp_splice_destroy(struct ctx *c, union flow *flow); -void tcp_splice_timer(struct ctx *c, union flow *flow); +void tcp_splice_timer(const struct ctx *c, union flow *flow); int tcp_conn_pool_sock(int pool[]); int tcp_conn_new_sock(const struct ctx *c, sa_family_t af); void tcp_sock_refill_pool(const struct ctx *c, int pool[], int af); diff --git a/tcp_splice.c b/tcp_splice.c index 052f989..cf9b4e8 100644 --- a/tcp_splice.c +++ b/tcp_splice.c @@ -755,15 +755,12 @@ void tcp_splice_init(struct ctx *c) * @c: Execution context * @flow: Flow table entry */ -void tcp_splice_timer(struct ctx *c, union flow *flow) +void tcp_splice_timer(const struct ctx *c, union flow *flow) { struct tcp_splice_conn *conn = &flow->tcp_splice; int side; - if (conn->flags & CLOSING) { - tcp_splice_destroy(c, flow); - return; - } + ASSERT(!(conn->flags & CLOSING)); for (side = 0; side < SIDES; side++) { uint8_t set = side == 0 ? RCVLOWAT_SET_0 : RCVLOWAT_SET_1; -- 2.43.0
tcp_conn_destroy() and tcp_splice_destroy() are always called conditionally on the connection being closed or closing. Move that logic into the "destroy" functions themselves, renaming them tcp_flow_defer() and tcp_splice_flow_defer(). Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- tcp.c | 13 +++++++------ tcp_conn.h | 2 +- tcp_splice.c | 9 ++++++--- 3 files changed, 14 insertions(+), 10 deletions(-) diff --git a/tcp.c b/tcp.c index 084100d..fc9176f 100644 --- a/tcp.c +++ b/tcp.c @@ -1300,14 +1300,17 @@ static struct tcp_tap_conn *tcp_hash_lookup(const struct ctx *c, } /** - * tcp_conn_destroy() - Close sockets, trigger hash table removal and compaction + * tcp_flow_defer() - Deferred per-flow handling (clean up closed connections) * @c: Execution context * @flow: Flow table entry for this connection */ -static void tcp_conn_destroy(struct ctx *c, union flow *flow) +static void tcp_flow_defer(struct ctx *c, union flow *flow) { const struct tcp_tap_conn *conn = &flow->tcp; + if (flow->tcp.events != CLOSED) + return; + close(conn->sock); if (conn->timer != -1) close(conn->timer); @@ -1369,12 +1372,10 @@ void tcp_defer_handler(struct ctx *c) for (flow = flowtab + c->flow_count - 1; flow >= flowtab; flow--) { switch (flow->f.type) { case FLOW_TCP: - if (flow->tcp.events == CLOSED) - tcp_conn_destroy(c, flow); + tcp_flow_defer(c, flow); break; case FLOW_TCP_SPLICE: - if (flow->tcp_splice.flags & CLOSING) - tcp_splice_destroy(c, flow); + tcp_splice_flow_defer(c, flow); break; default: die("Unexpected %s in tcp_defer_handler()", diff --git a/tcp_conn.h b/tcp_conn.h index 07efe62..75def3c 100644 --- a/tcp_conn.h +++ b/tcp_conn.h @@ -160,7 +160,7 @@ extern int init_sock_pool6 [TCP_SOCK_POOL_SIZE]; void tcp_tap_conn_update(const struct ctx *c, struct tcp_tap_conn *old, struct tcp_tap_conn *new); void tcp_splice_conn_update(const struct ctx *c, struct tcp_splice_conn *new); -void tcp_splice_destroy(struct ctx *c, union flow *flow); +void tcp_splice_flow_defer(struct ctx *c, union flow *flow); void tcp_splice_timer(const struct ctx *c, union flow *flow); int tcp_conn_pool_sock(int pool[]); int tcp_conn_new_sock(const struct ctx *c, sa_family_t af); diff --git a/tcp_splice.c b/tcp_splice.c index cf9b4e8..09aa20f 100644 --- a/tcp_splice.c +++ b/tcp_splice.c @@ -243,15 +243,18 @@ void tcp_splice_conn_update(const struct ctx *c, struct tcp_splice_conn *new) } /** - * tcp_splice_destroy() - Close spliced connection and pipes, clear + * tcp_splice_flow_defer() - Deferred per-flow handling (clean up closed) * @c: Execution context - * @flow: Flow table entry + * @flow: Flow table entry for this connection */ -void tcp_splice_destroy(struct ctx *c, union flow *flow) +void tcp_splice_flow_defer(struct ctx *c, union flow *flow) { struct tcp_splice_conn *conn = &flow->tcp_splice; unsigned side; + if (!(flow->tcp_splice.flags & CLOSING)) + return; + for (side = 0; side < SIDES; side++) { if (conn->events & SPLICE_ESTABLISHED) { /* Flushing might need to block: don't recycle them. */ -- 2.43.0
tcp_defer_handler(), amongst other things, scans the flow table and does some processing for each TCP connection. When we add other protocols to the flow table, they're likely to want some similar scanning. It makes more sense for cache friendliness to perform a single scan of the flow table and dispatch to the protocol specific handlers, rather than having each protocol separately scan the table. To that end, add a new flow_defer_handler() handling all flow-linked deferred operations. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- flow.c | 23 +++++++++++++++++++++++ flow.h | 1 + passt.c | 1 + tcp.c | 19 ++----------------- tcp_conn.h | 1 + 5 files changed, 28 insertions(+), 17 deletions(-) diff --git a/flow.c b/flow.c index a1c0a34..0a0402d 100644 --- a/flow.c +++ b/flow.c @@ -83,3 +83,26 @@ void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...) logmsg(pri, "Flow %u (%s): %s", flow_idx(f), FLOW_TYPE(f), msg); } + +/** + * flow_defer_handler() - Handler for per-flow deferred tasks + * @c: Execution context + */ +void flow_defer_handler(struct ctx *c) +{ + union flow *flow; + + for (flow = flowtab + c->flow_count - 1; flow >= flowtab; flow--) { + switch (flow->f.type) { + case FLOW_TCP: + tcp_flow_defer(c, flow); + break; + case FLOW_TCP_SPLICE: + tcp_splice_flow_defer(c, flow); + break; + default: + /* Assume other flow types don't need any handling */ + ; + } + } +} diff --git a/flow.h b/flow.h index c2a5190..4733671 100644 --- a/flow.h +++ b/flow.h @@ -56,6 +56,7 @@ static_assert(sizeof(flow_sidx_t) <= sizeof(uint32_t), union flow; void flow_table_compact(struct ctx *c, union flow *hole); +void flow_defer_handler(struct ctx *c); void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...) __attribute__((format(printf, 3, 4))); diff --git a/passt.c b/passt.c index 0246b04..5f72a28 100644 --- a/passt.c +++ b/passt.c @@ -103,6 +103,7 @@ static void post_handler(struct ctx *c, const struct timespec *now) /* NOLINTNEXTLINE(bugprone-branch-clone): intervals can be the same */ CALL_PROTO_HANDLER(c, now, icmp, ICMP); + flow_defer_handler(c); #undef CALL_PROTO_HANDLER } diff --git a/tcp.c b/tcp.c index fc9176f..0e8e9e3 100644 --- a/tcp.c +++ b/tcp.c @@ -1304,7 +1304,7 @@ static struct tcp_tap_conn *tcp_hash_lookup(const struct ctx *c, * @c: Execution context * @flow: Flow table entry for this connection */ -static void tcp_flow_defer(struct ctx *c, union flow *flow) +void tcp_flow_defer(struct ctx *c, union flow *flow) { const struct tcp_tap_conn *conn = &flow->tcp; @@ -1362,26 +1362,11 @@ static void tcp_l2_data_buf_flush(const struct ctx *c) * tcp_defer_handler() - Handler for TCP deferred tasks * @c: Execution context */ +/* cppcheck-suppress constParameterPointer */ void tcp_defer_handler(struct ctx *c) { - union flow *flow; - tcp_l2_flags_buf_flush(c); tcp_l2_data_buf_flush(c); - - for (flow = flowtab + c->flow_count - 1; flow >= flowtab; flow--) { - switch (flow->f.type) { - case FLOW_TCP: - tcp_flow_defer(c, flow); - break; - case FLOW_TCP_SPLICE: - tcp_splice_flow_defer(c, flow); - break; - default: - die("Unexpected %s in tcp_defer_handler()", - FLOW_TYPE(&flow->f)); - } - } } /** diff --git a/tcp_conn.h b/tcp_conn.h index 75def3c..99a3b3b 100644 --- a/tcp_conn.h +++ b/tcp_conn.h @@ -160,6 +160,7 @@ extern int init_sock_pool6 [TCP_SOCK_POOL_SIZE]; void tcp_tap_conn_update(const struct ctx *c, struct tcp_tap_conn *old, struct tcp_tap_conn *new); void tcp_splice_conn_update(const struct ctx *c, struct tcp_splice_conn *new); +void tcp_flow_defer(struct ctx *c, union flow *flow); void tcp_splice_flow_defer(struct ctx *c, union flow *flow); void tcp_splice_timer(const struct ctx *c, union flow *flow); int tcp_conn_pool_sock(int pool[]); -- 2.43.0
tcp_timer() scans the flow table so that it can run tcp_splice_timer() on each spliced connection. More generally, other flow types might want to run similar timers in future. We could add a flow_timer() analagous to tcp_timer(), udp_timer() etc. However, this would need to scan the flow table, which we would have just done in flow_defer_handler(). We'd prefer to just scan the flow table once, dispatching both per-flow deferred events and per-flow timed events if necessary. So, extend flow_defer_handler() to do this. For now we use the same timer interval for all flow types (1s). We can make that more flexible in future if we need to. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- flow.c | 13 +++++++++++-- flow.h | 4 +++- passt.c | 7 ++++--- passt.h | 2 ++ tcp.c | 6 ------ 5 files changed, 20 insertions(+), 12 deletions(-) diff --git a/flow.c b/flow.c index 0a0402d..39166ae 100644 --- a/flow.c +++ b/flow.c @@ -85,13 +85,20 @@ void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...) } /** - * flow_defer_handler() - Handler for per-flow deferred tasks + * flow_defer_handler() - Handler for per-flow deferred and timed tasks * @c: Execution context + * @now: Current timestamp */ -void flow_defer_handler(struct ctx *c) +void flow_defer_handler(struct ctx *c, const struct timespec *now) { + bool timer = false; union flow *flow; + if (timespec_diff_ms(now, &c->flow_timer_run) >= FLOW_TIMER_INTERVAL) { + timer = true; + c->flow_timer_run = *now; + } + for (flow = flowtab + c->flow_count - 1; flow >= flowtab; flow--) { switch (flow->f.type) { case FLOW_TCP: @@ -99,6 +106,8 @@ void flow_defer_handler(struct ctx *c) break; case FLOW_TCP_SPLICE: tcp_splice_flow_defer(c, flow); + if (timer) + tcp_splice_timer(c, flow); break; default: /* Assume other flow types don't need any handling */ diff --git a/flow.h b/flow.h index 4733671..c368a69 100644 --- a/flow.h +++ b/flow.h @@ -7,6 +7,8 @@ #ifndef FLOW_H #define FLOW_H +#define FLOW_TIMER_INTERVAL 1000 /* ms */ + /** * enum flow_type - Different types of packet flows we track */ @@ -56,7 +58,7 @@ static_assert(sizeof(flow_sidx_t) <= sizeof(uint32_t), union flow; void flow_table_compact(struct ctx *c, union flow *hole); -void flow_defer_handler(struct ctx *c); +void flow_defer_handler(struct ctx *c, const struct timespec *now); void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...) __attribute__((format(printf, 3, 4))); diff --git a/passt.c b/passt.c index 5f72a28..870064f 100644 --- a/passt.c +++ b/passt.c @@ -53,8 +53,9 @@ #define EPOLL_EVENTS 8 -#define __TIMER_INTERVAL MIN(TCP_TIMER_INTERVAL, UDP_TIMER_INTERVAL) -#define TIMER_INTERVAL MIN(__TIMER_INTERVAL, ICMP_TIMER_INTERVAL) +#define TIMER_INTERVAL__ MIN(TCP_TIMER_INTERVAL, UDP_TIMER_INTERVAL) +#define TIMER_INTERVAL_ MIN(TIMER_INTERVAL__, ICMP_TIMER_INTERVAL) +#define TIMER_INTERVAL MIN(TIMER_INTERVAL_, FLOW_TIMER_INTERVAL) char pkt_buf[PKT_BUF_BYTES] __attribute__ ((aligned(PAGE_SIZE))); @@ -103,7 +104,7 @@ static void post_handler(struct ctx *c, const struct timespec *now) /* NOLINTNEXTLINE(bugprone-branch-clone): intervals can be the same */ CALL_PROTO_HANDLER(c, now, icmp, ICMP); - flow_defer_handler(c); + flow_defer_handler(c, now); #undef CALL_PROTO_HANDLER } diff --git a/passt.h b/passt.h index c74887a..20528d5 100644 --- a/passt.h +++ b/passt.h @@ -223,6 +223,7 @@ struct ip6_ctx { * @no_copy_routes: Don't copy all routes when configuring target namespace * @no_copy_addrs: Don't copy all addresses when configuring namespace * @flow_count: Number of tracked packet flows (connections etc.) + * @flow_timer_run: Timestamp of most recent flow timer run * @no_tcp: Disable TCP operation * @tcp: Context for TCP protocol handler * @no_tcp: Disable UDP operation @@ -283,6 +284,7 @@ struct ctx { int no_copy_addrs; unsigned flow_count; + struct timespec flow_timer_run; int no_tcp; struct tcp_ctx tcp; diff --git a/tcp.c b/tcp.c index 0e8e9e3..95ec760 100644 --- a/tcp.c +++ b/tcp.c @@ -3163,8 +3163,6 @@ static int tcp_port_rebind_outbound(void *arg) */ void tcp_timer(struct ctx *c, const struct timespec *now) { - union flow *flow; - (void)now; if (c->mode == MODE_PASTA) { @@ -3179,10 +3177,6 @@ void tcp_timer(struct ctx *c, const struct timespec *now) } } - for (flow = flowtab + c->flow_count - 1; flow >= flowtab; flow--) - if (flow->f.type == FLOW_TCP_SPLICE) - tcp_splice_timer(c, flow); - tcp_sock_refill_init(c); if (c->mode == MODE_PASTA) tcp_splice_refill(c); -- 2.43.0
As we already did for flow types, use an "EPOLL_NUM_TYPES" isntead of EPOLL_TYPE_MAX, which is a little bit safer and clearer. Add a static assert on the size of the matching names array. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- passt.c | 4 +++- passt.h | 4 ++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/passt.c b/passt.c index 870064f..a37a2f4 100644 --- a/passt.c +++ b/passt.c @@ -59,7 +59,7 @@ char pkt_buf[PKT_BUF_BYTES] __attribute__ ((aligned(PAGE_SIZE))); -char *epoll_type_str[EPOLL_TYPE_MAX + 1] = { +char *epoll_type_str[] = { [EPOLL_TYPE_TCP] = "connected TCP socket", [EPOLL_TYPE_TCP_LISTEN] = "listening TCP socket", [EPOLL_TYPE_TCP_TIMER] = "TCP timer", @@ -71,6 +71,8 @@ char *epoll_type_str[EPOLL_TYPE_MAX + 1] = { [EPOLL_TYPE_TAP_PASST] = "connected qemu socket", [EPOLL_TYPE_TAP_LISTEN] = "listening qemu socket", }; +static_assert(ARRAY_SIZE(epoll_type_str) == EPOLL_NUM_TYPES, + "epoll_type_str[] doesn't match enum epoll_type"); /** * post_handler() - Run periodic and deferred tasks for L4 protocol handlers diff --git a/passt.h b/passt.h index 20528d5..188ad2c 100644 --- a/passt.h +++ b/passt.h @@ -70,7 +70,7 @@ enum epoll_type { /* socket listening for qemu socket connections */ EPOLL_TYPE_TAP_LISTEN, - EPOLL_TYPE_MAX = EPOLL_TYPE_TAP_LISTEN, + EPOLL_NUM_TYPES, }; /** @@ -115,7 +115,7 @@ extern char pkt_buf [PKT_BUF_BYTES]; extern char *epoll_type_str[]; #define EPOLL_TYPE_STR(n) \ - (((uint8_t)(n) <= EPOLL_TYPE_MAX && epoll_type_str[(n)]) ? \ + (((uint8_t)(n) < EPOLL_NUM_TYPES && epoll_type_str[(n)]) ? \ epoll_type_str[(n)] : "?") #include <resolv.h> /* For MAXNS below */ -- 2.43.0
Currently connected TCP sockets have the same epoll type, whether they're for a "tap" connection or a spliced connection. This means that tcp_sock_handler() has to do a secondary check on the type of the connection to call the right function. We can avoid this by adding a new epoll type and dispatching directly to the right thing. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- passt.c | 8 ++++++-- passt.h | 2 ++ tcp.c | 36 ++++++++---------------------------- tcp_splice.c | 16 +++++++++------- tcp_splice.h | 4 ++-- 5 files changed, 27 insertions(+), 39 deletions(-) diff --git a/passt.c b/passt.c index a37a2f4..71bea8f 100644 --- a/passt.c +++ b/passt.c @@ -50,6 +50,7 @@ #include "pasta.h" #include "arch.h" #include "log.h" +#include "tcp_splice.h" #define EPOLL_EVENTS 8 @@ -61,6 +62,7 @@ char pkt_buf[PKT_BUF_BYTES] __attribute__ ((aligned(PAGE_SIZE))); char *epoll_type_str[] = { [EPOLL_TYPE_TCP] = "connected TCP socket", + [EPOLL_TYPE_TCP_SPLICE] = "connected spliced TCP socket", [EPOLL_TYPE_TCP_LISTEN] = "listening TCP socket", [EPOLL_TYPE_TCP_TIMER] = "TCP timer", [EPOLL_TYPE_UDP] = "UDP socket", @@ -373,8 +375,10 @@ loop: pasta_netns_quit_handler(&c, quit_fd); break; case EPOLL_TYPE_TCP: - if (!c.no_tcp) - tcp_sock_handler(&c, ref, eventmask); + tcp_sock_handler(&c, ref, eventmask); + break; + case EPOLL_TYPE_TCP_SPLICE: + tcp_splice_sock_handler(&c, ref, eventmask); break; case EPOLL_TYPE_TCP_LISTEN: tcp_listen_handler(&c, ref, &now); diff --git a/passt.h b/passt.h index 188ad2c..bd7e275 100644 --- a/passt.h +++ b/passt.h @@ -51,6 +51,8 @@ enum epoll_type { EPOLL_TYPE_NONE = 0, /* Connected TCP sockets */ EPOLL_TYPE_TCP, + /* Connected TCP sockets (spliced) */ + EPOLL_TYPE_TCP_SPLICE, /* Listening TCP sockets */ EPOLL_TYPE_TCP_LISTEN, /* timerfds used for TCP timers */ diff --git a/tcp.c b/tcp.c index 95ec760..de40cf0 100644 --- a/tcp.c +++ b/tcp.c @@ -2801,14 +2801,18 @@ void tcp_timer_handler(struct ctx *c, union epoll_ref ref) } /** - * tcp_tap_sock_handler() - Handle new data from non-spliced socket + * tcp_sock_handler() - Handle new data from non-spliced socket * @c: Execution context - * @conn: Connection state + * @ref: epoll reference * @events: epoll events bitmap */ -static void tcp_tap_sock_handler(struct ctx *c, struct tcp_tap_conn *conn, - uint32_t events) +void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events) { + struct tcp_tap_conn *conn = CONN(ref.flowside.flow); + + ASSERT(conn->f.type == FLOW_TCP); + ASSERT(ref.flowside.side == SOCKSIDE); + if (conn->events == CLOSED) return; @@ -2855,30 +2859,6 @@ static void tcp_tap_sock_handler(struct ctx *c, struct tcp_tap_conn *conn, } } -/** - * tcp_sock_handler() - Handle new data from socket, or timerfd event - * @c: Execution context - * @ref: epoll reference - * @events: epoll events bitmap - */ -void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events) -{ - union flow *flow = FLOW(ref.flowside.flow); - - switch (flow->f.type) { - case FLOW_TCP: - tcp_tap_sock_handler(c, &flow->tcp, events); - break; - case FLOW_TCP_SPLICE: - tcp_splice_sock_handler(c, &flow->tcp_splice, ref.flowside.side, - events); - break; - default: - die("Unexpected %s in tcp_sock_handler_compact()", - FLOW_TYPE(&flow->f)); - } -} - /** * tcp_sock_init_af() - Initialise listening socket for a given af and port * @c: Execution context diff --git a/tcp_splice.c b/tcp_splice.c index 09aa20f..9b1d331 100644 --- a/tcp_splice.c +++ b/tcp_splice.c @@ -127,9 +127,9 @@ static int tcp_splice_epoll_ctl(const struct ctx *c, { int m = conn->in_epoll ? EPOLL_CTL_MOD : EPOLL_CTL_ADD; union epoll_ref ref[SIDES] = { - { .type = EPOLL_TYPE_TCP, .fd = conn->s[0], + { .type = EPOLL_TYPE_TCP_SPLICE, .fd = conn->s[0], .flowside = FLOW_SIDX(conn, 0) }, - { .type = EPOLL_TYPE_TCP, .fd = conn->s[1], + { .type = EPOLL_TYPE_TCP_SPLICE, .fd = conn->s[1], .flowside = FLOW_SIDX(conn, 1) } }; struct epoll_event ev[SIDES] = { { .data.u64 = ref[0].u64 }, @@ -484,18 +484,20 @@ bool tcp_splice_conn_from_sock(const struct ctx *c, /** * tcp_splice_sock_handler() - Handler for socket mapped to spliced connection * @c: Execution context - * @conn: Connection state - * @side: Side of the connection on which an event has occurred + * @ref: epoll reference * @events: epoll events bitmap * * #syscalls:pasta splice */ -void tcp_splice_sock_handler(struct ctx *c, struct tcp_splice_conn *conn, - int side, uint32_t events) +void tcp_splice_sock_handler(struct ctx *c, union epoll_ref ref, + uint32_t events) { + struct tcp_splice_conn *conn = CONN(ref.flowside.flow); + unsigned side = ref.flowside.side, fromside; uint8_t lowat_set_flag, lowat_act_flag; int eof, never_read; - unsigned fromside; + + ASSERT(conn->f.type == FLOW_TCP_SPLICE); if (conn->events == SPLICE_CLOSED) return; diff --git a/tcp_splice.h b/tcp_splice.h index aa85c7c..18193e4 100644 --- a/tcp_splice.h +++ b/tcp_splice.h @@ -8,8 +8,8 @@ struct tcp_splice_conn; -void tcp_splice_sock_handler(struct ctx *c, struct tcp_splice_conn *conn, - int side, uint32_t events); +void tcp_splice_sock_handler(struct ctx *c, union epoll_ref ref, + uint32_t events); bool tcp_splice_conn_from_sock(const struct ctx *c, union tcp_listen_epoll_ref ref, struct tcp_splice_conn *conn, int s, -- 2.43.0