There are a number of things that are more-or-less general to flows which are still explicitly handled in tcp.c and tcp_splice.c including allocation and freeing of flow entries, and dispatch of deferred and timer functions. Even without adding more fields to the common flow structure, we can handle a number of these in a more flow-centric way. Unlike v1 this version is based on the hash table rework series. Changes since v3: * Rebased on current tree, plus patches for some test failures * Renamed "free blocks" to "free clusters" in flow allocation scheme * Fixed bug where we wouldn't properly merge adjacent free clusters together during deferred handling * Improved theory of operation comments about flow allocation scheme Changes since v2: * Realised the prealloc/commit functions where confusing and worked poorly for some future stuff. Replaced with alloc/alloc_cancel * Fixed a bug where newly allocated flow entries might not be 0-filled, because of the free tracking information in there. This could cause very subtle problems. Changes since v1: * Store the timestamp of last flow timers run in a global, rather than a ctx field * Rebased on the TCP hash table rework * Add patches 9..13/13 with changes to allocation and freeing of flow entries. David Gibson (13): flow: Make flow_table.h #include the protocol specific headers it needs treewide: Standardise on 'now' for current timestamp variables tcp, tcp_splice: Remove redundant handling from tcp_timer() tcp, tcp_splice: Move per-type cleanup logic into per-type helpers flow, tcp: Add flow-centric dispatch for deferred flow handling flow, tcp: Add handling for per-flow timers epoll: Better handling of number of epoll types tcp, tcp_splice: Avoid double layered dispatch for connected TCP sockets flow: Move flow_log_() to near top of flow.c flow: Move flow_count from context structure to a global flow: Abstract allocation of new flows with helper function flow: Enforce that freeing of closed flows must happen in deferred handlers flow: Avoid moving flow entries to compact table flow.c | 235 +++++++++++++++++++++++++++++++++++++++++++-------- flow.h | 5 +- flow_table.h | 20 +++++ icmp.c | 12 +-- icmp.h | 2 +- log.c | 34 ++++---- passt.c | 20 +++-- passt.h | 9 +- tcp.c | 143 +++++++++---------------------- tcp.h | 2 +- tcp_conn.h | 8 +- tcp_splice.c | 49 +++++------ tcp_splice.h | 4 +- udp.c | 16 ++-- udp.h | 2 +- 15 files changed, 339 insertions(+), 222 deletions(-) -- 2.43.0
flow_table.h, the lower level flow header relies on having the struct definitions for every protocol specific flow type - so far that means tcp_conn.h. It doesn't include it itself, so tcp_conn.h must be included before flow_table.h. That's ok for now, but as we use the flow table for more things, flow_table.h will need the structs for all of them, which means the protocol specific .c files would need to include tcp_conn.h _and_ the equivalents for every other flow type before flow_table.h every time, which is weird. So, although we *mostly* lean towards the include style where .c files need to handle the include dependencies, in this case it makes more sense to have flow_table.h include all the protocol specific headers it needs. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- flow.c | 1 - flow_table.h | 2 ++ tcp.c | 1 - tcp_splice.c | 1 - 4 files changed, 2 insertions(+), 3 deletions(-) diff --git a/flow.c b/flow.c index a8aeb15..c3f4886 100644 --- a/flow.c +++ b/flow.c @@ -15,7 +15,6 @@ #include "siphash.h" #include "inany.h" #include "flow.h" -#include "tcp_conn.h" #include "flow_table.h" const char *flow_type_str[] = { diff --git a/flow_table.h b/flow_table.h index 0dee66f..e805f10 100644 --- a/flow_table.h +++ b/flow_table.h @@ -7,6 +7,8 @@ #ifndef FLOW_TABLE_H #define FLOW_TABLE_H +#include "tcp_conn.h" + /** * union flow - Descriptor for a logical packet flow (e.g. connection) * @f: Fields common between all variants diff --git a/tcp.c b/tcp.c index 5b37662..22c3a0e 100644 --- a/tcp.c +++ b/tcp.c @@ -298,7 +298,6 @@ #include "inany.h" #include "flow.h" -#include "tcp_conn.h" #include "flow_table.h" /* Sides of a flow as we use them in "tap" connections */ diff --git a/tcp_splice.c b/tcp_splice.c index 0e2e04c..a91cb37 100644 --- a/tcp_splice.c +++ b/tcp_splice.c @@ -56,7 +56,6 @@ #include "inany.h" #include "flow.h" -#include "tcp_conn.h" #include "flow_table.h" #define MAX_PIPE_SIZE (8UL * 1024 * 1024) -- 2.43.0
In a number of places we pass around a struct timespec representing the (more or less) current time. Sometimes we call it 'now', and sometimes we call it 'ts'. Standardise on the more informative 'now'. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- icmp.c | 12 ++++++------ icmp.h | 2 +- log.c | 34 +++++++++++++++++----------------- tcp.c | 6 +++--- tcp.h | 2 +- udp.c | 16 ++++++++-------- udp.h | 2 +- 7 files changed, 37 insertions(+), 37 deletions(-) diff --git a/icmp.c b/icmp.c index c82efd0..325dfb0 100644 --- a/icmp.c +++ b/icmp.c @@ -275,14 +275,14 @@ fail_sock: * @c: Execution context * @v6: Set for IPv6 echo identifier bindings * @id: Echo identifier, host order - * @ts: Timestamp from caller + * @now: Current timestamp */ static void icmp_timer_one(const struct ctx *c, int v6, uint16_t id, - const struct timespec *ts) + const struct timespec *now) { struct icmp_id_sock *id_map = &icmp_id_map[v6 ? V6 : V4][id]; - if (ts->tv_sec - id_map->ts <= ICMP_ECHO_TIMEOUT) + if (now->tv_sec - id_map->ts <= ICMP_ECHO_TIMEOUT) return; bitmap_clear(icmp_act[v6 ? V6 : V4], id); @@ -296,9 +296,9 @@ static void icmp_timer_one(const struct ctx *c, int v6, uint16_t id, /** * icmp_timer() - Scan activity bitmap for identifiers with timed events * @c: Execution context - * @ts: Timestamp from caller + * @now: Current timestamp */ -void icmp_timer(const struct ctx *c, const struct timespec *ts) +void icmp_timer(const struct ctx *c, const struct timespec *now) { long *word, tmp; unsigned int i; @@ -310,7 +310,7 @@ v6: tmp = *word; while ((n = ffsl(tmp))) { tmp &= ~(1UL << (n - 1)); - icmp_timer_one(c, v6, i * 8 + n - 1, ts); + icmp_timer_one(c, v6, i * 8 + n - 1, now); } } diff --git a/icmp.h b/icmp.h index 44cc495..1a08594 100644 --- a/icmp.h +++ b/icmp.h @@ -15,7 +15,7 @@ void icmpv6_sock_handler(const struct ctx *c, union epoll_ref ref); int icmp_tap_handler(const struct ctx *c, uint8_t pif, int af, const void *saddr, const void *daddr, const struct pool *p, const struct timespec *now); -void icmp_timer(const struct ctx *c, const struct timespec *ts); +void icmp_timer(const struct ctx *c, const struct timespec *now); void icmp_init(void); /** diff --git a/log.c b/log.c index f71d0e2..4a70e29 100644 --- a/log.c +++ b/log.c @@ -216,11 +216,11 @@ void logfile_init(const char *name, const char *path, size_t size) /** * logfile_rotate_fallocate() - Write header, set log_written after fallocate() * @fd: Log file descriptor - * @ts: Current timestamp + * @now: Current timestamp * * #syscalls lseek ppc64le:_llseek ppc64:_llseek armv6l:_llseek armv7l:_llseek */ -static void logfile_rotate_fallocate(int fd, const struct timespec *ts) +static void logfile_rotate_fallocate(int fd, const struct timespec *now) { char buf[BUFSIZ]; const char *nl; @@ -233,8 +233,8 @@ static void logfile_rotate_fallocate(int fd, const struct timespec *ts) n = snprintf(buf, BUFSIZ, "%s - log truncated at %lli.%04lli", log_header, - (long long int)(ts->tv_sec - log_start), - (long long int)(ts->tv_nsec / (100L * 1000))); + (long long int)(now->tv_sec - log_start), + (long long int)(now->tv_nsec / (100L * 1000))); /* Avoid partial lines by padding the header with spaces */ nl = memchr(buf + n + 1, '\n', BUFSIZ - n - 1); @@ -253,12 +253,12 @@ static void logfile_rotate_fallocate(int fd, const struct timespec *ts) /** * logfile_rotate_move() - Fallback: move recent entries toward start, then cut * @fd: Log file descriptor - * @ts: Current timestamp + * @now: Current timestamp * * #syscalls lseek ppc64le:_llseek ppc64:_llseek armv6l:_llseek armv7l:_llseek * #syscalls ftruncate */ -static void logfile_rotate_move(int fd, const struct timespec *ts) +static void logfile_rotate_move(int fd, const struct timespec *now) { int header_len, write_offset, end, discard, n; char buf[BUFSIZ]; @@ -266,8 +266,8 @@ static void logfile_rotate_move(int fd, const struct timespec *ts) header_len = snprintf(buf, BUFSIZ, "%s - log truncated at %lli.%04lli\n", log_header, - (long long int)(ts->tv_sec - log_start), - (long long int)(ts->tv_nsec / (100L * 1000))); + (long long int)(now->tv_sec - log_start), + (long long int)(now->tv_nsec / (100L * 1000))); if (lseek(fd, 0, SEEK_SET) == -1) return; if (write(fd, buf, header_len) == -1) @@ -316,7 +316,7 @@ out: /** * logfile_rotate() - "Rotate" log file once it's full * @fd: Log file descriptor - * @ts: Current timestamp + * @now: Current timestamp * * Return: 0 on success, negative error code on failure * @@ -324,7 +324,7 @@ out: * * fallocate() passed as EXTRA_SYSCALL only if FALLOC_FL_COLLAPSE_RANGE is there */ -static int logfile_rotate(int fd, const struct timespec *ts) +static int logfile_rotate(int fd, const struct timespec *now) { if (fcntl(fd, F_SETFL, O_RDWR /* Drop O_APPEND: explicit lseek() */)) return -errno; @@ -332,10 +332,10 @@ static int logfile_rotate(int fd, const struct timespec *ts) #ifdef FALLOC_FL_COLLAPSE_RANGE /* Only for Linux >= 3.15, extent-based ext4 or XFS, glibc >= 2.18 */ if (!fallocate(fd, FALLOC_FL_COLLAPSE_RANGE, 0, log_cut_size)) - logfile_rotate_fallocate(fd, ts); + logfile_rotate_fallocate(fd, now); else #endif - logfile_rotate_move(fd, ts); + logfile_rotate_move(fd, now); if (fcntl(fd, F_SETFL, O_RDWR | O_APPEND)) return -errno; @@ -351,16 +351,16 @@ static int logfile_rotate(int fd, const struct timespec *ts) */ void logfile_write(int pri, const char *format, va_list ap) { - struct timespec ts; + struct timespec now; char buf[BUFSIZ]; int n; - if (clock_gettime(CLOCK_REALTIME, &ts)) + if (clock_gettime(CLOCK_REALTIME, &now)) return; n = snprintf(buf, BUFSIZ, "%lli.%04lli: %s", - (long long int)(ts.tv_sec - log_start), - (long long int)(ts.tv_nsec / (100L * 1000)), + (long long int)(now.tv_sec - log_start), + (long long int)(now.tv_nsec / (100L * 1000)), logfile_prefix[pri]); n += vsnprintf(buf + n, BUFSIZ - n, format, ap); @@ -368,7 +368,7 @@ void logfile_write(int pri, const char *format, va_list ap) if (format[strlen(format)] != '\n') n += snprintf(buf + n, BUFSIZ - n, "\n"); - if ((log_written + n >= log_size) && logfile_rotate(log_file, &ts)) + if ((log_written + n >= log_size) && logfile_rotate(log_file, &now)) return; if ((n = write(log_file, buf, n)) >= 0) diff --git a/tcp.c b/tcp.c index 22c3a0e..dbf5509 100644 --- a/tcp.c +++ b/tcp.c @@ -3181,13 +3181,13 @@ static int tcp_port_rebind_outbound(void *arg) /** * tcp_timer() - Periodic tasks: port detection, closed connections, pool refill * @c: Execution context - * @ts: Unused + * @now: Current timestamp */ -void tcp_timer(struct ctx *c, const struct timespec *ts) +void tcp_timer(struct ctx *c, const struct timespec *now) { union flow *flow; - (void)ts; + (void)now; if (c->mode == MODE_PASTA) { if (c->tcp.fwd_out.mode == FWD_AUTO) { diff --git a/tcp.h b/tcp.h index 87a6bf9..b9f546d 100644 --- a/tcp.h +++ b/tcp.h @@ -20,7 +20,7 @@ int tcp_tap_handler(struct ctx *c, uint8_t pif, int af, int tcp_sock_init(const struct ctx *c, sa_family_t af, const void *addr, const char *ifname, in_port_t port); int tcp_init(struct ctx *c); -void tcp_timer(struct ctx *c, const struct timespec *ts); +void tcp_timer(struct ctx *c, const struct timespec *now); void tcp_defer_handler(struct ctx *c); void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s); diff --git a/udp.c b/udp.c index 252d353..b5b8f8a 100644 --- a/udp.c +++ b/udp.c @@ -1138,10 +1138,10 @@ int udp_init(struct ctx *c) * @v6: Set for IPv6 connections * @type: Socket type * @port: Port number, host order - * @ts: Timestamp from caller + * @now: Current timestamp */ static void udp_timer_one(struct ctx *c, int v6, enum udp_act_type type, - in_port_t port, const struct timespec *ts) + in_port_t port, const struct timespec *now) { struct udp_splice_port *sp; struct udp_tap_port *tp; @@ -1151,7 +1151,7 @@ static void udp_timer_one(struct ctx *c, int v6, enum udp_act_type type, case UDP_ACT_TAP: tp = &udp_tap_map[v6 ? V6 : V4][port]; - if (ts->tv_sec - tp->ts > UDP_CONN_TIMEOUT) { + if (now->tv_sec - tp->ts > UDP_CONN_TIMEOUT) { sockp = &tp->sock; tp->flags = 0; } @@ -1160,14 +1160,14 @@ static void udp_timer_one(struct ctx *c, int v6, enum udp_act_type type, case UDP_ACT_SPLICE_INIT: sp = &udp_splice_init[v6 ? V6 : V4][port]; - if (ts->tv_sec - sp->ts > UDP_CONN_TIMEOUT) + if (now->tv_sec - sp->ts > UDP_CONN_TIMEOUT) sockp = &sp->sock; break; case UDP_ACT_SPLICE_NS: sp = &udp_splice_ns[v6 ? V6 : V4][port]; - if (ts->tv_sec - sp->ts > UDP_CONN_TIMEOUT) + if (now->tv_sec - sp->ts > UDP_CONN_TIMEOUT) sockp = &sp->sock; break; @@ -1247,9 +1247,9 @@ static int udp_port_rebind_outbound(void *arg) /** * udp_timer() - Scan activity bitmaps for ports with associated timed events * @c: Execution context - * @ts: Timestamp from caller + * @now: Current timestamp */ -void udp_timer(struct ctx *c, const struct timespec *ts) +void udp_timer(struct ctx *c, const struct timespec *now) { int n, t, v6 = 0; unsigned int i; @@ -1279,7 +1279,7 @@ v6: tmp = *word; while ((n = ffsl(tmp))) { tmp &= ~(1UL << (n - 1)); - udp_timer_one(c, v6, t, i * 8 + n - 1, ts); + udp_timer_one(c, v6, t, i * 8 + n - 1, now); } } } diff --git a/udp.h b/udp.h index 85ebaaa..087e482 100644 --- a/udp.h +++ b/udp.h @@ -17,7 +17,7 @@ int udp_tap_handler(struct ctx *c, uint8_t pif, int af, int udp_sock_init(const struct ctx *c, int ns, sa_family_t af, const void *addr, const char *ifname, in_port_t port); int udp_init(struct ctx *c); -void udp_timer(struct ctx *c, const struct timespec *ts); +void udp_timer(struct ctx *c, const struct timespec *now); void udp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s); /** -- 2.43.0
tcp_timer() scans the connection table, expiring "tap" connections and calling tcp_splice_timer() for "splice" connections. tcp_splice_timer() expires spliced connections and then does some other processing. However, tcp_timer() is always called shortly after tcp_defer_handler() (from post_handler()), which also scans the flow table expiring both tap and spliced connections. So remove the redundant handling, and only do the extra tcp_splice_timer() work from tcp_timer(). Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- tcp.c | 15 ++------------- tcp_conn.h | 2 +- tcp_splice.c | 7 ++----- 3 files changed, 5 insertions(+), 19 deletions(-) diff --git a/tcp.c b/tcp.c index dbf5509..1b29661 100644 --- a/tcp.c +++ b/tcp.c @@ -3201,20 +3201,9 @@ void tcp_timer(struct ctx *c, const struct timespec *now) } } - for (flow = flowtab + c->flow_count - 1; flow >= flowtab; flow--) { - switch (flow->f.type) { - case FLOW_TCP: - if (flow->tcp.events == CLOSED) - tcp_conn_destroy(c, flow); - break; - case FLOW_TCP_SPLICE: + for (flow = flowtab + c->flow_count - 1; flow >= flowtab; flow--) + if (flow->f.type == FLOW_TCP_SPLICE) tcp_splice_timer(c, flow); - break; - default: - die("Unexpected %s in tcp_timer()", - FLOW_TYPE(&flow->f)); - } - } tcp_sock_refill_init(c); if (c->mode == MODE_PASTA) diff --git a/tcp_conn.h b/tcp_conn.h index e3400bb..e98559c 100644 --- a/tcp_conn.h +++ b/tcp_conn.h @@ -159,7 +159,7 @@ void tcp_tap_conn_update(const struct ctx *c, struct tcp_tap_conn *old, struct tcp_tap_conn *new); void tcp_splice_conn_update(const struct ctx *c, struct tcp_splice_conn *new); void tcp_splice_destroy(struct ctx *c, union flow *flow); -void tcp_splice_timer(struct ctx *c, union flow *flow); +void tcp_splice_timer(const struct ctx *c, union flow *flow); int tcp_conn_pool_sock(int pool[]); int tcp_conn_new_sock(const struct ctx *c, sa_family_t af); void tcp_sock_refill_pool(const struct ctx *c, int pool[], int af); diff --git a/tcp_splice.c b/tcp_splice.c index a91cb37..a187136 100644 --- a/tcp_splice.c +++ b/tcp_splice.c @@ -755,15 +755,12 @@ void tcp_splice_init(struct ctx *c) * @c: Execution context * @flow: Flow table entry */ -void tcp_splice_timer(struct ctx *c, union flow *flow) +void tcp_splice_timer(const struct ctx *c, union flow *flow) { struct tcp_splice_conn *conn = &flow->tcp_splice; int side; - if (conn->flags & CLOSING) { - tcp_splice_destroy(c, flow); - return; - } + ASSERT(!(conn->flags & CLOSING)); for (side = 0; side < SIDES; side++) { uint8_t set = side == 0 ? RCVLOWAT_SET_0 : RCVLOWAT_SET_1; -- 2.43.0
tcp_conn_destroy() and tcp_splice_destroy() are always called conditionally on the connection being closed or closing. Move that logic into the "destroy" functions themselves, renaming them tcp_flow_defer() and tcp_splice_flow_defer(). Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- tcp.c | 13 +++++++------ tcp_conn.h | 2 +- tcp_splice.c | 9 ++++++--- 3 files changed, 14 insertions(+), 10 deletions(-) diff --git a/tcp.c b/tcp.c index 1b29661..5223825 100644 --- a/tcp.c +++ b/tcp.c @@ -1303,14 +1303,17 @@ static struct tcp_tap_conn *tcp_hash_lookup(const struct ctx *c, } /** - * tcp_conn_destroy() - Close sockets, trigger hash table removal and compaction + * tcp_flow_defer() - Deferred per-flow handling (clean up closed connections) * @c: Execution context * @flow: Flow table entry for this connection */ -static void tcp_conn_destroy(struct ctx *c, union flow *flow) +static void tcp_flow_defer(struct ctx *c, union flow *flow) { const struct tcp_tap_conn *conn = &flow->tcp; + if (flow->tcp.events != CLOSED) + return; + close(conn->sock); if (conn->timer != -1) close(conn->timer); @@ -1372,12 +1375,10 @@ void tcp_defer_handler(struct ctx *c) for (flow = flowtab + c->flow_count - 1; flow >= flowtab; flow--) { switch (flow->f.type) { case FLOW_TCP: - if (flow->tcp.events == CLOSED) - tcp_conn_destroy(c, flow); + tcp_flow_defer(c, flow); break; case FLOW_TCP_SPLICE: - if (flow->tcp_splice.flags & CLOSING) - tcp_splice_destroy(c, flow); + tcp_splice_flow_defer(c, flow); break; default: die("Unexpected %s in tcp_defer_handler()", diff --git a/tcp_conn.h b/tcp_conn.h index e98559c..4846565 100644 --- a/tcp_conn.h +++ b/tcp_conn.h @@ -158,7 +158,7 @@ extern int init_sock_pool6 [TCP_SOCK_POOL_SIZE]; void tcp_tap_conn_update(const struct ctx *c, struct tcp_tap_conn *old, struct tcp_tap_conn *new); void tcp_splice_conn_update(const struct ctx *c, struct tcp_splice_conn *new); -void tcp_splice_destroy(struct ctx *c, union flow *flow); +void tcp_splice_flow_defer(struct ctx *c, union flow *flow); void tcp_splice_timer(const struct ctx *c, union flow *flow); int tcp_conn_pool_sock(int pool[]); int tcp_conn_new_sock(const struct ctx *c, sa_family_t af); diff --git a/tcp_splice.c b/tcp_splice.c index a187136..a7d577a 100644 --- a/tcp_splice.c +++ b/tcp_splice.c @@ -243,15 +243,18 @@ void tcp_splice_conn_update(const struct ctx *c, struct tcp_splice_conn *new) } /** - * tcp_splice_destroy() - Close spliced connection and pipes, clear + * tcp_splice_flow_defer() - Deferred per-flow handling (clean up closed) * @c: Execution context - * @flow: Flow table entry + * @flow: Flow table entry for this connection */ -void tcp_splice_destroy(struct ctx *c, union flow *flow) +void tcp_splice_flow_defer(struct ctx *c, union flow *flow) { struct tcp_splice_conn *conn = &flow->tcp_splice; unsigned side; + if (!(flow->tcp_splice.flags & CLOSING)) + return; + for (side = 0; side < SIDES; side++) { if (conn->events & SPLICE_ESTABLISHED) { /* Flushing might need to block: don't recycle them. */ -- 2.43.0
tcp_defer_handler(), amongst other things, scans the flow table and does some processing for each TCP connection. When we add other protocols to the flow table, they're likely to want some similar scanning. It makes more sense for cache friendliness to perform a single scan of the flow table and dispatch to the protocol specific handlers, rather than having each protocol separately scan the table. To that end, add a new flow_defer_handler() handling all flow-linked deferred operations. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- flow.c | 23 +++++++++++++++++++++++ flow.h | 1 + passt.c | 1 + tcp.c | 19 ++----------------- tcp_conn.h | 1 + 5 files changed, 28 insertions(+), 17 deletions(-) diff --git a/flow.c b/flow.c index c3f4886..4dc2767 100644 --- a/flow.c +++ b/flow.c @@ -84,3 +84,26 @@ void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...) logmsg(pri, "Flow %u (%s): %s", flow_idx(f), FLOW_TYPE(f), msg); } + +/** + * flow_defer_handler() - Handler for per-flow deferred tasks + * @c: Execution context + */ +void flow_defer_handler(struct ctx *c) +{ + union flow *flow; + + for (flow = flowtab + c->flow_count - 1; flow >= flowtab; flow--) { + switch (flow->f.type) { + case FLOW_TCP: + tcp_flow_defer(c, flow); + break; + case FLOW_TCP_SPLICE: + tcp_splice_flow_defer(c, flow); + break; + default: + /* Assume other flow types don't need any handling */ + ; + } + } +} diff --git a/flow.h b/flow.h index 959b461..6b17fa8 100644 --- a/flow.h +++ b/flow.h @@ -67,6 +67,7 @@ static inline bool flow_sidx_eq(flow_sidx_t a, flow_sidx_t b) union flow; void flow_table_compact(struct ctx *c, union flow *hole); +void flow_defer_handler(struct ctx *c); void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...) __attribute__((format(printf, 3, 4))); diff --git a/passt.c b/passt.c index 0246b04..5f72a28 100644 --- a/passt.c +++ b/passt.c @@ -103,6 +103,7 @@ static void post_handler(struct ctx *c, const struct timespec *now) /* NOLINTNEXTLINE(bugprone-branch-clone): intervals can be the same */ CALL_PROTO_HANDLER(c, now, icmp, ICMP); + flow_defer_handler(c); #undef CALL_PROTO_HANDLER } diff --git a/tcp.c b/tcp.c index 5223825..7065531 100644 --- a/tcp.c +++ b/tcp.c @@ -1307,7 +1307,7 @@ static struct tcp_tap_conn *tcp_hash_lookup(const struct ctx *c, * @c: Execution context * @flow: Flow table entry for this connection */ -static void tcp_flow_defer(struct ctx *c, union flow *flow) +void tcp_flow_defer(struct ctx *c, union flow *flow) { const struct tcp_tap_conn *conn = &flow->tcp; @@ -1365,26 +1365,11 @@ static void tcp_l2_data_buf_flush(const struct ctx *c) * tcp_defer_handler() - Handler for TCP deferred tasks * @c: Execution context */ +/* cppcheck-suppress [constParameterPointer, unmatchedSuppression] */ void tcp_defer_handler(struct ctx *c) { - union flow *flow; - tcp_l2_flags_buf_flush(c); tcp_l2_data_buf_flush(c); - - for (flow = flowtab + c->flow_count - 1; flow >= flowtab; flow--) { - switch (flow->f.type) { - case FLOW_TCP: - tcp_flow_defer(c, flow); - break; - case FLOW_TCP_SPLICE: - tcp_splice_flow_defer(c, flow); - break; - default: - die("Unexpected %s in tcp_defer_handler()", - FLOW_TYPE(&flow->f)); - } - } } /** diff --git a/tcp_conn.h b/tcp_conn.h index 4846565..72b9ecb 100644 --- a/tcp_conn.h +++ b/tcp_conn.h @@ -158,6 +158,7 @@ extern int init_sock_pool6 [TCP_SOCK_POOL_SIZE]; void tcp_tap_conn_update(const struct ctx *c, struct tcp_tap_conn *old, struct tcp_tap_conn *new); void tcp_splice_conn_update(const struct ctx *c, struct tcp_splice_conn *new); +void tcp_flow_defer(struct ctx *c, union flow *flow); void tcp_splice_flow_defer(struct ctx *c, union flow *flow); void tcp_splice_timer(const struct ctx *c, union flow *flow); int tcp_conn_pool_sock(int pool[]); -- 2.43.0
tcp_timer() scans the flow table so that it can run tcp_splice_timer() on each spliced connection. More generally, other flow types might want to run similar timers in future. We could add a flow_timer() analagous to tcp_timer(), udp_timer() etc. However, this would need to scan the flow table, which we would have just done in flow_defer_handler(). We'd prefer to just scan the flow table once, dispatching both per-flow deferred events and per-flow timed events if necessary. So, extend flow_defer_handler() to do this. For now we use the same timer interval for all flow types (1s). We can make that more flexible in future if we need to. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- flow.c | 16 ++++++++++++++-- flow.h | 4 +++- passt.c | 7 ++++--- tcp.c | 6 ------ 4 files changed, 21 insertions(+), 12 deletions(-) diff --git a/flow.c b/flow.c index 4dc2767..5dd5d2b 100644 --- a/flow.c +++ b/flow.c @@ -28,6 +28,9 @@ static_assert(ARRAY_SIZE(flow_type_str) == FLOW_NUM_TYPES, /* Global Flow Table */ union flow flowtab[FLOW_MAX]; +/* Last time the flow timers ran */ +static struct timespec flow_timer_run; + /** * flow_table_compact() - Perform compaction on flow table * @c: Execution context @@ -86,13 +89,20 @@ void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...) } /** - * flow_defer_handler() - Handler for per-flow deferred tasks + * flow_defer_handler() - Handler for per-flow deferred and timed tasks * @c: Execution context + * @now: Current timestamp */ -void flow_defer_handler(struct ctx *c) +void flow_defer_handler(struct ctx *c, const struct timespec *now) { + bool timer = false; union flow *flow; + if (timespec_diff_ms(now, &flow_timer_run) >= FLOW_TIMER_INTERVAL) { + timer = true; + flow_timer_run = *now; + } + for (flow = flowtab + c->flow_count - 1; flow >= flowtab; flow--) { switch (flow->f.type) { case FLOW_TCP: @@ -100,6 +110,8 @@ void flow_defer_handler(struct ctx *c) break; case FLOW_TCP_SPLICE: tcp_splice_flow_defer(c, flow); + if (timer) + tcp_splice_timer(c, flow); break; default: /* Assume other flow types don't need any handling */ diff --git a/flow.h b/flow.h index 6b17fa8..423e685 100644 --- a/flow.h +++ b/flow.h @@ -7,6 +7,8 @@ #ifndef FLOW_H #define FLOW_H +#define FLOW_TIMER_INTERVAL 1000 /* ms */ + /** * enum flow_type - Different types of packet flows we track */ @@ -67,7 +69,7 @@ static inline bool flow_sidx_eq(flow_sidx_t a, flow_sidx_t b) union flow; void flow_table_compact(struct ctx *c, union flow *hole); -void flow_defer_handler(struct ctx *c); +void flow_defer_handler(struct ctx *c, const struct timespec *now); void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...) __attribute__((format(printf, 3, 4))); diff --git a/passt.c b/passt.c index 5f72a28..870064f 100644 --- a/passt.c +++ b/passt.c @@ -53,8 +53,9 @@ #define EPOLL_EVENTS 8 -#define __TIMER_INTERVAL MIN(TCP_TIMER_INTERVAL, UDP_TIMER_INTERVAL) -#define TIMER_INTERVAL MIN(__TIMER_INTERVAL, ICMP_TIMER_INTERVAL) +#define TIMER_INTERVAL__ MIN(TCP_TIMER_INTERVAL, UDP_TIMER_INTERVAL) +#define TIMER_INTERVAL_ MIN(TIMER_INTERVAL__, ICMP_TIMER_INTERVAL) +#define TIMER_INTERVAL MIN(TIMER_INTERVAL_, FLOW_TIMER_INTERVAL) char pkt_buf[PKT_BUF_BYTES] __attribute__ ((aligned(PAGE_SIZE))); @@ -103,7 +104,7 @@ static void post_handler(struct ctx *c, const struct timespec *now) /* NOLINTNEXTLINE(bugprone-branch-clone): intervals can be the same */ CALL_PROTO_HANDLER(c, now, icmp, ICMP); - flow_defer_handler(c); + flow_defer_handler(c, now); #undef CALL_PROTO_HANDLER } diff --git a/tcp.c b/tcp.c index 7065531..9fffafb 100644 --- a/tcp.c +++ b/tcp.c @@ -3171,8 +3171,6 @@ static int tcp_port_rebind_outbound(void *arg) */ void tcp_timer(struct ctx *c, const struct timespec *now) { - union flow *flow; - (void)now; if (c->mode == MODE_PASTA) { @@ -3187,10 +3185,6 @@ void tcp_timer(struct ctx *c, const struct timespec *now) } } - for (flow = flowtab + c->flow_count - 1; flow >= flowtab; flow--) - if (flow->f.type == FLOW_TCP_SPLICE) - tcp_splice_timer(c, flow); - tcp_sock_refill_init(c); if (c->mode == MODE_PASTA) tcp_splice_refill(c); -- 2.43.0
As we already did for flow types, use an "EPOLL_NUM_TYPES" isntead of EPOLL_TYPE_MAX, which is a little bit safer and clearer. Add a static assert on the size of the matching names array. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- passt.c | 4 +++- passt.h | 4 ++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/passt.c b/passt.c index 870064f..a37a2f4 100644 --- a/passt.c +++ b/passt.c @@ -59,7 +59,7 @@ char pkt_buf[PKT_BUF_BYTES] __attribute__ ((aligned(PAGE_SIZE))); -char *epoll_type_str[EPOLL_TYPE_MAX + 1] = { +char *epoll_type_str[] = { [EPOLL_TYPE_TCP] = "connected TCP socket", [EPOLL_TYPE_TCP_LISTEN] = "listening TCP socket", [EPOLL_TYPE_TCP_TIMER] = "TCP timer", @@ -71,6 +71,8 @@ char *epoll_type_str[EPOLL_TYPE_MAX + 1] = { [EPOLL_TYPE_TAP_PASST] = "connected qemu socket", [EPOLL_TYPE_TAP_LISTEN] = "listening qemu socket", }; +static_assert(ARRAY_SIZE(epoll_type_str) == EPOLL_NUM_TYPES, + "epoll_type_str[] doesn't match enum epoll_type"); /** * post_handler() - Run periodic and deferred tasks for L4 protocol handlers diff --git a/passt.h b/passt.h index c74887a..f54023a 100644 --- a/passt.h +++ b/passt.h @@ -70,7 +70,7 @@ enum epoll_type { /* socket listening for qemu socket connections */ EPOLL_TYPE_TAP_LISTEN, - EPOLL_TYPE_MAX = EPOLL_TYPE_TAP_LISTEN, + EPOLL_NUM_TYPES, }; /** @@ -115,7 +115,7 @@ extern char pkt_buf [PKT_BUF_BYTES]; extern char *epoll_type_str[]; #define EPOLL_TYPE_STR(n) \ - (((uint8_t)(n) <= EPOLL_TYPE_MAX && epoll_type_str[(n)]) ? \ + (((uint8_t)(n) < EPOLL_NUM_TYPES && epoll_type_str[(n)]) ? \ epoll_type_str[(n)] : "?") #include <resolv.h> /* For MAXNS below */ -- 2.43.0
Currently connected TCP sockets have the same epoll type, whether they're for a "tap" connection or a spliced connection. This means that tcp_sock_handler() has to do a secondary check on the type of the connection to call the right function. We can avoid this by adding a new epoll type and dispatching directly to the right thing. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- passt.c | 8 ++++++-- passt.h | 2 ++ tcp.c | 36 ++++++++---------------------------- tcp_splice.c | 16 +++++++++------- tcp_splice.h | 4 ++-- 5 files changed, 27 insertions(+), 39 deletions(-) diff --git a/passt.c b/passt.c index a37a2f4..71bea8f 100644 --- a/passt.c +++ b/passt.c @@ -50,6 +50,7 @@ #include "pasta.h" #include "arch.h" #include "log.h" +#include "tcp_splice.h" #define EPOLL_EVENTS 8 @@ -61,6 +62,7 @@ char pkt_buf[PKT_BUF_BYTES] __attribute__ ((aligned(PAGE_SIZE))); char *epoll_type_str[] = { [EPOLL_TYPE_TCP] = "connected TCP socket", + [EPOLL_TYPE_TCP_SPLICE] = "connected spliced TCP socket", [EPOLL_TYPE_TCP_LISTEN] = "listening TCP socket", [EPOLL_TYPE_TCP_TIMER] = "TCP timer", [EPOLL_TYPE_UDP] = "UDP socket", @@ -373,8 +375,10 @@ loop: pasta_netns_quit_handler(&c, quit_fd); break; case EPOLL_TYPE_TCP: - if (!c.no_tcp) - tcp_sock_handler(&c, ref, eventmask); + tcp_sock_handler(&c, ref, eventmask); + break; + case EPOLL_TYPE_TCP_SPLICE: + tcp_splice_sock_handler(&c, ref, eventmask); break; case EPOLL_TYPE_TCP_LISTEN: tcp_listen_handler(&c, ref, &now); diff --git a/passt.h b/passt.h index f54023a..82b0fcf 100644 --- a/passt.h +++ b/passt.h @@ -51,6 +51,8 @@ enum epoll_type { EPOLL_TYPE_NONE = 0, /* Connected TCP sockets */ EPOLL_TYPE_TCP, + /* Connected TCP sockets (spliced) */ + EPOLL_TYPE_TCP_SPLICE, /* Listening TCP sockets */ EPOLL_TYPE_TCP_LISTEN, /* timerfds used for TCP timers */ diff --git a/tcp.c b/tcp.c index 9fffafb..e7d11ee 100644 --- a/tcp.c +++ b/tcp.c @@ -2804,14 +2804,18 @@ void tcp_timer_handler(struct ctx *c, union epoll_ref ref) } /** - * tcp_tap_sock_handler() - Handle new data from non-spliced socket + * tcp_sock_handler() - Handle new data from non-spliced socket * @c: Execution context - * @conn: Connection state + * @ref: epoll reference * @events: epoll events bitmap */ -static void tcp_tap_sock_handler(struct ctx *c, struct tcp_tap_conn *conn, - uint32_t events) +void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events) { + struct tcp_tap_conn *conn = CONN(ref.flowside.flow); + + ASSERT(conn->f.type == FLOW_TCP); + ASSERT(ref.flowside.side == SOCKSIDE); + if (conn->events == CLOSED) return; @@ -2858,30 +2862,6 @@ static void tcp_tap_sock_handler(struct ctx *c, struct tcp_tap_conn *conn, } } -/** - * tcp_sock_handler() - Handle new data from socket, or timerfd event - * @c: Execution context - * @ref: epoll reference - * @events: epoll events bitmap - */ -void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events) -{ - union flow *flow = FLOW(ref.flowside.flow); - - switch (flow->f.type) { - case FLOW_TCP: - tcp_tap_sock_handler(c, &flow->tcp, events); - break; - case FLOW_TCP_SPLICE: - tcp_splice_sock_handler(c, &flow->tcp_splice, ref.flowside.side, - events); - break; - default: - die("Unexpected %s in tcp_sock_handler_compact()", - FLOW_TYPE(&flow->f)); - } -} - /** * tcp_sock_init_af() - Initialise listening socket for a given af and port * @c: Execution context diff --git a/tcp_splice.c b/tcp_splice.c index a7d577a..33bbef1 100644 --- a/tcp_splice.c +++ b/tcp_splice.c @@ -127,9 +127,9 @@ static int tcp_splice_epoll_ctl(const struct ctx *c, { int m = conn->in_epoll ? EPOLL_CTL_MOD : EPOLL_CTL_ADD; const union epoll_ref ref[SIDES] = { - { .type = EPOLL_TYPE_TCP, .fd = conn->s[0], + { .type = EPOLL_TYPE_TCP_SPLICE, .fd = conn->s[0], .flowside = FLOW_SIDX(conn, 0) }, - { .type = EPOLL_TYPE_TCP, .fd = conn->s[1], + { .type = EPOLL_TYPE_TCP_SPLICE, .fd = conn->s[1], .flowside = FLOW_SIDX(conn, 1) } }; struct epoll_event ev[SIDES] = { { .data.u64 = ref[0].u64 }, @@ -484,18 +484,20 @@ bool tcp_splice_conn_from_sock(const struct ctx *c, /** * tcp_splice_sock_handler() - Handler for socket mapped to spliced connection * @c: Execution context - * @conn: Connection state - * @side: Side of the connection on which an event has occurred + * @ref: epoll reference * @events: epoll events bitmap * * #syscalls:pasta splice */ -void tcp_splice_sock_handler(struct ctx *c, struct tcp_splice_conn *conn, - int side, uint32_t events) +void tcp_splice_sock_handler(struct ctx *c, union epoll_ref ref, + uint32_t events) { + struct tcp_splice_conn *conn = CONN(ref.flowside.flow); + unsigned side = ref.flowside.side, fromside; uint8_t lowat_set_flag, lowat_act_flag; int eof, never_read; - unsigned fromside; + + ASSERT(conn->f.type == FLOW_TCP_SPLICE); if (conn->events == SPLICE_CLOSED) return; diff --git a/tcp_splice.h b/tcp_splice.h index aa85c7c..18193e4 100644 --- a/tcp_splice.h +++ b/tcp_splice.h @@ -8,8 +8,8 @@ struct tcp_splice_conn; -void tcp_splice_sock_handler(struct ctx *c, struct tcp_splice_conn *conn, - int side, uint32_t events); +void tcp_splice_sock_handler(struct ctx *c, union epoll_ref ref, + uint32_t events); bool tcp_splice_conn_from_sock(const struct ctx *c, union tcp_listen_epoll_ref ref, struct tcp_splice_conn *conn, int s, -- 2.43.0
flow_log_() is a very basic widely used function that many other functions in flow.c will end up needing. At present it's below flow_table_compact() which happens not to need it, but that's likely to change. Move it to near the top of flow.c to avoid forward declarations. Code motion only, no changes. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- flow.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/flow.c b/flow.c index 5dd5d2b..a710e50 100644 --- a/flow.c +++ b/flow.c @@ -31,6 +31,24 @@ union flow flowtab[FLOW_MAX]; /* Last time the flow timers ran */ static struct timespec flow_timer_run; +/** flow_log_ - Log flow-related message + * @f: flow the message is related to + * @pri: Log priority + * @fmt: Format string + * @...: printf-arguments + */ +void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...) +{ + char msg[BUFSIZ]; + va_list args; + + va_start(args, fmt); + (void)vsnprintf(msg, sizeof(msg), fmt, args); + va_end(args); + + logmsg(pri, "Flow %u (%s): %s", flow_idx(f), FLOW_TYPE(f), msg); +} + /** * flow_table_compact() - Perform compaction on flow table * @c: Execution context @@ -70,24 +88,6 @@ void flow_table_compact(struct ctx *c, union flow *hole) memset(from, 0, sizeof(*from)); } -/** flow_log_ - Log flow-related message - * @f: flow the message is related to - * @pri: Log priority - * @fmt: Format string - * @...: printf-arguments - */ -void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...) -{ - char msg[BUFSIZ]; - va_list args; - - va_start(args, fmt); - (void)vsnprintf(msg, sizeof(msg), fmt, args); - va_end(args); - - logmsg(pri, "Flow %u (%s): %s", flow_idx(f), FLOW_TYPE(f), msg); -} - /** * flow_defer_handler() - Handler for per-flow deferred and timed tasks * @c: Execution context -- 2.43.0
In general, the passt code is a bit haphazard about what's a true global variable and what's in the quasi-global 'context structure'. The flow_count field is one such example: it's in the context structure, although it's really part of the same data structure as flowtab[], which is a genuine global. Move flow_count to be a regular global to match. For now it needs to be public, rather than static, but we expect to be able to change that in future. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- flow.c | 11 ++++++----- flow.h | 4 ++-- flow_table.h | 1 + passt.h | 3 --- tcp.c | 10 +++++----- tcp_conn.h | 4 ++-- tcp_splice.c | 2 +- 7 files changed, 17 insertions(+), 18 deletions(-) diff --git a/flow.c b/flow.c index a710e50..64de75c 100644 --- a/flow.c +++ b/flow.c @@ -26,6 +26,7 @@ static_assert(ARRAY_SIZE(flow_type_str) == FLOW_NUM_TYPES, "flow_type_str[] doesn't match enum flow_type"); /* Global Flow Table */ +unsigned flow_count; union flow flowtab[FLOW_MAX]; /* Last time the flow timers ran */ @@ -54,18 +55,18 @@ void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...) * @c: Execution context * @hole: Pointer to recently closed flow */ -void flow_table_compact(struct ctx *c, union flow *hole) +void flow_table_compact(const struct ctx *c, union flow *hole) { union flow *from; - if (FLOW_IDX(hole) == --c->flow_count) { + if (FLOW_IDX(hole) == --flow_count) { debug("flow: table compaction: maximum index was %u (%p)", FLOW_IDX(hole), (void *)hole); memset(hole, 0, sizeof(*hole)); return; } - from = flowtab + c->flow_count; + from = flowtab + flow_count; memcpy(hole, from, sizeof(*hole)); switch (from->f.type) { @@ -93,7 +94,7 @@ void flow_table_compact(struct ctx *c, union flow *hole) * @c: Execution context * @now: Current timestamp */ -void flow_defer_handler(struct ctx *c, const struct timespec *now) +void flow_defer_handler(const struct ctx *c, const struct timespec *now) { bool timer = false; union flow *flow; @@ -103,7 +104,7 @@ void flow_defer_handler(struct ctx *c, const struct timespec *now) flow_timer_run = *now; } - for (flow = flowtab + c->flow_count - 1; flow >= flowtab; flow--) { + for (flow = flowtab + flow_count - 1; flow >= flowtab; flow--) { switch (flow->f.type) { case FLOW_TCP: tcp_flow_defer(c, flow); diff --git a/flow.h b/flow.h index 423e685..44058bf 100644 --- a/flow.h +++ b/flow.h @@ -68,8 +68,8 @@ static inline bool flow_sidx_eq(flow_sidx_t a, flow_sidx_t b) union flow; -void flow_table_compact(struct ctx *c, union flow *hole); -void flow_defer_handler(struct ctx *c, const struct timespec *now); +void flow_table_compact(const struct ctx *c, union flow *hole); +void flow_defer_handler(const struct ctx *c, const struct timespec *now); void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...) __attribute__((format(printf, 3, 4))); diff --git a/flow_table.h b/flow_table.h index e805f10..4aa2398 100644 --- a/flow_table.h +++ b/flow_table.h @@ -22,6 +22,7 @@ union flow { }; /* Global Flow Table */ +extern unsigned flow_count; extern union flow flowtab[]; diff --git a/passt.h b/passt.h index 82b0fcf..a9e8f15 100644 --- a/passt.h +++ b/passt.h @@ -224,7 +224,6 @@ struct ip6_ctx { * @pasta_conf_ns: Configure namespace after creating it * @no_copy_routes: Don't copy all routes when configuring target namespace * @no_copy_addrs: Don't copy all addresses when configuring namespace - * @flow_count: Number of tracked packet flows (connections etc.) * @no_tcp: Disable TCP operation * @tcp: Context for TCP protocol handler * @no_tcp: Disable UDP operation @@ -284,8 +283,6 @@ struct ctx { int no_copy_routes; int no_copy_addrs; - unsigned flow_count; - int no_tcp; struct tcp_ctx tcp; int no_udp; diff --git a/tcp.c b/tcp.c index e7d11ee..6b62896 100644 --- a/tcp.c +++ b/tcp.c @@ -1307,7 +1307,7 @@ static struct tcp_tap_conn *tcp_hash_lookup(const struct ctx *c, * @c: Execution context * @flow: Flow table entry for this connection */ -void tcp_flow_defer(struct ctx *c, union flow *flow) +void tcp_flow_defer(const struct ctx *c, union flow *flow) { const struct tcp_tap_conn *conn = &flow->tcp; @@ -1949,7 +1949,7 @@ static void tcp_conn_from_tap(struct ctx *c, (void)saddr; - if (c->flow_count >= FLOW_MAX) + if (flow_count >= FLOW_MAX) return; if ((s = tcp_conn_pool_sock(pool)) < 0) @@ -1975,7 +1975,7 @@ static void tcp_conn_from_tap(struct ctx *c, } } - conn = CONN(c->flow_count++); + conn = CONN(flow_count++); conn->f.type = FLOW_TCP; conn->sock = s; conn->timer = -1; @@ -2724,14 +2724,14 @@ void tcp_listen_handler(struct ctx *c, union epoll_ref ref, union flow *flow; int s; - if (c->no_tcp || c->flow_count >= FLOW_MAX) + if (c->no_tcp || flow_count >= FLOW_MAX) return; s = accept4(ref.fd, (struct sockaddr *)&sa, &sl, SOCK_NONBLOCK); if (s < 0) return; - flow = flowtab + c->flow_count++; + flow = flowtab + flow_count++; if (c->mode == MODE_PASTA && tcp_splice_conn_from_sock(c, ref.tcp_listen, &flow->tcp_splice, diff --git a/tcp_conn.h b/tcp_conn.h index 72b9ecb..825155a 100644 --- a/tcp_conn.h +++ b/tcp_conn.h @@ -158,8 +158,8 @@ extern int init_sock_pool6 [TCP_SOCK_POOL_SIZE]; void tcp_tap_conn_update(const struct ctx *c, struct tcp_tap_conn *old, struct tcp_tap_conn *new); void tcp_splice_conn_update(const struct ctx *c, struct tcp_splice_conn *new); -void tcp_flow_defer(struct ctx *c, union flow *flow); -void tcp_splice_flow_defer(struct ctx *c, union flow *flow); +void tcp_flow_defer(const struct ctx *c, union flow *flow); +void tcp_splice_flow_defer(const struct ctx *c, union flow *flow); void tcp_splice_timer(const struct ctx *c, union flow *flow); int tcp_conn_pool_sock(int pool[]); int tcp_conn_new_sock(const struct ctx *c, sa_family_t af); diff --git a/tcp_splice.c b/tcp_splice.c index 33bbef1..3f6f1b3 100644 --- a/tcp_splice.c +++ b/tcp_splice.c @@ -247,7 +247,7 @@ void tcp_splice_conn_update(const struct ctx *c, struct tcp_splice_conn *new) * @c: Execution context * @flow: Flow table entry for this connection */ -void tcp_splice_flow_defer(struct ctx *c, union flow *flow) +void tcp_splice_flow_defer(const struct ctx *c, union flow *flow) { struct tcp_splice_conn *conn = &flow->tcp_splice; unsigned side; -- 2.43.0
Currently tcp.c open codes the process of allocating a new flow from the flow table: twice, in fact, once for guest to host and once for host to guest connections. This duplication isn't ideal and will get worse as we add more protocols to the flow table. It also makes it harder to experiment with different ways of handling flow table allocation. Instead, introduce a function to allocate a new flow: flow_alloc(). In some cases we currently check if we're able to allocate, but delay the actual allocation. We now handle that slightly differently with a flow_alloc_cancel() function to back out a recent allocation. We have that separate from a flow_free() function, because future changes we have in mind will need to handle this case a little differently. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- flow.c | 26 ++++++++++++++++++++++++++ flow_table.h | 3 +++ tcp.c | 29 ++++++++++++++++++----------- 3 files changed, 47 insertions(+), 11 deletions(-) diff --git a/flow.c b/flow.c index 64de75c..63eefd6 100644 --- a/flow.c +++ b/flow.c @@ -50,6 +50,32 @@ void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...) logmsg(pri, "Flow %u (%s): %s", flow_idx(f), FLOW_TYPE(f), msg); } +/** + * flow_alloc() - Allocate a new flow + * + * Return: pointer to an unused flow entry, or NULL if the table is full + */ +union flow *flow_alloc(void) +{ + if (flow_count >= FLOW_MAX) + return NULL; + + return &flowtab[flow_count++]; +} + +/** + * flow_alloc_cancel() - Free a newly allocated flow + * @flow: Flow to deallocate + * + * @flow must be the last flow allocated by flow_alloc() + */ +void flow_alloc_cancel(union flow *flow) +{ + ASSERT(FLOW_IDX(flow) == flow_count - 1); + memset(flow, 0, sizeof(*flow)); + flow_count--; +} + /** * flow_table_compact() - Perform compaction on flow table * @c: Execution context diff --git a/flow_table.h b/flow_table.h index 4aa2398..2773a2b 100644 --- a/flow_table.h +++ b/flow_table.h @@ -88,4 +88,7 @@ static inline flow_sidx_t flow_sidx(const struct flow_common *f, */ #define FLOW_SIDX(f_, side) (flow_sidx(&(f_)->f, (side))) +union flow *flow_alloc(void); +void flow_alloc_cancel(union flow *flow); + #endif /* FLOW_TABLE_H */ diff --git a/tcp.c b/tcp.c index 6b62896..5b56786 100644 --- a/tcp.c +++ b/tcp.c @@ -1944,17 +1944,18 @@ static void tcp_conn_from_tap(struct ctx *c, }; const struct sockaddr *sa; struct tcp_tap_conn *conn; + union flow *flow; socklen_t sl; int s, mss; (void)saddr; - if (flow_count >= FLOW_MAX) + if (!(flow = flow_alloc())) return; if ((s = tcp_conn_pool_sock(pool)) < 0) if ((s = tcp_conn_new_sock(c, af)) < 0) - return; + goto cancel; if (!c->no_map_gw) { if (af == AF_INET && IN4_ARE_ADDR_EQUAL(daddr, &c->ip4.gw)) @@ -1969,13 +1970,11 @@ static void tcp_conn_from_tap(struct ctx *c, .sin6_addr = c->ip6.addr_ll, .sin6_scope_id = c->ifi6, }; - if (bind(s, (struct sockaddr *)&addr6_ll, sizeof(addr6_ll))) { - close(s); - return; - } + if (bind(s, (struct sockaddr *)&addr6_ll, sizeof(addr6_ll))) + goto cancel; } - conn = CONN(flow_count++); + conn = &flow->tcp; conn->f.type = FLOW_TCP; conn->sock = s; conn->timer = -1; @@ -2047,6 +2046,12 @@ static void tcp_conn_from_tap(struct ctx *c, } tcp_epoll_ctl(c, conn); + return; + +cancel: + if (s >= 0) + close(s); + flow_alloc_cancel(flow); } /** @@ -2724,14 +2729,12 @@ void tcp_listen_handler(struct ctx *c, union epoll_ref ref, union flow *flow; int s; - if (c->no_tcp || flow_count >= FLOW_MAX) + if (c->no_tcp || !(flow = flow_alloc())) return; s = accept4(ref.fd, (struct sockaddr *)&sa, &sl, SOCK_NONBLOCK); if (s < 0) - return; - - flow = flowtab + flow_count++; + goto cancel; if (c->mode == MODE_PASTA && tcp_splice_conn_from_sock(c, ref.tcp_listen, &flow->tcp_splice, @@ -2740,6 +2743,10 @@ void tcp_listen_handler(struct ctx *c, union epoll_ref ref, tcp_tap_conn_from_sock(c, ref.tcp_listen, &flow->tcp, s, (struct sockaddr *)&sa, now); + return; + +cancel: + flow_alloc_cancel(flow); } /** -- 2.43.0
Currently, flows are only evern finally freed (and the table compacted) from the deferred handlers. Some future ways we want to optimise managing the flow table will rely on this, so enforce it: rather than having the TCP code directly call flow_table_compact(), add a boolean return value to the per-flow deferred handlers. If true, this indicates that the flow code itself should free the flow. This forces all freeing of flows to occur during the flow code's scan of the table in flow_defer_handler() which opens possibilities for future optimisations. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- flow.c | 13 +++++++++---- flow.h | 1 - tcp.c | 9 +++++---- tcp_conn.h | 4 ++-- tcp_splice.c | 9 +++++---- 5 files changed, 21 insertions(+), 15 deletions(-) diff --git a/flow.c b/flow.c index 63eefd6..d6650fc 100644 --- a/flow.c +++ b/flow.c @@ -81,7 +81,7 @@ void flow_alloc_cancel(union flow *flow) * @c: Execution context * @hole: Pointer to recently closed flow */ -void flow_table_compact(const struct ctx *c, union flow *hole) +static void flow_table_compact(const struct ctx *c, union flow *hole) { union flow *from; @@ -131,18 +131,23 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now) } for (flow = flowtab + flow_count - 1; flow >= flowtab; flow--) { + bool closed = false; + switch (flow->f.type) { case FLOW_TCP: - tcp_flow_defer(c, flow); + closed = tcp_flow_defer(flow); break; case FLOW_TCP_SPLICE: - tcp_splice_flow_defer(c, flow); - if (timer) + closed = tcp_splice_flow_defer(flow); + if (!closed && timer) tcp_splice_timer(c, flow); break; default: /* Assume other flow types don't need any handling */ ; } + + if (closed) + flow_table_compact(c, flow); } } diff --git a/flow.h b/flow.h index 44058bf..8064f0e 100644 --- a/flow.h +++ b/flow.h @@ -68,7 +68,6 @@ static inline bool flow_sidx_eq(flow_sidx_t a, flow_sidx_t b) union flow; -void flow_table_compact(const struct ctx *c, union flow *hole); void flow_defer_handler(const struct ctx *c, const struct timespec *now); void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...) diff --git a/tcp.c b/tcp.c index 5b56786..ee2c3af 100644 --- a/tcp.c +++ b/tcp.c @@ -1304,21 +1304,22 @@ static struct tcp_tap_conn *tcp_hash_lookup(const struct ctx *c, /** * tcp_flow_defer() - Deferred per-flow handling (clean up closed connections) - * @c: Execution context * @flow: Flow table entry for this connection + * + * Return: true if the flow is ready to free, false otherwise */ -void tcp_flow_defer(const struct ctx *c, union flow *flow) +bool tcp_flow_defer(union flow *flow) { const struct tcp_tap_conn *conn = &flow->tcp; if (flow->tcp.events != CLOSED) - return; + return false; close(conn->sock); if (conn->timer != -1) close(conn->timer); - flow_table_compact(c, flow); + return true; } static void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn); diff --git a/tcp_conn.h b/tcp_conn.h index 825155a..636224e 100644 --- a/tcp_conn.h +++ b/tcp_conn.h @@ -158,8 +158,8 @@ extern int init_sock_pool6 [TCP_SOCK_POOL_SIZE]; void tcp_tap_conn_update(const struct ctx *c, struct tcp_tap_conn *old, struct tcp_tap_conn *new); void tcp_splice_conn_update(const struct ctx *c, struct tcp_splice_conn *new); -void tcp_flow_defer(const struct ctx *c, union flow *flow); -void tcp_splice_flow_defer(const struct ctx *c, union flow *flow); +bool tcp_flow_defer(union flow *flow); +bool tcp_splice_flow_defer(union flow *flow); void tcp_splice_timer(const struct ctx *c, union flow *flow); int tcp_conn_pool_sock(int pool[]); int tcp_conn_new_sock(const struct ctx *c, sa_family_t af); diff --git a/tcp_splice.c b/tcp_splice.c index 3f6f1b3..daef7de 100644 --- a/tcp_splice.c +++ b/tcp_splice.c @@ -244,16 +244,17 @@ void tcp_splice_conn_update(const struct ctx *c, struct tcp_splice_conn *new) /** * tcp_splice_flow_defer() - Deferred per-flow handling (clean up closed) - * @c: Execution context * @flow: Flow table entry for this connection + * + * Return: true if the flow is ready to free, false otherwise */ -void tcp_splice_flow_defer(const struct ctx *c, union flow *flow) +bool tcp_splice_flow_defer(union flow *flow) { struct tcp_splice_conn *conn = &flow->tcp_splice; unsigned side; if (!(flow->tcp_splice.flags & CLOSING)) - return; + return false; for (side = 0; side < SIDES; side++) { if (conn->events & SPLICE_ESTABLISHED) { @@ -277,7 +278,7 @@ void tcp_splice_flow_defer(const struct ctx *c, union flow *flow) conn->flags = 0; flow_dbg(conn, "CLOSED"); - flow_table_compact(c, flow); + return true; } /** -- 2.43.0
Currently we always keep the flow table maximally compact: that is all the active entries are contiguous at the start of the table. Doing this sometimes requires moving an entry when one is freed. That's kind of fiddly, and potentially expensive: it requires updating the hash table for the new location, and depending on flow type, it may require EPOLL_CTL_MOD, system calls to update epoll tags with the new location too. Implement a new way of managing the flow table that doesn't ever move entries. It attempts to maintain some compactness by always using the first free slot for a new connection, and mitigates the effect of non compactness by cheaply skipping over contiguous blocks of free entries. See the "theory of operation" comment in flow.c for details. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au>b --- flow.c | 195 ++++++++++++++++++++++++++++++++++++++------------- flow.h | 1 + flow_table.h | 16 ++++- passt.c | 2 + tcp.c | 23 ------ tcp_conn.h | 3 - tcp_splice.c | 11 --- 7 files changed, 164 insertions(+), 87 deletions(-) diff --git a/flow.c b/flow.c index d6650fc..30c539e 100644 --- a/flow.c +++ b/flow.c @@ -26,7 +26,59 @@ static_assert(ARRAY_SIZE(flow_type_str) == FLOW_NUM_TYPES, "flow_type_str[] doesn't match enum flow_type"); /* Global Flow Table */ -unsigned flow_count; + +/** + * DOC: Theory of Operation - allocating and freeing flow entries + * + * Flows are entries in flowtab[]. We need to routinely scan the whole table to + * perform deferred bookkeeping tasks on active entries, and sparse empty slots + * waste time and worsen data locality. But, keeping the table fully compact by + * moving entries on deletion is fiddly: it requires updating hash tables, and + * the epoll references to flows. Instead, we implement the compromise described + * below. + * + * Free clusters + * A "free cluster" is a contiguous set of unused (FLOW_TYPE_NONE) entries in + * flowtab[]. The first entry in each cluster contains metadata ('free' + * field in union flow), specifically the number of entries in the cluster + * (free.n), and the index of the next free cluster (free.next). The entries + * in the cluster other than the first should have n == next == 0. + * + * Free cluster list + * flow_first_free gives the index of the first (lowest index) free cluster. + * Each free cluster has the index of the next free cluster, or MAX_FLOW if + * it is the last free cluster. Together these form a linked list of free + * clusters, in strictly increasing order of index. + * + * Allocating + * We always allocate a new flow into the lowest available index, i.e. the + * first entry of the first free cluster, that is, at index flow_first_free. + * We update flow_first_free and the free cluster to maintain the invariants + * above (so the free cluster list is still in strictly increasing order). + * + * Freeing + * It's not possible to maintain the invariants above if we allow freeing of + * any entry at any time. So we only allow freeing in two cases. + * + * 1) flow_alloc_cancel() will free the most recent allocation. We can + * maintain the invariants because we know that allocation was made in the + * lowest available slot, and so will become the lowest index free slot again + * after cancellation. + * + * 2) Flows can be freed by returning true from the flow type specific + * deferred or timer function. These are called from flow_defer_handler() + * which is already scanning the whole table in index order. We can use that + * to rebuild the free cluster list correctly, either merging them into + * existing free clusters or creating new free clusters in the list for them. + * + * Scanning the table + * Theoretically, scanning the table requires FLOW_MAX iterations. However, + * when we encounter the start of a free cluster, we can immediately skip + * past it, meaning that in practice we only need (number of active + * connections) + (number of free clusters) iterations. + */ + +unsigned flow_first_free; union flow flowtab[FLOW_MAX]; /* Last time the flow timers ran */ @@ -57,10 +109,32 @@ void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...) */ union flow *flow_alloc(void) { - if (flow_count >= FLOW_MAX) + union flow *flow = &flowtab[flow_first_free]; + + if (flow_first_free >= FLOW_MAX) return NULL; - return &flowtab[flow_count++]; + ASSERT(flow->f.type == FLOW_TYPE_NONE); + ASSERT(flow->free.n >= 1); + ASSERT(flow_first_free + flow->free.n <= FLOW_MAX); + + if (flow->free.n > 1) { + /* Use one entry from the cluster */ + union flow *next = &flowtab[++flow_first_free]; + + ASSERT(FLOW_IDX(next) < FLOW_MAX); + ASSERT(next->f.type == FLOW_TYPE_NONE); + ASSERT(next->free.n == 0); + + next->free.n = flow->free.n - 1; + next->free.next = flow->free.next; + } else { + /* Use the entire cluster */ + flow_first_free = flow->free.next; + } + + memset(flow, 0, sizeof(*flow)); + return flow; } /** @@ -71,48 +145,15 @@ union flow *flow_alloc(void) */ void flow_alloc_cancel(union flow *flow) { - ASSERT(FLOW_IDX(flow) == flow_count - 1); - memset(flow, 0, sizeof(*flow)); - flow_count--; -} - -/** - * flow_table_compact() - Perform compaction on flow table - * @c: Execution context - * @hole: Pointer to recently closed flow - */ -static void flow_table_compact(const struct ctx *c, union flow *hole) -{ - union flow *from; - - if (FLOW_IDX(hole) == --flow_count) { - debug("flow: table compaction: maximum index was %u (%p)", - FLOW_IDX(hole), (void *)hole); - memset(hole, 0, sizeof(*hole)); - return; - } - - from = flowtab + flow_count; - memcpy(hole, from, sizeof(*hole)); - - switch (from->f.type) { - case FLOW_TCP: - tcp_tap_conn_update(c, &from->tcp, &hole->tcp); - break; - case FLOW_TCP_SPLICE: - tcp_splice_conn_update(c, &hole->tcp_splice); - break; - default: - die("Unexpected %s in tcp_table_compact()", - FLOW_TYPE(&from->f)); - } - - debug("flow: table compaction (%s): old index %u, new index %u, " - "from: %p, to: %p", - FLOW_TYPE(&from->f), FLOW_IDX(from), FLOW_IDX(hole), - (void *)from, (void *)hole); - - memset(from, 0, sizeof(*from)); + ASSERT(flow_first_free > FLOW_IDX(flow)); + + flow->f.type = FLOW_TYPE_NONE; + /* Put it back in a length 1 free cluster, don't attempt to fully + * reverse flow_alloc()s steps. This will get folded together the next + * time flow_defer_handler runs anyway() */ + flow->free.n = 1; + flow->free.next = flow_first_free; + flow_first_free = FLOW_IDX(flow); } /** @@ -122,18 +163,46 @@ static void flow_table_compact(const struct ctx *c, union flow *hole) */ void flow_defer_handler(const struct ctx *c, const struct timespec *now) { + struct flow_free_cluster *free_head = NULL; + unsigned *last_next = &flow_first_free; bool timer = false; - union flow *flow; + unsigned idx; if (timespec_diff_ms(now, &flow_timer_run) >= FLOW_TIMER_INTERVAL) { timer = true; flow_timer_run = *now; } - for (flow = flowtab + flow_count - 1; flow >= flowtab; flow--) { + for (idx = 0; idx < FLOW_MAX; idx++) { + union flow *flow = &flowtab[idx]; bool closed = false; + if (flow->f.type == FLOW_TYPE_NONE) { + unsigned skip = flow->free.n; + + /* First entry of a free cluster must have n >= 1 */ + ASSERT(skip); + + if (free_head) { + /* Merge into preceding free cluster */ + free_head->n += flow->free.n; + flow->free.n = flow->free.next = 0; + } else { + /* New free cluster, add to chain */ + free_head = &flow->free; + *last_next = idx; + last_next = &free_head->next; + } + + /* Skip remaining empty entries */ + idx += skip - 1; + continue; + } + switch (flow->f.type) { + case FLOW_TYPE_NONE: + ASSERT(false); + break; case FLOW_TCP: closed = tcp_flow_defer(flow); break; @@ -147,7 +216,35 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now) ; } - if (closed) - flow_table_compact(c, flow); + if (closed) { + flow->f.type = FLOW_TYPE_NONE; + + if (free_head) { + /* Add slot to current free cluster */ + ASSERT(idx == FLOW_IDX(free_head) + free_head->n); + free_head->n++; + flow->free.n = flow->free.next = 0; + } else { + /* Create new free cluster */ + free_head = &flow->free; + free_head->n = 1; + *last_next = idx; + last_next = &free_head->next; + } + } else { + free_head = NULL; + } } + + *last_next = FLOW_MAX; +} + +/** + * flow_init() - Initialise flow related data structures + */ +void flow_init(void) +{ + /* Initial state is a single free cluster containing the whole table */ + flowtab[0].free.n = FLOW_MAX; + flowtab[0].free.next = FLOW_MAX; } diff --git a/flow.h b/flow.h index 8064f0e..48a0ab4 100644 --- a/flow.h +++ b/flow.h @@ -68,6 +68,7 @@ static inline bool flow_sidx_eq(flow_sidx_t a, flow_sidx_t b) union flow; +void flow_init(void); void flow_defer_handler(const struct ctx *c, const struct timespec *now); void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...) diff --git a/flow_table.h b/flow_table.h index 2773a2b..eecf884 100644 --- a/flow_table.h +++ b/flow_table.h @@ -9,6 +9,19 @@ #include "tcp_conn.h" +/** + * struct flow_free_cluster - Information about a cluster of free entries + * @f: Generic flow information + * @n: Number of entries in the free cluster (including this one) + * @next: Index of next free cluster + */ +struct flow_free_cluster { + /* Must be first element */ + struct flow_common f; + unsigned n; + unsigned next; +}; + /** * union flow - Descriptor for a logical packet flow (e.g. connection) * @f: Fields common between all variants @@ -17,12 +30,13 @@ */ union flow { struct flow_common f; + struct flow_free_cluster free; struct tcp_tap_conn tcp; struct tcp_splice_conn tcp_splice; }; /* Global Flow Table */ -extern unsigned flow_count; +extern unsigned flow_first_free; extern union flow flowtab[]; diff --git a/passt.c b/passt.c index 71bea8f..d315438 100644 --- a/passt.c +++ b/passt.c @@ -285,6 +285,8 @@ int main(int argc, char **argv) clock_gettime(CLOCK_MONOTONIC, &now); + flow_init(); + if ((!c.no_udp && udp_init(&c)) || (!c.no_tcp && tcp_init(&c))) exit(EXIT_FAILURE); diff --git a/tcp.c b/tcp.c index ee2c3af..905d26f 100644 --- a/tcp.c +++ b/tcp.c @@ -1251,29 +1251,6 @@ static void tcp_hash_remove(const struct ctx *c, tc_hash[b] = FLOW_SIDX_NONE; } -/** - * tcp_tap_conn_update() - Update tcp_tap_conn when being moved in the table - * @c: Execution context - * @old: Old location of tcp_tap_conn - * @new: New location of tcp_tap_conn - */ -void tcp_tap_conn_update(const struct ctx *c, struct tcp_tap_conn *old, - struct tcp_tap_conn *new) - -{ - unsigned b = tcp_hash_probe(c, old); - - if (!flow_at_sidx(tc_hash[b])) - return; /* Not in hash table, nothing to update */ - - tc_hash[b] = FLOW_SIDX(new, TAPSIDE); - - debug("TCP: hash table update: old index %u, new index %u, sock %i, " - "bucket: %u", FLOW_IDX(old), FLOW_IDX(new), new->sock, b); - - tcp_epoll_ctl(c, new); -} - /** * tcp_hash_lookup() - Look up connection given remote address and ports * @c: Execution context diff --git a/tcp_conn.h b/tcp_conn.h index 636224e..a5f5cfe 100644 --- a/tcp_conn.h +++ b/tcp_conn.h @@ -155,9 +155,6 @@ struct tcp_splice_conn { extern int init_sock_pool4 [TCP_SOCK_POOL_SIZE]; extern int init_sock_pool6 [TCP_SOCK_POOL_SIZE]; -void tcp_tap_conn_update(const struct ctx *c, struct tcp_tap_conn *old, - struct tcp_tap_conn *new); -void tcp_splice_conn_update(const struct ctx *c, struct tcp_splice_conn *new); bool tcp_flow_defer(union flow *flow); bool tcp_splice_flow_defer(union flow *flow); void tcp_splice_timer(const struct ctx *c, union flow *flow); diff --git a/tcp_splice.c b/tcp_splice.c index daef7de..26d3206 100644 --- a/tcp_splice.c +++ b/tcp_splice.c @@ -231,17 +231,6 @@ static void conn_event_do(const struct ctx *c, struct tcp_splice_conn *conn, } while (0) -/** - * tcp_splice_conn_update() - Update tcp_splice_conn when being moved in the table - * @c: Execution context - * @new: New location of tcp_splice_conn - */ -void tcp_splice_conn_update(const struct ctx *c, struct tcp_splice_conn *new) -{ - if (tcp_splice_epoll_ctl(c, new)) - conn_flag(c, new, CLOSING); -} - /** * tcp_splice_flow_defer() - Deferred per-flow handling (clean up closed) * @flow: Flow table entry for this connection -- 2.43.0
On Tue, 16 Jan 2024 11:50:30 +1100 David Gibson <david(a)gibson.dropbear.id.au> wrote:There are a number of things that are more-or-less general to flows which are still explicitly handled in tcp.c and tcp_splice.c including allocation and freeing of flow entries, and dispatch of deferred and timer functions. Even without adding more fields to the common flow structure, we can handle a number of these in a more flow-centric way. Unlike v1 this version is based on the hash table rework series.Applied. -- Stefano