[PATCH v2 00/11] Introduce unified flow table, first steps

David Gibson

27 Nov 2023 27 Nov '23

12:33 a.m.

Here's my latest revision of some of the basics of the flow table. So far it's basically just a renaming of the existing TCP connection table, along with some associated helpers. It's used for some new logging infrastructure, but otherwise doesn't really function any differently. However, this subset of the flow table work no longer bloats flow/connection entries over a single cache line. That removes the most prominent drawback of earlier revisions, meaning I think this series is ready for merge now. Doing so will mean the later series making more substantive changes to the flow behaviour are simpler. Tested on top of the patch updating shell prompt escape handling, but should be independent of it. Changes since v1: * Removed a inaccurate stale comment * Added doc comment to FLOW() macro * Added new patches cleaning up signedness of 'side' variables * Added new patches introducing "sidx"s (flow+side indices) David Gibson (11): flow, tcp: Generalise connection types flow, tcp: Move TCP connection table to unified flow table flow, tcp: Consolidate flow pointer<->index helpers util: MAX_FROM_BITS() should be unsigned flow: Make unified version of flow table compaction flow, tcp: Add logging helpers for connection related messages flow: Introduce 'sidx' type to represent one side of one flow tcp: Remove unneccessary bounds check in tcp_timer_handler() flow,tcp: Generalise TCP epoll_ref to generic flows tcp_splice: Use unsigned to represent side flow,tcp: Use epoll_ref type including flow and side Makefile | 14 +-- flow.c | 84 ++++++++++++++++++ flow.h | 73 ++++++++++++++++ flow_table.h | 86 ++++++++++++++++++ passt.h | 13 ++- tcp.c | 243 ++++++++++++++++++++++++--------------------------- tcp.h | 5 -- tcp_conn.h | 46 +++------- tcp_splice.c | 128 ++++++++++++--------------- tcp_splice.h | 2 +- util.h | 2 +- 11 files changed, 440 insertions(+), 256 deletions(-) create mode 100644 flow.c create mode 100644 flow.h create mode 100644 flow_table.h -- 2.43.0

Show replies by date

David Gibson

27 Nov 27 Nov

12:33 a.m.

New subject: [PATCH v2 01/11] flow, tcp: Generalise connection types

Currently TCP connections use a 1-bit selector, 'spliced', to determine the rest of the contents of the structure. We want to generalise the TCP connection table to other types of flows in other protocols. Make a start on this by replacing the tcp_conn_common structure with a new flow_common structure with an enum rather than a simple boolean indicating the type of flow. Signed-off-by: David Gibson --- Makefile | 8 +++---- flow.c | 16 +++++++++++++ flow.h | 36 ++++++++++++++++++++++++++++++ tcp.c | 63 +++++++++++++++++++++++++++++++++++++--------------- tcp_conn.h | 24 ++++++-------------- tcp_splice.c | 3 ++- 6 files changed, 110 insertions(+), 40 deletions(-) create mode 100644 flow.c create mode 100644 flow.h diff --git a/Makefile b/Makefile index 6c53695..e2970e3 100644 --- a/Makefile +++ b/Makefile @@ -44,15 +44,15 @@ FLAGS += -DARCH=\"$(TARGET_ARCH)\" FLAGS += -DVERSION=\"$(VERSION)\" FLAGS += -DDUAL_STACK_SOCKETS=$(DUAL_STACK_SOCKETS) -PASST_SRCS = arch.c arp.c checksum.c conf.c dhcp.c dhcpv6.c icmp.c igmp.c \ - isolation.c lineread.c log.c mld.c ndp.c netlink.c packet.c passt.c \ - pasta.c pcap.c port_fwd.c tap.c tcp.c tcp_splice.c udp.c util.c +PASST_SRCS = arch.c arp.c checksum.c conf.c dhcp.c dhcpv6.c flow.c icmp.c \ + igmp.c isolation.c lineread.c log.c mld.c ndp.c netlink.c packet.c \ + passt.c pasta.c pcap.c port_fwd.c tap.c tcp.c tcp_splice.c udp.c util.c QRAP_SRCS = qrap.c SRCS = $(PASST_SRCS) $(QRAP_SRCS) MANPAGES = passt.1 pasta.1 qrap.1 -PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h icmp.h \ +PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h flow.h icmp.h \ inany.h isolation.h lineread.h log.h ndp.h netlink.h packet.h passt.h \ pasta.h pcap.h pif.h port_fwd.h siphash.h tap.h tcp.h tcp_conn.h \ tcp_splice.h udp.h util.h diff --git a/flow.c b/flow.c new file mode 100644 index 0000000..7c3603c --- /dev/null +++ b/flow.c @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later + * Copyright Red Hat + * Author: David Gibson + * + * Tracking for logical "flows" of packets. + */ + +#include + +#include "flow.h" + +const char *flow_type_str[] = { + [FLOW_TYPE_NONE] = "<none>", + [FLOW_TCP] = "TCP connection", + [FLOW_TCP_SPLICE] = "TCP connection (spliced)", +}; diff --git a/flow.h b/flow.h new file mode 100644 index 0000000..88e6f0b --- /dev/null +++ b/flow.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later + * Copyright Red Hat + * Author: David Gibson + * + * Tracking for logical "flows" of packets. + */ +#ifndef FLOW_H +#define FLOW_H + +/** + * enum flow_type - Different types of packet flows we track + */ +enum flow_type { + /* Represents an invalid or unused flow */ + FLOW_TYPE_NONE = 0, + /* A TCP connection between a socket and tap interface */ + FLOW_TCP, + /* A TCP connection between a host socket and ns socket */ + FLOW_TCP_SPLICE, + + FLOW_TYPE_MAX = FLOW_TCP_SPLICE, +}; + +extern const char *flow_type_str[]; +#define FLOW_TYPE(f) \ + ((f)->type <= FLOW_TYPE_MAX ? flow_type_str[(f)->type] : "?") + +/** + * struct flow_common - Common fields for packet flows + * @type: Type of packet flow + */ +struct flow_common { + uint8_t type; +}; + +#endif /* FLOW_H */ diff --git a/tcp.c b/tcp.c index 40e3dec..1c25032 100644 --- a/tcp.c +++ b/tcp.c @@ -299,6 +299,7 @@ #include "tcp_splice.h" #include "log.h" #include "inany.h" +#include "flow.h" #include "tcp_conn.h" @@ -584,7 +585,7 @@ static inline struct tcp_tap_conn *conn_at_idx(int idx) { if ((idx < 0) || (idx >= TCP_MAX_CONNS)) return NULL; - ASSERT(!(CONN(idx)->c.spliced)); + ASSERT(CONN(idx)->f.type == FLOW_TCP); return CONN(idx); } @@ -1319,14 +1320,21 @@ void tcp_table_compact(struct ctx *c, union tcp_conn *hole) from = tc + c->tcp.conn_count; memcpy(hole, from, sizeof(*hole)); - if (from->c.spliced) - tcp_splice_conn_update(c, &hole->splice); - else + switch (from->f.type) { + case FLOW_TCP: tcp_tap_conn_update(c, &from->tap, &hole->tap); + break; + case FLOW_TCP_SPLICE: + tcp_splice_conn_update(c, &hole->splice); + break; + default: + die("Unexpected %s in tcp_table_compact()", + FLOW_TYPE(&from->f)); + } - debug("TCP: table compaction (spliced=%d): old index %li, new index %li, " + debug("TCP: table compaction (%s): old index %li, new index %li, " "from: %p, to: %p", - from->c.spliced, CONN_IDX(from), CONN_IDX(hole), + FLOW_TYPE(&from->f), CONN_IDX(from), CONN_IDX(hole), (void *)from, (void *)hole); memset(from, 0, sizeof(*from)); @@ -1402,12 +1410,18 @@ void tcp_defer_handler(struct ctx *c) tcp_l2_data_buf_flush(c); for (conn = tc + c->tcp.conn_count - 1; conn >= tc; conn--) { - if (conn->c.spliced) { - if (conn->splice.flags & CLOSING) - tcp_splice_destroy(c, conn); - } else { + switch (conn->f.type) { + case FLOW_TCP: if (conn->tap.events == CLOSED) tcp_conn_destroy(c, conn); + break; + case FLOW_TCP_SPLICE: + if (conn->splice.flags & CLOSING) + tcp_splice_destroy(c, conn); + break; + default: + die("Unexpected %s in tcp_defer_handler()", + FLOW_TYPE(&conn->f)); } } } @@ -2017,7 +2031,7 @@ static void tcp_conn_from_tap(struct ctx *c, } conn = CONN(c->tcp.conn_count++); - conn->c.spliced = false; + conn->f.type = FLOW_TCP; conn->sock = s; conn->timer = -1; conn_event(c, conn, TAP_SYN_RCVD); @@ -2726,7 +2740,7 @@ static void tcp_tap_conn_from_sock(struct ctx *c, const struct sockaddr *sa, const struct timespec *now) { - conn->c.spliced = false; + conn->f.type = FLOW_TCP; conn->sock = s; conn->timer = -1; conn->ws_to_tap = conn->ws_from_tap = 0; @@ -2909,10 +2923,17 @@ void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events) { union tcp_conn *conn = tc + ref.tcp.index; - if (conn->c.spliced) - tcp_splice_sock_handler(c, &conn->splice, ref.fd, events); - else + switch (conn->f.type) { + case FLOW_TCP: tcp_tap_sock_handler(c, &conn->tap, events); + break; + case FLOW_TCP_SPLICE: + tcp_splice_sock_handler(c, &conn->splice, ref.fd, events); + break; + default: + die("Unexpected %s in tcp_sock_handler_compact()", + FLOW_TYPE(&conn->f)); + } } /** @@ -3244,11 +3265,17 @@ void tcp_timer(struct ctx *c, const struct timespec *ts) } for (conn = tc + c->tcp.conn_count - 1; conn >= tc; conn--) { - if (conn->c.spliced) { - tcp_splice_timer(c, conn); - } else { + switch (conn->f.type) { + case FLOW_TCP: if (conn->tap.events == CLOSED) tcp_conn_destroy(c, conn); + break; + case FLOW_TCP_SPLICE: + tcp_splice_timer(c, conn); + break; + default: + die("Unexpected %s in tcp_timer()", + FLOW_TYPE(&conn->f)); } } diff --git a/tcp_conn.h b/tcp_conn.h index 0c6a35b..136f8da 100644 --- a/tcp_conn.h +++ b/tcp_conn.h @@ -9,19 +9,9 @@ #ifndef TCP_CONN_H #define TCP_CONN_H -/** - * struct tcp_conn_common - Common fields for spliced and non-spliced - * @spliced: Is this a spliced connection? - */ -struct tcp_conn_common { - bool spliced :1; -}; - -extern const char *tcp_common_flag_str[]; - /** * struct tcp_tap_conn - Descriptor for a TCP connection (not spliced) - * @c: Fields common with tcp_splice_conn + * @f: Generic flow information * @in_epoll: Is the connection in the epoll set? * @next_index: Connection index of next item in hash chain, -1 for none * @tap_mss: MSS advertised by tap/guest, rounded to 2 ^ TCP_MSS_BITS @@ -46,8 +36,8 @@ extern const char *tcp_common_flag_str[]; * @seq_init_from_tap: Initial sequence number from tap */ struct tcp_tap_conn { - /* Must be first element to match tcp_splice_conn */ - struct tcp_conn_common c; + /* Must be first element */ + struct flow_common f; bool in_epoll :1; int next_index :TCP_CONN_INDEX_BITS + 2; @@ -121,7 +111,7 @@ struct tcp_tap_conn { #define SIDES 2 /** * struct tcp_splice_conn - Descriptor for a spliced TCP connection - * @c: Fields common with tcp_tap_conn + * @f: Generic flow information * @in_epoll: Is the connection in the epoll set? * @s: File descriptor for sockets * @pipe: File descriptors for pipes @@ -131,8 +121,8 @@ struct tcp_tap_conn { * @written: Bytes written (not fully written from one other side read) */ struct tcp_splice_conn { - /* Must be first element to match tcp_tap_conn */ - struct tcp_conn_common c; + /* Must be first element */ + struct flow_common f; bool in_epoll :1; int s[SIDES]; @@ -168,7 +158,7 @@ struct tcp_splice_conn { * @splice: Fields specific to spliced connections */ union tcp_conn { - struct tcp_conn_common c; + struct flow_common f; struct tcp_tap_conn tap; struct tcp_splice_conn splice; }; diff --git a/tcp_splice.c b/tcp_splice.c index a5c1332..bfd2bd1 100644 --- a/tcp_splice.c +++ b/tcp_splice.c @@ -54,6 +54,7 @@ #include "tcp_splice.h" #include "siphash.h" #include "inany.h" +#include "flow.h" #include "tcp_conn.h" @@ -476,7 +477,7 @@ bool tcp_splice_conn_from_sock(const struct ctx *c, if (setsockopt(s, SOL_TCP, TCP_QUICKACK, &((int){ 1 }), sizeof(int))) trace("TCP (spliced): failed to set TCP_QUICKACK on %i", s); - conn->c.spliced = true; + conn->f.type = FLOW_TCP_SPLICE; conn->s[0] = s; if (tcp_splice_new(c, conn, ref.port, ref.pif)) -- 2.43.0

David Gibson

12:33 a.m.

New subject: [PATCH v2 02/11] flow, tcp: Move TCP connection table to unified flow table

We want to generalise "connection" tracking to things other than true TCP connections. Continue implenenting this by renaming the TCP connection table to the "flow table" and moving it to flow.c. The definitions are split between flow.h and flow_table.h - we need this separation to avoid circular dependencies: the definitions in flow.h will be needed by many headers using the flow mechanism, but flow_table.h needs all those protocol specific headers in order to define the full flow table entry. Signed-off-by: David Gibson --- Makefile | 8 ++--- flow.c | 11 +++++++ flow.h | 8 +++++ flow_table.h | 25 +++++++++++++++ passt.h | 3 ++ tcp.c | 87 +++++++++++++++++++++++++--------------------------- tcp.h | 5 --- tcp_conn.h | 23 +++----------- tcp_splice.c | 19 ++++++------ 9 files changed, 107 insertions(+), 82 deletions(-) create mode 100644 flow_table.h diff --git a/Makefile b/Makefile index e2970e3..d14b029 100644 --- a/Makefile +++ b/Makefile @@ -52,10 +52,10 @@ SRCS = $(PASST_SRCS) $(QRAP_SRCS) MANPAGES = passt.1 pasta.1 qrap.1 -PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h flow.h icmp.h \ - inany.h isolation.h lineread.h log.h ndp.h netlink.h packet.h passt.h \ - pasta.h pcap.h pif.h port_fwd.h siphash.h tap.h tcp.h tcp_conn.h \ - tcp_splice.h udp.h util.h +PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h flow.h \ + flow_table.h icmp.h inany.h isolation.h lineread.h log.h ndp.h \ + netlink.h packet.h passt.h pasta.h pcap.h pif.h port_fwd.h siphash.h \ + tap.h tcp.h tcp_conn.h tcp_splice.h udp.h util.h HEADERS = $(PASST_HEADERS) seccomp.h C := \#include \nstruct tcp_info x = { .tcpi_snd_wnd = 0 }; diff --git a/flow.c b/flow.c index 7c3603c..a9bb8f5 100644 --- a/flow.c +++ b/flow.c @@ -6,11 +6,22 @@ */ #include +#include +#include +#include "util.h" +#include "passt.h" +#include "siphash.h" +#include "inany.h" #include "flow.h" +#include "tcp_conn.h" +#include "flow_table.h" const char *flow_type_str[] = { [FLOW_TYPE_NONE] = "<none>", [FLOW_TCP] = "TCP connection", [FLOW_TCP_SPLICE] = "TCP connection (spliced)", }; + +/* Global Flow Table */ +union flow flowtab[FLOW_MAX]; diff --git a/flow.h b/flow.h index 88e6f0b..57289a8 100644 --- a/flow.h +++ b/flow.h @@ -33,4 +33,12 @@ struct flow_common { uint8_t type; }; +#define FLOW_INDEX_BITS 17 /* 128k - 1 */ +#define FLOW_MAX MAX_FROM_BITS(FLOW_INDEX_BITS) + +#define FLOW_TABLE_PRESSURE 30 /* % of FLOW_MAX */ +#define FLOW_FILE_PRESSURE 30 /* % of c->nofile */ + +union flow; + #endif /* FLOW_H */ diff --git a/flow_table.h b/flow_table.h new file mode 100644 index 0000000..c4c646b --- /dev/null +++ b/flow_table.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later + * Copyright Red Hat + * Author: David Gibson + * + * Definitions for the global table of packet flows. + */ +#ifndef FLOW_TABLE_H +#define FLOW_TABLE_H + +/** + * union flow - Descriptor for a logical packet flow (e.g. connection) + * @f: Fields common between all variants + * @tcp: Fields for non-spliced TCP connections + * @tcp_splice: Fields for spliced TCP connections +*/ +union flow { + struct flow_common f; + struct tcp_tap_conn tcp; + struct tcp_splice_conn tcp_splice; +}; + +/* Global Flow Table */ +extern union flow flowtab[]; + +#endif /* FLOW_TABLE_H */ diff --git a/passt.h b/passt.h index cac720a..3f5dfb9 100644 --- a/passt.h +++ b/passt.h @@ -219,6 +219,7 @@ struct ip6_ctx { * @pasta_conf_ns: Configure namespace after creating it * @no_copy_routes: Don't copy all routes when configuring target namespace * @no_copy_addrs: Don't copy all addresses when configuring namespace + * @flow_count: Number of tracked packet flows (connections etc.) * @no_tcp: Disable TCP operation * @tcp: Context for TCP protocol handler * @no_tcp: Disable UDP operation @@ -277,6 +278,8 @@ struct ctx { int no_copy_routes; int no_copy_addrs; + unsigned flow_count; + int no_tcp; struct tcp_ctx tcp; int no_udp; diff --git a/tcp.c b/tcp.c index 1c25032..0119bd3 100644 --- a/tcp.c +++ b/tcp.c @@ -302,14 +302,14 @@ #include "flow.h" #include "tcp_conn.h" +#include "flow_table.h" #define TCP_FRAMES_MEM 128 #define TCP_FRAMES \ (c->mode == MODE_PASST ? TCP_FRAMES_MEM : 1) #define TCP_HASH_TABLE_LOAD 70 /* % */ -#define TCP_HASH_TABLE_SIZE (TCP_MAX_CONNS * 100 / \ - TCP_HASH_TABLE_LOAD) +#define TCP_HASH_TABLE_SIZE (FLOW_MAX * 100 / TCP_HASH_TABLE_LOAD) #define MAX_WS 8 #define MAX_WINDOW (1 << (16 + (MAX_WS))) @@ -570,11 +570,8 @@ tcp6_l2_flags_buf[TCP_FRAMES_MEM]; static unsigned int tcp6_l2_flags_buf_used; -/* TCP connections */ -union tcp_conn tc[TCP_MAX_CONNS]; - -#define CONN(idx) (&tc[(idx)].tap) -#define CONN_IDX(conn) ((union tcp_conn *)(conn) - tc) +#define CONN(idx) (&flowtab[(idx)].tcp) +#define CONN_IDX(conn) ((union flow *)(conn) - flowtab) /** conn_at_idx() - Find a connection by index, if present * @idx: Index of connection to lookup @@ -583,7 +580,7 @@ union tcp_conn tc[TCP_MAX_CONNS]; */ static inline struct tcp_tap_conn *conn_at_idx(int idx) { - if ((idx < 0) || (idx >= TCP_MAX_CONNS)) + if ((idx < 0) || (idx >= FLOW_MAX)) return NULL; ASSERT(CONN(idx)->f.type == FLOW_TCP); return CONN(idx); @@ -1306,26 +1303,26 @@ static struct tcp_tap_conn *tcp_hash_lookup(const struct ctx *c, * @c: Execution context * @hole: Pointer to recently closed connection */ -void tcp_table_compact(struct ctx *c, union tcp_conn *hole) +void tcp_table_compact(struct ctx *c, union flow *hole) { - union tcp_conn *from; + union flow *from; - if (CONN_IDX(hole) == --c->tcp.conn_count) { + if (CONN_IDX(hole) == --c->flow_count) { debug("TCP: table compaction: maximum index was %li (%p)", CONN_IDX(hole), (void *)hole); memset(hole, 0, sizeof(*hole)); return; } - from = tc + c->tcp.conn_count; + from = flowtab + c->flow_count; memcpy(hole, from, sizeof(*hole)); switch (from->f.type) { case FLOW_TCP: - tcp_tap_conn_update(c, &from->tap, &hole->tap); + tcp_tap_conn_update(c, &from->tcp, &hole->tcp); break; case FLOW_TCP_SPLICE: - tcp_splice_conn_update(c, &hole->splice); + tcp_splice_conn_update(c, &hole->tcp_splice); break; default: die("Unexpected %s in tcp_table_compact()", @@ -1343,18 +1340,18 @@ void tcp_table_compact(struct ctx *c, union tcp_conn *hole) /** * tcp_conn_destroy() - Close sockets, trigger hash table removal and compaction * @c: Execution context - * @conn_union: Connection pointer (container union) + * @flow: Flow table entry for this connection */ -static void tcp_conn_destroy(struct ctx *c, union tcp_conn *conn_union) +static void tcp_conn_destroy(struct ctx *c, union flow *flow) { - const struct tcp_tap_conn *conn = &conn_union->tap; + const struct tcp_tap_conn *conn = &flow->tcp; close(conn->sock); if (conn->timer != -1) close(conn->timer); tcp_hash_remove(c, conn); - tcp_table_compact(c, conn_union); + tcp_table_compact(c, flow); } static void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn); @@ -1404,24 +1401,24 @@ static void tcp_l2_data_buf_flush(const struct ctx *c) */ void tcp_defer_handler(struct ctx *c) { - union tcp_conn *conn; + union flow *flow; tcp_l2_flags_buf_flush(c); tcp_l2_data_buf_flush(c); - for (conn = tc + c->tcp.conn_count - 1; conn >= tc; conn--) { - switch (conn->f.type) { + for (flow = flowtab + c->flow_count - 1; flow >= flowtab; flow--) { + switch (flow->f.type) { case FLOW_TCP: - if (conn->tap.events == CLOSED) - tcp_conn_destroy(c, conn); + if (flow->tcp.events == CLOSED) + tcp_conn_destroy(c, flow); break; case FLOW_TCP_SPLICE: - if (conn->splice.flags & CLOSING) - tcp_splice_destroy(c, conn); + if (flow->tcp_splice.flags & CLOSING) + tcp_splice_destroy(c, flow); break; default: die("Unexpected %s in tcp_defer_handler()", - FLOW_TYPE(&conn->f)); + FLOW_TYPE(&flow->f)); } } } @@ -2004,7 +2001,7 @@ static void tcp_conn_from_tap(struct ctx *c, (void)saddr; - if (c->tcp.conn_count >= TCP_MAX_CONNS) + if (c->flow_count >= FLOW_MAX) return; if ((s = tcp_conn_pool_sock(pool)) < 0) @@ -2030,7 +2027,7 @@ static void tcp_conn_from_tap(struct ctx *c, } } - conn = CONN(c->tcp.conn_count++); + conn = CONN(c->flow_count++); conn->f.type = FLOW_TCP; conn->sock = s; conn->timer = -1; @@ -2775,24 +2772,24 @@ void tcp_listen_handler(struct ctx *c, union epoll_ref ref, { struct sockaddr_storage sa; socklen_t sl = sizeof(sa); - union tcp_conn *conn; + union flow *flow; int s; - if (c->no_tcp || c->tcp.conn_count >= TCP_MAX_CONNS) + if (c->no_tcp || c->flow_count >= FLOW_MAX) return; s = accept4(ref.fd, (struct sockaddr *)&sa, &sl, SOCK_NONBLOCK); if (s < 0) return; - conn = tc + c->tcp.conn_count++; + flow = flowtab + c->flow_count++; if (c->mode == MODE_PASTA && - tcp_splice_conn_from_sock(c, ref.tcp_listen, &conn->splice, + tcp_splice_conn_from_sock(c, ref.tcp_listen, &flow->tcp_splice, s, (struct sockaddr *)&sa)) return; - tcp_tap_conn_from_sock(c, ref.tcp_listen, &conn->tap, s, + tcp_tap_conn_from_sock(c, ref.tcp_listen, &flow->tcp, s, (struct sockaddr *)&sa, now); } @@ -2921,18 +2918,18 @@ static void tcp_tap_sock_handler(struct ctx *c, struct tcp_tap_conn *conn, */ void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events) { - union tcp_conn *conn = tc + ref.tcp.index; + union flow *flow = flowtab + ref.tcp.index; - switch (conn->f.type) { + switch (flow->f.type) { case FLOW_TCP: - tcp_tap_sock_handler(c, &conn->tap, events); + tcp_tap_sock_handler(c, &flow->tcp, events); break; case FLOW_TCP_SPLICE: - tcp_splice_sock_handler(c, &conn->splice, ref.fd, events); + tcp_splice_sock_handler(c, &flow->tcp_splice, ref.fd, events); break; default: die("Unexpected %s in tcp_sock_handler_compact()", - FLOW_TYPE(&conn->f)); + FLOW_TYPE(&flow->f)); } } @@ -3248,7 +3245,7 @@ static int tcp_port_rebind_outbound(void *arg) */ void tcp_timer(struct ctx *c, const struct timespec *ts) { - union tcp_conn *conn; + union flow *flow; (void)ts; @@ -3264,18 +3261,18 @@ void tcp_timer(struct ctx *c, const struct timespec *ts) } } - for (conn = tc + c->tcp.conn_count - 1; conn >= tc; conn--) { - switch (conn->f.type) { + for (flow = flowtab + c->flow_count - 1; flow >= flowtab; flow--) { + switch (flow->f.type) { case FLOW_TCP: - if (conn->tap.events == CLOSED) - tcp_conn_destroy(c, conn); + if (flow->tcp.events == CLOSED) + tcp_conn_destroy(c, flow); break; case FLOW_TCP_SPLICE: - tcp_splice_timer(c, conn); + tcp_splice_timer(c, flow); break; default: die("Unexpected %s in tcp_timer()", - FLOW_TYPE(&conn->f)); + FLOW_TYPE(&flow->f)); } } diff --git a/tcp.h b/tcp.h index 06965d8..c8b738d 100644 --- a/tcp.h +++ b/tcp.h @@ -8,9 +8,6 @@ #define TCP_TIMER_INTERVAL 1000 /* ms */ -#define TCP_CONN_INDEX_BITS 17 /* 128k - 1 */ -#define TCP_MAX_CONNS MAX_FROM_BITS(TCP_CONN_INDEX_BITS) - struct ctx; void tcp_timer_handler(struct ctx *c, union epoll_ref ref); @@ -56,7 +53,6 @@ union tcp_listen_epoll_ref { /** * struct tcp_ctx - Execution context for TCP routines * @hash_secret: 128-bit secret for hash functions, ISN and hash table - * @conn_count: Count of total connections in connection table * @port_to_tap: Ports bound host-side, packets to tap or spliced * @fwd_in: Port forwarding configuration for inbound packets * @fwd_out: Port forwarding configuration for outbound packets @@ -66,7 +62,6 @@ union tcp_listen_epoll_ref { */ struct tcp_ctx { uint64_t hash_secret[2]; - int conn_count; struct port_fwd fwd_in; struct port_fwd fwd_out; struct timespec timer_run; diff --git a/tcp_conn.h b/tcp_conn.h index 136f8da..5a107fc 100644 --- a/tcp_conn.h +++ b/tcp_conn.h @@ -40,7 +40,7 @@ struct tcp_tap_conn { struct flow_common f; bool in_epoll :1; - int next_index :TCP_CONN_INDEX_BITS + 2; + int next_index :FLOW_INDEX_BITS + 2; #define TCP_RETRANS_BITS 3 unsigned int retrans :TCP_RETRANS_BITS; @@ -151,21 +151,6 @@ struct tcp_splice_conn { uint32_t written[SIDES]; }; -/** - * union tcp_conn - Descriptor for a TCP connection (spliced or non-spliced) - * @c: Fields common between all variants - * @tap: Fields specific to non-spliced connections - * @splice: Fields specific to spliced connections -*/ -union tcp_conn { - struct flow_common f; - struct tcp_tap_conn tap; - struct tcp_splice_conn splice; -}; - -/* TCP connections */ -extern union tcp_conn tc[]; - /* Socket pools */ #define TCP_SOCK_POOL_SIZE 32 @@ -173,9 +158,9 @@ extern int init_sock_pool4 [TCP_SOCK_POOL_SIZE]; extern int init_sock_pool6 [TCP_SOCK_POOL_SIZE]; void tcp_splice_conn_update(const struct ctx *c, struct tcp_splice_conn *new); -void tcp_table_compact(struct ctx *c, union tcp_conn *hole); -void tcp_splice_destroy(struct ctx *c, union tcp_conn *conn_union); -void tcp_splice_timer(struct ctx *c, union tcp_conn *conn_union); +void tcp_table_compact(struct ctx *c, union flow *hole); +void tcp_splice_destroy(struct ctx *c, union flow *flow); +void tcp_splice_timer(struct ctx *c, union flow *flow); int tcp_conn_pool_sock(int pool[]); int tcp_conn_new_sock(const struct ctx *c, sa_family_t af); void tcp_sock_refill_pool(const struct ctx *c, int pool[], int af); diff --git a/tcp_splice.c b/tcp_splice.c index bfd2bd1..9f4831a 100644 --- a/tcp_splice.c +++ b/tcp_splice.c @@ -57,6 +57,7 @@ #include "flow.h" #include "tcp_conn.h" +#include "flow_table.h" #define MAX_PIPE_SIZE (8UL * 1024 * 1024) #define TCP_SPLICE_PIPE_POOL_SIZE 32 @@ -76,7 +77,7 @@ static int splice_pipe_pool [TCP_SPLICE_PIPE_POOL_SIZE][2]; #define CONN_V4(x) (!CONN_V6(x)) #define CONN_HAS(conn, set) ((conn->events & (set)) == (set)) #define CONN(idx) (&tc[(idx)].splice) -#define CONN_IDX(conn) ((union tcp_conn *)(conn) - tc) +#define CONN_IDX(conn) ((union flow *)(conn) - flowtab) /* Display strings for connection events */ static const char *tcp_splice_event_str[] __attribute((__unused__)) = { @@ -254,11 +255,11 @@ void tcp_splice_conn_update(const struct ctx *c, struct tcp_splice_conn *new) /** * tcp_splice_destroy() - Close spliced connection and pipes, clear * @c: Execution context - * @conn_union: Spliced connection (container union) + * @flow: Flow table entry */ -void tcp_splice_destroy(struct ctx *c, union tcp_conn *conn_union) +void tcp_splice_destroy(struct ctx *c, union flow *flow) { - struct tcp_splice_conn *conn = &conn_union->splice; + struct tcp_splice_conn *conn = &flow->tcp_splice; int side; for (side = 0; side < SIDES; side++) { @@ -283,7 +284,7 @@ void tcp_splice_destroy(struct ctx *c, union tcp_conn *conn_union) conn->flags = 0; debug("TCP (spliced): index %li, CLOSED", CONN_IDX(conn)); - tcp_table_compact(c, conn_union); + tcp_table_compact(c, flow); } /** @@ -775,15 +776,15 @@ void tcp_splice_init(struct ctx *c) /** * tcp_splice_timer() - Timer for spliced connections * @c: Execution context - * @conn_union: Spliced connection (container union) + * @flow: Flow table entry */ -void tcp_splice_timer(struct ctx *c, union tcp_conn *conn_union) +void tcp_splice_timer(struct ctx *c, union flow *flow) { - struct tcp_splice_conn *conn = &conn_union->splice; + struct tcp_splice_conn *conn = &flow->tcp_splice; int side; if (conn->flags & CLOSING) { - tcp_splice_destroy(c, conn_union); + tcp_splice_destroy(c, flow); return; } -- 2.43.0

David Gibson

12:33 a.m.

New subject: [PATCH v2 03/11] flow, tcp: Consolidate flow pointer<->index helpers

Both tcp.c and tcp_splice.c define CONN_IDX() variants to find the index of their connection structures in the connection table, now become the unified flow table. We can easily combine these into a common helper. While we're there, add some trickery for some additional type safety. They also define their own CONN() versions, which aren't so easily combined since they need to return different types, but we can have them use a common helper. In the process, we standardise on always using an unsigned type to store the connection / flow index, which makes more sense. tcp.c's conn_at_idx() remains for now, but we change its parameter to unsigned to match. That in turn means we can remove a check for negative values from it. Signed-off-by: David Gibson --- flow_table.h | 25 ++++++++++++++++++++ tcp.c | 65 ++++++++++++++++++++++++++-------------------------- tcp_conn.h | 2 +- tcp_splice.c | 21 ++++++++--------- 4 files changed, 68 insertions(+), 45 deletions(-) diff --git a/flow_table.h b/flow_table.h index c4c646b..5e897bd 100644 --- a/flow_table.h +++ b/flow_table.h @@ -22,4 +22,29 @@ union flow { /* Global Flow Table */ extern union flow flowtab[]; + +/** flow_idx - Index of flow from common structure + * @f: Common flow fields pointer + * + * Return: index of @f in the flow table + */ +static inline unsigned flow_idx(const struct flow_common *f) +{ + return (union flow *)f - flowtab; +} + +/** FLOW_IDX - Find the index of a flow + * @f_: Flow pointer, either union flow * or protocol specific + * + * Return: index of @f in the flow table + */ +#define FLOW_IDX(f_) (flow_idx(&(f_)->f)) + +/** FLOW - Flow entry at a given index + * @idx: Flow index + * + * Return: pointer to entry @idx in the flow table + */ +#define FLOW(idx) (&flowtab[(idx)]) + #endif /* FLOW_TABLE_H */ diff --git a/tcp.c b/tcp.c index 0119bd3..859df6f 100644 --- a/tcp.c +++ b/tcp.c @@ -570,17 +570,16 @@ tcp6_l2_flags_buf[TCP_FRAMES_MEM]; static unsigned int tcp6_l2_flags_buf_used; -#define CONN(idx) (&flowtab[(idx)].tcp) -#define CONN_IDX(conn) ((union flow *)(conn) - flowtab) +#define CONN(idx) (&(FLOW(idx)->tcp)) /** conn_at_idx() - Find a connection by index, if present * @idx: Index of connection to lookup * * Return: pointer to connection, or NULL if @idx is out of bounds */ -static inline struct tcp_tap_conn *conn_at_idx(int idx) +static inline struct tcp_tap_conn *conn_at_idx(unsigned idx) { - if ((idx < 0) || (idx >= FLOW_MAX)) + if (idx >= FLOW_MAX) return NULL; ASSERT(CONN(idx)->f.type == FLOW_TCP); return CONN(idx); @@ -640,7 +639,7 @@ static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn) { int m = conn->in_epoll ? EPOLL_CTL_MOD : EPOLL_CTL_ADD; union epoll_ref ref = { .type = EPOLL_TYPE_TCP, .fd = conn->sock, - .tcp.index = CONN_IDX(conn) }; + .tcp.index = FLOW_IDX(conn) }; struct epoll_event ev = { .data.u64 = ref.u64 }; if (conn->events == CLOSED) { @@ -661,7 +660,7 @@ static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn) if (conn->timer != -1) { union epoll_ref ref_t = { .type = EPOLL_TYPE_TCP_TIMER, .fd = conn->sock, - .tcp.index = CONN_IDX(conn) }; + .tcp.index = FLOW_IDX(conn) }; struct epoll_event ev_t = { .data.u64 = ref_t.u64, .events = EPOLLIN | EPOLLET }; @@ -689,7 +688,7 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn) if (conn->timer == -1) { union epoll_ref ref = { .type = EPOLL_TYPE_TCP_TIMER, .fd = conn->sock, - .tcp.index = CONN_IDX(conn) }; + .tcp.index = FLOW_IDX(conn) }; struct epoll_event ev = { .data.u64 = ref.u64, .events = EPOLLIN | EPOLLET }; int fd; @@ -725,7 +724,7 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn) it.it_value.tv_sec = ACT_TIMEOUT; } - debug("TCP: index %li, timer expires in %lu.%03lus", CONN_IDX(conn), + debug("TCP: index %u, timer expires in %lu.%03lus", FLOW_IDX(conn), it.it_value.tv_sec, it.it_value.tv_nsec / 1000 / 1000); timerfd_settime(conn->timer, 0, &it, NULL); @@ -748,7 +747,7 @@ static void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn, conn->flags &= flag; if (flag_index >= 0) { - debug("TCP: index %li: %s dropped", CONN_IDX(conn), + debug("TCP: index %u: %s dropped", FLOW_IDX(conn), tcp_flag_str[flag_index]); } } else { @@ -769,7 +768,7 @@ static void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn, conn->flags |= flag; if (flag_index >= 0) { - debug("TCP: index %li: %s", CONN_IDX(conn), + debug("TCP: index %u: %s", FLOW_IDX(conn), tcp_flag_str[flag_index]); } } @@ -819,12 +818,12 @@ static void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn, new += 5; if (prev != new) { - debug("TCP: index %li, %s: %s -> %s", CONN_IDX(conn), + debug("TCP: index %u, %s: %s -> %s", FLOW_IDX(conn), num == -1 ? "CLOSED" : tcp_event_str[num], prev == -1 ? "CLOSED" : tcp_state_str[prev], (new == -1 || num == -1) ? "CLOSED" : tcp_state_str[new]); } else { - debug("TCP: index %li, %s", CONN_IDX(conn), + debug("TCP: index %u, %s", FLOW_IDX(conn), num == -1 ? "CLOSED" : tcp_event_str[num]); } @@ -1204,11 +1203,11 @@ static void tcp_hash_insert(const struct ctx *c, struct tcp_tap_conn *conn) int b; b = tcp_hash(c, &conn->faddr, conn->eport, conn->fport); - conn->next_index = tc_hash[b] ? CONN_IDX(tc_hash[b]) : -1; + conn->next_index = tc_hash[b] ? FLOW_IDX(tc_hash[b]) : -1U; tc_hash[b] = conn; - debug("TCP: hash table insert: index %li, sock %i, bucket: %i, next: " - "%p", CONN_IDX(conn), conn->sock, b, + debug("TCP: hash table insert: index %u, sock %i, bucket: %i, next: " + "%p", FLOW_IDX(conn), conn->sock, b, (void *)conn_at_idx(conn->next_index)); } @@ -1234,8 +1233,8 @@ static void tcp_hash_remove(const struct ctx *c, } } - debug("TCP: hash table remove: index %li, sock %i, bucket: %i, new: %p", - CONN_IDX(conn), conn->sock, b, + debug("TCP: hash table remove: index %u, sock %i, bucket: %i, new: %p", + FLOW_IDX(conn), conn->sock, b, (void *)(prev ? conn_at_idx(prev->next_index) : tc_hash[b])); } @@ -1255,16 +1254,16 @@ static void tcp_tap_conn_update(const struct ctx *c, struct tcp_tap_conn *old, prev = entry, entry = conn_at_idx(entry->next_index)) { if (entry == old) { if (prev) - prev->next_index = CONN_IDX(new); + prev->next_index = FLOW_IDX(new); else tc_hash[b] = new; break; } } - debug("TCP: hash table update: old index %li, new index %li, sock %i, " + debug("TCP: hash table update: old index %u, new index %u, sock %i, " "bucket: %i, old: %p, new: %p", - CONN_IDX(old), CONN_IDX(new), new->sock, b, + FLOW_IDX(old), FLOW_IDX(new), new->sock, b, (void *)old, (void *)new); tcp_epoll_ctl(c, new); @@ -1307,9 +1306,9 @@ void tcp_table_compact(struct ctx *c, union flow *hole) { union flow *from; - if (CONN_IDX(hole) == --c->flow_count) { - debug("TCP: table compaction: maximum index was %li (%p)", - CONN_IDX(hole), (void *)hole); + if (FLOW_IDX(hole) == --c->flow_count) { + debug("TCP: table compaction: maximum index was %u (%p)", + FLOW_IDX(hole), (void *)hole); memset(hole, 0, sizeof(*hole)); return; } @@ -1329,9 +1328,9 @@ void tcp_table_compact(struct ctx *c, union flow *hole) FLOW_TYPE(&from->f)); } - debug("TCP: table compaction (%s): old index %li, new index %li, " + debug("TCP: table compaction (%s): old index %u, new index %u, " "from: %p, to: %p", - FLOW_TYPE(&from->f), CONN_IDX(from), CONN_IDX(hole), + FLOW_TYPE(&from->f), FLOW_IDX(from), FLOW_IDX(hole), (void *)from, (void *)hole); memset(from, 0, sizeof(*from)); @@ -1357,7 +1356,7 @@ static void tcp_conn_destroy(struct ctx *c, union flow *flow) static void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn); #define tcp_rst(c, conn) \ do { \ - debug("TCP: index %li, reset at %s:%i", CONN_IDX(conn), \ + debug("TCP: index %u, reset at %s:%i", FLOW_IDX(conn), \ __func__, __LINE__); \ tcp_rst_do(c, conn); \ } while (0) @@ -2581,7 +2580,7 @@ int tcp_tap_handler(struct ctx *c, uint8_t pif, int af, return 1; } - trace("TCP: packet length %lu from tap for index %lu", len, CONN_IDX(conn)); + trace("TCP: packet length %lu from tap for index %u", len, FLOW_IDX(conn)); if (th->rst) { conn_event(c, conn, CLOSED); @@ -2821,17 +2820,17 @@ void tcp_timer_handler(struct ctx *c, union epoll_ref ref) tcp_timer_ctl(c, conn); } else if (conn->flags & ACK_FROM_TAP_DUE) { if (!(conn->events & ESTABLISHED)) { - debug("TCP: index %li, handshake timeout", CONN_IDX(conn)); + debug("TCP: index %u, handshake timeout", FLOW_IDX(conn)); tcp_rst(c, conn); } else if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED)) { - debug("TCP: index %li, FIN timeout", CONN_IDX(conn)); + debug("TCP: index %u, FIN timeout", FLOW_IDX(conn)); tcp_rst(c, conn); } else if (conn->retrans == TCP_MAX_RETRANS) { - debug("TCP: index %li, retransmissions count exceeded", - CONN_IDX(conn)); + debug("TCP: index %u, retransmissions count exceeded", + FLOW_IDX(conn)); tcp_rst(c, conn); } else { - debug("TCP: index %li, ACK timeout, retry", CONN_IDX(conn)); + debug("TCP: index %u, ACK timeout, retry", FLOW_IDX(conn)); conn->retrans++; conn->seq_to_tap = conn->seq_ack_from_tap; tcp_data_from_sock(c, conn); @@ -2849,7 +2848,7 @@ void tcp_timer_handler(struct ctx *c, union epoll_ref ref) */ timerfd_settime(conn->timer, 0, &new, &old); if (old.it_value.tv_sec == ACT_TIMEOUT) { - debug("TCP: index %li, activity timeout", CONN_IDX(conn)); + debug("TCP: index %u, activity timeout", FLOW_IDX(conn)); tcp_rst(c, conn); } } diff --git a/tcp_conn.h b/tcp_conn.h index 5a107fc..5a9376e 100644 --- a/tcp_conn.h +++ b/tcp_conn.h @@ -40,7 +40,7 @@ struct tcp_tap_conn { struct flow_common f; bool in_epoll :1; - int next_index :FLOW_INDEX_BITS + 2; + unsigned next_index :FLOW_INDEX_BITS + 2; #define TCP_RETRANS_BITS 3 unsigned int retrans :TCP_RETRANS_BITS; diff --git a/tcp_splice.c b/tcp_splice.c index 9f4831a..3955417 100644 --- a/tcp_splice.c +++ b/tcp_splice.c @@ -76,8 +76,7 @@ static int splice_pipe_pool [TCP_SPLICE_PIPE_POOL_SIZE][2]; #define CONN_V6(x) (x->flags & SPLICE_V6) #define CONN_V4(x) (!CONN_V6(x)) #define CONN_HAS(conn, set) ((conn->events & (set)) == (set)) -#define CONN(idx) (&tc[(idx)].splice) -#define CONN_IDX(conn) ((union flow *)(conn) - flowtab) +#define CONN(idx) (&FLOW(idx)->tcp_splice) /* Display strings for connection events */ static const char *tcp_splice_event_str[] __attribute((__unused__)) = { @@ -129,8 +128,8 @@ static int tcp_splice_epoll_ctl(const struct ctx *c, { int m = conn->in_epoll ? EPOLL_CTL_MOD : EPOLL_CTL_ADD; union epoll_ref ref[SIDES] = { - { .type = EPOLL_TYPE_TCP, .fd = conn->s[0], .tcp.index = CONN_IDX(conn) }, - { .type = EPOLL_TYPE_TCP, .fd = conn->s[1], .tcp.index = CONN_IDX(conn) } + { .type = EPOLL_TYPE_TCP, .fd = conn->s[0], .tcp.index = FLOW_IDX(conn) }, + { .type = EPOLL_TYPE_TCP, .fd = conn->s[1], .tcp.index = FLOW_IDX(conn) } }; struct epoll_event ev[SIDES] = { { .data.u64 = ref[0].u64 }, { .data.u64 = ref[1].u64 } }; @@ -140,8 +139,8 @@ static int tcp_splice_epoll_ctl(const struct ctx *c, if (epoll_ctl(c->epollfd, m, conn->s[0], &ev[0]) || epoll_ctl(c->epollfd, m, conn->s[1], &ev[1])) { int ret = -errno; - err("TCP (spliced): index %li, ERROR on epoll_ctl(): %s", - CONN_IDX(conn), strerror(errno)); + err("TCP (spliced): index %u, ERROR on epoll_ctl(): %s", + FLOW_IDX(conn), strerror(errno)); return ret; } @@ -167,7 +166,7 @@ static void conn_flag_do(const struct ctx *c, struct tcp_splice_conn *conn, conn->flags &= flag; if (flag_index >= 0) { - debug("TCP (spliced): index %li: %s dropped", CONN_IDX(conn), + debug("TCP (spliced): index %u: %s dropped", FLOW_IDX(conn), tcp_splice_flag_str[flag_index]); } } else { @@ -178,7 +177,7 @@ static void conn_flag_do(const struct ctx *c, struct tcp_splice_conn *conn, conn->flags |= flag; if (flag_index >= 0) { - debug("TCP (spliced): index %li: %s", CONN_IDX(conn), + debug("TCP (spliced): index %u: %s", FLOW_IDX(conn), tcp_splice_flag_str[flag_index]); } } @@ -213,7 +212,7 @@ static void conn_event_do(const struct ctx *c, struct tcp_splice_conn *conn, conn->events &= event; if (flag_index >= 0) { - debug("TCP (spliced): index %li, ~%s", CONN_IDX(conn), + debug("TCP (spliced): index %u, ~%s", FLOW_IDX(conn), tcp_splice_event_str[flag_index]); } } else { @@ -224,7 +223,7 @@ static void conn_event_do(const struct ctx *c, struct tcp_splice_conn *conn, conn->events |= event; if (flag_index >= 0) { - debug("TCP (spliced): index %li, %s", CONN_IDX(conn), + debug("TCP (spliced): index %u, %s", FLOW_IDX(conn), tcp_splice_event_str[flag_index]); } } @@ -282,7 +281,7 @@ void tcp_splice_destroy(struct ctx *c, union flow *flow) conn->events = SPLICE_CLOSED; conn->flags = 0; - debug("TCP (spliced): index %li, CLOSED", CONN_IDX(conn)); + debug("TCP (spliced): index %u, CLOSED", FLOW_IDX(conn)); tcp_table_compact(c, flow); } -- 2.43.0

David Gibson

12:33 a.m.

New subject: [PATCH v2 04/11] util: MAX_FROM_BITS() should be unsigned

MAX_FROM_BITS() computes the maximum value representable in a number of bits. The expression for that is an unsigned value, but we explicitly cast it to a signed int. It looks like this is because one of the main users is for FD_REF_MAX, which is used to bound fd values, typically stored as a signed int. The value MAX_FROM_BITS() is calculating is naturally non-negative, though, so it makes more sense for it to be unsigned, and to move the case to the definition of FD_REF_MAX. Signed-off-by: David Gibson --- passt.h | 2 +- util.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/passt.h b/passt.h index 3f5dfb9..0fce637 100644 --- a/passt.h +++ b/passt.h @@ -87,7 +87,7 @@ union epoll_ref { struct { enum epoll_type type:8; #define FD_REF_BITS 24 -#define FD_REF_MAX MAX_FROM_BITS(FD_REF_BITS) +#define FD_REF_MAX ((int)MAX_FROM_BITS(FD_REF_BITS)) int32_t fd:FD_REF_BITS; union { union tcp_epoll_ref tcp; diff --git a/util.h b/util.h index 78a8fb2..b1106e8 100644 --- a/util.h +++ b/util.h @@ -42,7 +42,7 @@ #define ROUND_DOWN(x, y) ((x) & ~((y) - 1)) #define ROUND_UP(x, y) (((x) + (y) - 1) & ~((y) - 1)) -#define MAX_FROM_BITS(n) ((int)((1U << (n)) - 1)) +#define MAX_FROM_BITS(n) (((1U << (n)) - 1)) #define BIT(n) (1UL << (n)) #define BITMAP_BIT(n) (BIT((n) % (sizeof(long) * 8))) -- 2.43.0

David Gibson

12:33 a.m.

New subject: [PATCH v2 05/11] flow: Make unified version of flow table compaction

tcp_table_compact() will move entries in the connection/flow table to keep it compact when other entries are removed. The moved entries need not have the same type as the flow removed, so it needs to be able to handle moving any type of flow. Therefore, move it to flow.c rather than being purportedly TCP specific. Signed-off-by: David Gibson --- flow.c | 39 +++++++++++++++++++++++++++++++++++++++ flow.h | 2 ++ tcp.c | 46 ++++------------------------------------------ tcp_conn.h | 3 ++- tcp_splice.c | 2 +- 5 files changed, 48 insertions(+), 44 deletions(-) diff --git a/flow.c b/flow.c index a9bb8f5..0fff119 100644 --- a/flow.c +++ b/flow.c @@ -25,3 +25,42 @@ const char *flow_type_str[] = { /* Global Flow Table */ union flow flowtab[FLOW_MAX]; + +/** + * flow_table_compact() - Perform compaction on flow table + * @c: Execution context + * @hole: Pointer to recently closed flow + */ +void flow_table_compact(struct ctx *c, union flow *hole) +{ + union flow *from; + + if (FLOW_IDX(hole) == --c->flow_count) { + debug("flow: table compaction: maximum index was %u (%p)", + FLOW_IDX(hole), (void *)hole); + memset(hole, 0, sizeof(*hole)); + return; + } + + from = flowtab + c->flow_count; + memcpy(hole, from, sizeof(*hole)); + + switch (from->f.type) { + case FLOW_TCP: + tcp_tap_conn_update(c, &from->tcp, &hole->tcp); + break; + case FLOW_TCP_SPLICE: + tcp_splice_conn_update(c, &hole->tcp_splice); + break; + default: + die("Unexpected %s in tcp_table_compact()", + FLOW_TYPE(&from->f)); + } + + debug("flow: table compaction (%s): old index %u, new index %u, " + "from: %p, to: %p", + FLOW_TYPE(&from->f), FLOW_IDX(from), FLOW_IDX(hole), + (void *)from, (void *)hole); + + memset(from, 0, sizeof(*from)); +} diff --git a/flow.h b/flow.h index 57289a8..9f49195 100644 --- a/flow.h +++ b/flow.h @@ -41,4 +41,6 @@ struct flow_common { union flow; +void flow_table_compact(struct ctx *c, union flow *hole); + #endif /* FLOW_H */ diff --git a/tcp.c b/tcp.c index 859df6f..6924dd2 100644 --- a/tcp.c +++ b/tcp.c @@ -1244,8 +1244,9 @@ static void tcp_hash_remove(const struct ctx *c, * @old: Old location of tcp_tap_conn * @new: New location of tcp_tap_conn */ -static void tcp_tap_conn_update(const struct ctx *c, struct tcp_tap_conn *old, - struct tcp_tap_conn *new) +void tcp_tap_conn_update(const struct ctx *c, struct tcp_tap_conn *old, + struct tcp_tap_conn *new) + { struct tcp_tap_conn *entry, *prev = NULL; int b = tcp_conn_hash(c, old); @@ -1297,45 +1298,6 @@ static struct tcp_tap_conn *tcp_hash_lookup(const struct ctx *c, return NULL; } -/** - * tcp_table_compact() - Perform compaction on connection table - * @c: Execution context - * @hole: Pointer to recently closed connection - */ -void tcp_table_compact(struct ctx *c, union flow *hole) -{ - union flow *from; - - if (FLOW_IDX(hole) == --c->flow_count) { - debug("TCP: table compaction: maximum index was %u (%p)", - FLOW_IDX(hole), (void *)hole); - memset(hole, 0, sizeof(*hole)); - return; - } - - from = flowtab + c->flow_count; - memcpy(hole, from, sizeof(*hole)); - - switch (from->f.type) { - case FLOW_TCP: - tcp_tap_conn_update(c, &from->tcp, &hole->tcp); - break; - case FLOW_TCP_SPLICE: - tcp_splice_conn_update(c, &hole->tcp_splice); - break; - default: - die("Unexpected %s in tcp_table_compact()", - FLOW_TYPE(&from->f)); - } - - debug("TCP: table compaction (%s): old index %u, new index %u, " - "from: %p, to: %p", - FLOW_TYPE(&from->f), FLOW_IDX(from), FLOW_IDX(hole), - (void *)from, (void *)hole); - - memset(from, 0, sizeof(*from)); -} - /** * tcp_conn_destroy() - Close sockets, trigger hash table removal and compaction * @c: Execution context @@ -1350,7 +1312,7 @@ static void tcp_conn_destroy(struct ctx *c, union flow *flow) close(conn->timer); tcp_hash_remove(c, conn); - tcp_table_compact(c, flow); + flow_table_compact(c, flow); } static void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn); diff --git a/tcp_conn.h b/tcp_conn.h index 5a9376e..3900305 100644 --- a/tcp_conn.h +++ b/tcp_conn.h @@ -157,8 +157,9 @@ struct tcp_splice_conn { extern int init_sock_pool4 [TCP_SOCK_POOL_SIZE]; extern int init_sock_pool6 [TCP_SOCK_POOL_SIZE]; +void tcp_tap_conn_update(const struct ctx *c, struct tcp_tap_conn *old, + struct tcp_tap_conn *new); void tcp_splice_conn_update(const struct ctx *c, struct tcp_splice_conn *new); -void tcp_table_compact(struct ctx *c, union flow *hole); void tcp_splice_destroy(struct ctx *c, union flow *flow); void tcp_splice_timer(struct ctx *c, union flow *flow); int tcp_conn_pool_sock(int pool[]); diff --git a/tcp_splice.c b/tcp_splice.c index 3955417..212fe16 100644 --- a/tcp_splice.c +++ b/tcp_splice.c @@ -283,7 +283,7 @@ void tcp_splice_destroy(struct ctx *c, union flow *flow) conn->flags = 0; debug("TCP (spliced): index %u, CLOSED", FLOW_IDX(conn)); - tcp_table_compact(c, flow); + flow_table_compact(c, flow); } /** -- 2.43.0

David Gibson

12:33 a.m.

New subject: [PATCH v2 06/11] flow, tcp: Add logging helpers for connection related messages

Most of the messages logged by the TCP code (be they errors, debug or trace messages) are related to a specific connection / flow. We're fairly consistent about prefixing these with the type of connection and the connection / flow index. However there are a few places where we put the index later in the message or omit it entirely. The template with the prefix is also a little bulky to carry around for every message, particularly for spliced connections. To help keep this consistent, introduce some helpers to log messages linked to a specific flow. It takes the flow as a parameter and adds a uniform prefix to each message. This makes things slightly neater now, but more importantly will help keep formatting consistent as we add more things to the flow table. Signed-off-by: David Gibson --- flow.c | 18 ++++++++++++ flow.h | 14 +++++++++ tcp.c | 81 ++++++++++++++++++++++++---------------------------- tcp_splice.c | 61 +++++++++++++++++---------------------- 4 files changed, 96 insertions(+), 78 deletions(-) diff --git a/flow.c b/flow.c index 0fff119..4d479c2 100644 --- a/flow.c +++ b/flow.c @@ -64,3 +64,21 @@ void flow_table_compact(struct ctx *c, union flow *hole) memset(from, 0, sizeof(*from)); } + +/** flow_log_ - Log flow-related message + * @f: flow the message is related to + * @pri: Log priority + * @fmt: Format string + * @...: printf-arguments + */ +void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...) +{ + char msg[BUFSIZ]; + va_list args; + + va_start(args, fmt); + (void)vsnprintf(msg, sizeof(msg), fmt, args); + va_end(args); + + logmsg(pri, "Flow %u (%s): %s", flow_idx(f), FLOW_TYPE(f), msg); +} diff --git a/flow.h b/flow.h index 9f49195..b6da516 100644 --- a/flow.h +++ b/flow.h @@ -43,4 +43,18 @@ union flow; void flow_table_compact(struct ctx *c, union flow *hole); +void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...) + __attribute__((format(printf, 3, 4))); + +#define flow_log(f_, pri, ...) flow_log_(&(f_)->f, (pri), __VA_ARGS__) + +#define flow_dbg(f, ...) flow_log((f), LOG_DEBUG, __VA_ARGS__) +#define flow_err(f, ...) flow_log((f), LOG_ERR, __VA_ARGS__) + +#define flow_trace(f, ...) \ + do { \ + if (log_trace) \ + flow_dbg((f), __VA_ARGS__); \ + } while (0) + #endif /* FLOW_H */ diff --git a/tcp.c b/tcp.c index 6924dd2..f427047 100644 --- a/tcp.c +++ b/tcp.c @@ -624,7 +624,7 @@ static void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn, unsigned long flag); #define conn_flag(c, conn, flag) \ do { \ - trace("TCP: flag at %s:%i", __func__, __LINE__); \ + flow_trace(conn, "flag at %s:%i", __func__, __LINE__); \ conn_flag_do(c, conn, flag); \ } while (0) @@ -695,7 +695,8 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn) fd = timerfd_create(CLOCK_MONOTONIC, 0); if (fd == -1 || fd > FD_REF_MAX) { - debug("TCP: failed to get timer: %s", strerror(errno)); + flow_dbg(conn, "failed to get timer: %s", + strerror(errno)); if (fd > -1) close(fd); conn->timer = -1; @@ -704,7 +705,8 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn) conn->timer = fd; if (epoll_ctl(c->epollfd, EPOLL_CTL_ADD, conn->timer, &ev)) { - debug("TCP: failed to add timer: %s", strerror(errno)); + flow_dbg(conn, "failed to add timer: %s", + strerror(errno)); close(conn->timer); conn->timer = -1; return; @@ -724,8 +726,8 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn) it.it_value.tv_sec = ACT_TIMEOUT; } - debug("TCP: index %u, timer expires in %lu.%03lus", FLOW_IDX(conn), - it.it_value.tv_sec, it.it_value.tv_nsec / 1000 / 1000); + flow_dbg(conn, "timer expires in %lu.%03lus", it.it_value.tv_sec, + it.it_value.tv_nsec / 1000 / 1000); timerfd_settime(conn->timer, 0, &it, NULL); } @@ -746,10 +748,8 @@ static void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn, return; conn->flags &= flag; - if (flag_index >= 0) { - debug("TCP: index %u: %s dropped", FLOW_IDX(conn), - tcp_flag_str[flag_index]); - } + if (flag_index >= 0) + flow_dbg(conn, "%s dropped", tcp_flag_str[flag_index]); } else { int flag_index = fls(flag); @@ -767,10 +767,8 @@ static void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn, } conn->flags |= flag; - if (flag_index >= 0) { - debug("TCP: index %u: %s", FLOW_IDX(conn), - tcp_flag_str[flag_index]); - } + if (flag_index >= 0) + flow_dbg(conn, "%s", tcp_flag_str[flag_index]); } if (flag == STALLED || flag == ~STALLED) @@ -817,15 +815,14 @@ static void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn, if (conn->flags & ACTIVE_CLOSE) new += 5; - if (prev != new) { - debug("TCP: index %u, %s: %s -> %s", FLOW_IDX(conn), - num == -1 ? "CLOSED" : tcp_event_str[num], - prev == -1 ? "CLOSED" : tcp_state_str[prev], - (new == -1 || num == -1) ? "CLOSED" : tcp_state_str[new]); - } else { - debug("TCP: index %u, %s", FLOW_IDX(conn), - num == -1 ? "CLOSED" : tcp_event_str[num]); - } + if (prev != new) + flow_dbg(conn, "%s: %s -> %s", + num == -1 ? "CLOSED" : tcp_event_str[num], + prev == -1 ? "CLOSED" : tcp_state_str[prev], + (new == -1 || num == -1) ? "CLOSED" : tcp_state_str[new]); + else + flow_dbg(conn, "%s", + num == -1 ? "CLOSED" : tcp_event_str[num]); if ((event == TAP_FIN_RCVD) && !(conn->events & SOCK_FIN_RCVD)) conn_flag(c, conn, ACTIVE_CLOSE); @@ -838,7 +835,7 @@ static void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn, #define conn_event(c, conn, event) \ do { \ - trace("TCP: event at %s:%i", __func__, __LINE__); \ + flow_trace(conn, "event at %s:%i", __func__, __LINE__); \ conn_event_do(c, conn, event); \ } while (0) @@ -1206,9 +1203,8 @@ static void tcp_hash_insert(const struct ctx *c, struct tcp_tap_conn *conn) conn->next_index = tc_hash[b] ? FLOW_IDX(tc_hash[b]) : -1U; tc_hash[b] = conn; - debug("TCP: hash table insert: index %u, sock %i, bucket: %i, next: " - "%p", FLOW_IDX(conn), conn->sock, b, - (void *)conn_at_idx(conn->next_index)); + flow_dbg(conn, "hash table insert: sock %i, bucket: %i, next: %p", + conn->sock, b, (void *)conn_at_idx(conn->next_index)); } /** @@ -1233,9 +1229,9 @@ static void tcp_hash_remove(const struct ctx *c, } } - debug("TCP: hash table remove: index %u, sock %i, bucket: %i, new: %p", - FLOW_IDX(conn), conn->sock, b, - (void *)(prev ? conn_at_idx(prev->next_index) : tc_hash[b])); + flow_dbg(conn, "hash table remove: sock %i, bucket: %i, new: %p", + conn->sock, b, + (void *)(prev ? conn_at_idx(prev->next_index) : tc_hash[b])); } /** @@ -1318,8 +1314,7 @@ static void tcp_conn_destroy(struct ctx *c, union flow *flow) static void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn); #define tcp_rst(c, conn) \ do { \ - debug("TCP: index %u, reset at %s:%i", FLOW_IDX(conn), \ - __func__, __LINE__); \ + flow_dbg((conn), "TCP reset at %s:%i", __func__, __LINE__); \ tcp_rst_do(c, conn); \ } while (0) @@ -1998,7 +1993,7 @@ static void tcp_conn_from_tap(struct ctx *c, mss = tcp_conn_tap_mss(conn, opts, optlen); if (setsockopt(s, SOL_TCP, TCP_MAXSEG, &mss, sizeof(mss))) - trace("TCP: failed to set TCP_MAXSEG on socket %i", s); + flow_trace(conn, "failed to set TCP_MAXSEG on socket %i", s); MSS_SET(conn, mss); tcp_get_tap_ws(conn, opts, optlen); @@ -2159,8 +2154,8 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) if (SEQ_LT(already_sent, 0)) { /* RFC 761, section 2.1. */ - trace("TCP: ACK sequence gap: ACK for %u, sent: %u", - conn->seq_ack_from_tap, conn->seq_to_tap); + flow_trace(conn, "ACK sequence gap: ACK for %u, sent: %u", + conn->seq_ack_from_tap, conn->seq_to_tap); conn->seq_to_tap = conn->seq_ack_from_tap; already_sent = 0; } @@ -2392,8 +2387,9 @@ static int tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn, tcp_tap_window_update(conn, max_ack_seq_wnd); if (retr) { - trace("TCP: fast re-transmit, ACK: %u, previous sequence: %u", - max_ack_seq, conn->seq_to_tap); + flow_trace(conn, + "fast re-transmit, ACK: %u, previous sequence: %u", + max_ack_seq, conn->seq_to_tap); conn->seq_ack_from_tap = max_ack_seq; conn->seq_to_tap = max_ack_seq; tcp_data_from_sock(c, conn); @@ -2542,7 +2538,7 @@ int tcp_tap_handler(struct ctx *c, uint8_t pif, int af, return 1; } - trace("TCP: packet length %lu from tap for index %u", len, FLOW_IDX(conn)); + flow_trace(conn, "packet length %lu from tap", len); if (th->rst) { conn_event(c, conn, CLOSED); @@ -2782,17 +2778,16 @@ void tcp_timer_handler(struct ctx *c, union epoll_ref ref) tcp_timer_ctl(c, conn); } else if (conn->flags & ACK_FROM_TAP_DUE) { if (!(conn->events & ESTABLISHED)) { - debug("TCP: index %u, handshake timeout", FLOW_IDX(conn)); + flow_dbg(conn, "handshake timeout"); tcp_rst(c, conn); } else if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED)) { - debug("TCP: index %u, FIN timeout", FLOW_IDX(conn)); + flow_dbg(conn, "FIN timeout"); tcp_rst(c, conn); } else if (conn->retrans == TCP_MAX_RETRANS) { - debug("TCP: index %u, retransmissions count exceeded", - FLOW_IDX(conn)); + flow_dbg(conn, "retransmissions count exceeded"); tcp_rst(c, conn); } else { - debug("TCP: index %u, ACK timeout, retry", FLOW_IDX(conn)); + flow_dbg(conn, "ACK timeout, retry"); conn->retrans++; conn->seq_to_tap = conn->seq_ack_from_tap; tcp_data_from_sock(c, conn); @@ -2810,7 +2805,7 @@ void tcp_timer_handler(struct ctx *c, union epoll_ref ref) */ timerfd_settime(conn->timer, 0, &new, &old); if (old.it_value.tv_sec == ACT_TIMEOUT) { - debug("TCP: index %u, activity timeout", FLOW_IDX(conn)); + flow_dbg(conn, "activity timeout"); tcp_rst(c, conn); } } diff --git a/tcp_splice.c b/tcp_splice.c index 212fe16..28b91e1 100644 --- a/tcp_splice.c +++ b/tcp_splice.c @@ -139,8 +139,7 @@ static int tcp_splice_epoll_ctl(const struct ctx *c, if (epoll_ctl(c->epollfd, m, conn->s[0], &ev[0]) || epoll_ctl(c->epollfd, m, conn->s[1], &ev[1])) { int ret = -errno; - err("TCP (spliced): index %u, ERROR on epoll_ctl(): %s", - FLOW_IDX(conn), strerror(errno)); + flow_err(conn, "ERROR on epoll_ctl(): %s", strerror(errno)); return ret; } @@ -165,10 +164,9 @@ static void conn_flag_do(const struct ctx *c, struct tcp_splice_conn *conn, return; conn->flags &= flag; - if (flag_index >= 0) { - debug("TCP (spliced): index %u: %s dropped", FLOW_IDX(conn), - tcp_splice_flag_str[flag_index]); - } + if (flag_index >= 0) + flow_dbg(conn, "%s dropped", + tcp_splice_flag_str[flag_index]); } else { int flag_index = fls(flag); @@ -176,10 +174,8 @@ static void conn_flag_do(const struct ctx *c, struct tcp_splice_conn *conn, return; conn->flags |= flag; - if (flag_index >= 0) { - debug("TCP (spliced): index %u: %s", FLOW_IDX(conn), - tcp_splice_flag_str[flag_index]); - } + if (flag_index >= 0) + flow_dbg(conn, "%s", tcp_splice_flag_str[flag_index]); } if (flag == CLOSING) { @@ -190,8 +186,7 @@ static void conn_flag_do(const struct ctx *c, struct tcp_splice_conn *conn, #define conn_flag(c, conn, flag) \ do { \ - trace("TCP (spliced): flag at %s:%i", \ - __func__, __LINE__); \ + flow_trace(conn, "flag at %s:%i", __func__, __LINE__); \ conn_flag_do(c, conn, flag); \ } while (0) @@ -211,10 +206,8 @@ static void conn_event_do(const struct ctx *c, struct tcp_splice_conn *conn, return; conn->events &= event; - if (flag_index >= 0) { - debug("TCP (spliced): index %u, ~%s", FLOW_IDX(conn), - tcp_splice_event_str[flag_index]); - } + if (flag_index >= 0) + flow_dbg(conn, "~%s", tcp_splice_event_str[flag_index]); } else { int flag_index = fls(event); @@ -222,10 +215,8 @@ static void conn_event_do(const struct ctx *c, struct tcp_splice_conn *conn, return; conn->events |= event; - if (flag_index >= 0) { - debug("TCP (spliced): index %u, %s", FLOW_IDX(conn), - tcp_splice_event_str[flag_index]); - } + if (flag_index >= 0) + flow_dbg(conn, "%s", tcp_splice_event_str[flag_index]); } if (tcp_splice_epoll_ctl(c, conn)) @@ -234,8 +225,7 @@ static void conn_event_do(const struct ctx *c, struct tcp_splice_conn *conn, #define conn_event(c, conn, event) \ do { \ - trace("TCP (spliced): event at %s:%i", \ - __func__, __LINE__); \ + flow_trace(conn, "event at %s:%i",__func__, __LINE__); \ conn_event_do(c, conn, event); \ } while (0) @@ -281,7 +271,7 @@ void tcp_splice_destroy(struct ctx *c, union flow *flow) conn->events = SPLICE_CLOSED; conn->flags = 0; - debug("TCP (spliced): index %u, CLOSED", FLOW_IDX(conn)); + flow_dbg(conn, "CLOSED"); flow_table_compact(c, flow); } @@ -314,16 +304,17 @@ static int tcp_splice_connect_finish(const struct ctx *c, if (conn->pipe[side][0] < 0) { if (pipe2(conn->pipe[side], O_NONBLOCK | O_CLOEXEC)) { - err("TCP (spliced): cannot create %d->%d pipe: %s", - side, !side, strerror(errno)); + flow_err(conn, "cannot create %d->%d pipe: %s", + side, !side, strerror(errno)); conn_flag(c, conn, CLOSING); return -EIO; } if (fcntl(conn->pipe[side][0], F_SETPIPE_SZ, c->tcp.pipe_size)) { - trace("TCP (spliced): cannot set %d->%d pipe size to %lu", - side, !side, c->tcp.pipe_size); + flow_trace(conn, + "cannot set %d->%d pipe size to %lu", + side, !side, c->tcp.pipe_size); } } } @@ -363,8 +354,8 @@ static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn, if (setsockopt(conn->s[1], SOL_TCP, TCP_QUICKACK, &((int){ 1 }), sizeof(int))) { - trace("TCP (spliced): failed to set TCP_QUICKACK on socket %i", - conn->s[1]); + flow_trace(conn, "failed to set TCP_QUICKACK on socket %i", + conn->s[1]); } if (CONN_V6(conn)) { @@ -475,7 +466,7 @@ bool tcp_splice_conn_from_sock(const struct ctx *c, } if (setsockopt(s, SOL_TCP, TCP_QUICKACK, &((int){ 1 }), sizeof(int))) - trace("TCP (spliced): failed to set TCP_QUICKACK on %i", s); + flow_trace(conn, "failed to set TCP_QUICKACK on %i", s); conn->f.type = FLOW_TCP_SPLICE; conn->s[0] = s; @@ -555,7 +546,7 @@ retry: readlen = splice(conn->s[fromside], NULL, conn->pipe[fromside][1], NULL, c->tcp.pipe_size, SPLICE_F_MOVE | SPLICE_F_NONBLOCK); - trace("TCP (spliced): %li from read-side call", readlen); + flow_trace(conn, "%li from read-side call", readlen); if (readlen < 0) { if (errno == EINTR) goto retry; @@ -581,8 +572,8 @@ eintr: written = splice(conn->pipe[fromside][0], NULL, conn->s[!fromside], NULL, to_write, SPLICE_F_MOVE | more | SPLICE_F_NONBLOCK); - trace("TCP (spliced): %li from write-side call (passed %lu)", - written, to_write); + flow_trace(conn, "%li from write-side call (passed %lu)", + written, to_write); /* Most common case: skip updating counters. */ if (readlen > 0 && readlen == written) { @@ -794,8 +785,8 @@ void tcp_splice_timer(struct ctx *c, union flow *flow) if ((conn->flags & set) && !(conn->flags & act)) { if (setsockopt(conn->s[side], SOL_SOCKET, SO_RCVLOWAT, &((int){ 1 }), sizeof(int))) { - trace("TCP (spliced): can't set SO_RCVLOWAT on " - "%i", conn->s[side]); + flow_trace(conn, "can't set SO_RCVLOWAT on %d", + conn->s[side]); } conn_flag(c, conn, ~set); } -- 2.43.0

David Gibson

12:33 a.m.

New subject: [PATCH v2 07/11] flow: Introduce 'sidx' type to represent one side of one flow

In a number of places, we use indices into the flow table to identify a specific flow. We also have cases where we need to identify a particular side of a particular flow, and we expect those to become more common as we generalise the flow table to cover more things. To assist with that, introduces flow_sidx_t, an index type which identifies a specific side of a specific flow in the table. Signed-off-by: David Gibson --- flow.h | 13 +++++++++++++ flow_table.h | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+) diff --git a/flow.h b/flow.h index b6da516..3c90bbd 100644 --- a/flow.h +++ b/flow.h @@ -39,6 +39,19 @@ struct flow_common { #define FLOW_TABLE_PRESSURE 30 /* % of FLOW_MAX */ #define FLOW_FILE_PRESSURE 30 /* % of c->nofile */ +/** + * struct flow_sidx - ID for one side of a specific flow + * @side: Side referenced (0 or 1) + * @flow: Index of flow referenced + */ +typedef struct flow_sidx { + int side :1; + unsigned flow :FLOW_INDEX_BITS; +} flow_sidx_t; +static_assert(sizeof(flow_sidx_t) <= sizeof(uint32_t)); + +#define FLOW_SIDX_NONE ((flow_sidx_t){ .flow = FLOW_MAX }) + union flow; void flow_table_compact(struct ctx *c, union flow *hole); diff --git a/flow_table.h b/flow_table.h index 5e897bd..3c68d4a 100644 --- a/flow_table.h +++ b/flow_table.h @@ -47,4 +47,40 @@ static inline unsigned flow_idx(const struct flow_common *f) */ #define FLOW(idx) (&flowtab[(idx)]) +/** flow_at_sidx - Flow entry for a given sidx + * @sidx: Flow & side index + * + * Return: pointer to the corresponding flow entry, or NULL + */ +static inline union flow *flow_at_sidx(flow_sidx_t sidx) +{ + if (sidx.flow >= FLOW_MAX) + return NULL; + return FLOW(sidx.flow); +} + +/** flow_sidx_t - Index of one side of a flow from common structure + * @f: Common flow fields pointer + * @side: Which side to refer to (0 or 1) + * + * Return: index of @f and @side in the flow table + */ +static inline flow_sidx_t flow_sidx(const struct flow_common *f, + int side) +{ + ASSERT(side == !!side); + return (flow_sidx_t){ + .side = side, + .flow = flow_idx(f), + }; +} + +/** FLOW_SIDX - Find the index of one side of a flow + * @f_: Flow pointer, either union flow * or protocol specific + * @side: Which side to index (0 or 1) + * + * Return: index of @f and @side in the flow table + */ +#define FLOW_SIDX(f_, side) (flow_sidx(&(f_)->f, (side))) + #endif /* FLOW_TABLE_H */ -- 2.43.0

Stefano Brivio

29 Nov 29 Nov

3:32 p.m.

New subject: [PATCH v2 07/11] flow: Introduce 'sidx' type to represent one side of one flow

On Mon, 27 Nov 2023 10:33:44 +1100 David Gibson wrote:

...

In a number of places, we use indices into the flow table to identify a specific flow. We also have cases where we need to identify a particular side of a particular flow, and we expect those to become more common as we generalise the flow table to cover more things.

To assist with that, introduces flow_sidx_t, an index type which identifies a specific side of a specific flow in the table.

Signed-off-by: David Gibson --- flow.h | 13 +++++++++++++ flow_table.h | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+)

diff --git a/flow.h b/flow.h index b6da516..3c90bbd 100644 --- a/flow.h +++ b/flow.h @@ -39,6 +39,19 @@ struct flow_common { #define FLOW_TABLE_PRESSURE 30 /* % of FLOW_MAX */ #define FLOW_FILE_PRESSURE 30 /* % of c->nofile */

+/** + * struct flow_sidx - ID for one side of a specific flow + * @side: Side referenced (0 or 1) + * @flow: Index of flow referenced + */ +typedef struct flow_sidx {

Implying my usual argument :) ...is there any advantage over using this simply as a struct?

...

+ int side :1; + unsigned flow :FLOW_INDEX_BITS; +} flow_sidx_t;

-- Stefano

David Gibson

30 Nov 30 Nov

1:37 a.m.

New subject: [PATCH v2 07/11] flow: Introduce 'sidx' type to represent one side of one flow

On Wed, Nov 29, 2023 at 03:32:32PM +0100, Stefano Brivio wrote:

...

On Mon, 27 Nov 2023 10:33:44 +1100 David Gibson wrote:

...
In a number of places, we use indices into the flow table to identify a specific flow. We also have cases where we need to identify a particular side of a particular flow, and we expect those to become more common as we generalise the flow table to cover more things.

To assist with that, introduces flow_sidx_t, an index type which identifies a specific side of a specific flow in the table.

Signed-off-by: David Gibson --- flow.h | 13 +++++++++++++ flow_table.h | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+)

diff --git a/flow.h b/flow.h index b6da516..3c90bbd 100644 --- a/flow.h +++ b/flow.h @@ -39,6 +39,19 @@ struct flow_common { #define FLOW_TABLE_PRESSURE 30 /* % of FLOW_MAX */ #define FLOW_FILE_PRESSURE 30 /* % of c->nofile */

+/** + * struct flow_sidx - ID for one side of a specific flow + * @side: Side referenced (0 or 1) + * @flow: Index of flow referenced + */ +typedef struct flow_sidx {

Implying my usual argument :) ...is there any advantage over using this simply as a struct?

So, usually I too would prefer to use a struct as a struct, without a typedef. The reason I'm doing differently here, is that I want to emphasise that for many purposes this can be treated like an index, in particular that it's small and trivially copyable. In particular it should be passed by value, passing by reference would be silly. That's kind of the opposite of what one tends to be conveying by reminding users that they're working with a struct.

...

...
+ int side :1; + unsigned flow :FLOW_INDEX_BITS; +} flow_sidx_t;

-- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson

Stefano Brivio

10:21 a.m.

New subject: [PATCH v2 07/11] flow: Introduce 'sidx' type to represent one side of one flow

On Thu, 30 Nov 2023 11:37:40 +1100 David Gibson wrote:

...

On Wed, Nov 29, 2023 at 03:32:32PM +0100, Stefano Brivio wrote:

...
On Mon, 27 Nov 2023 10:33:44 +1100 David Gibson wrote:

...
In a number of places, we use indices into the flow table to identify a specific flow. We also have cases where we need to identify a particular side of a particular flow, and we expect those to become more common as we generalise the flow table to cover more things.

To assist with that, introduces flow_sidx_t, an index type which identifies a specific side of a specific flow in the table.

Signed-off-by: David Gibson --- flow.h | 13 +++++++++++++ flow_table.h | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+)

diff --git a/flow.h b/flow.h index b6da516..3c90bbd 100644 --- a/flow.h +++ b/flow.h @@ -39,6 +39,19 @@ struct flow_common { #define FLOW_TABLE_PRESSURE 30 /* % of FLOW_MAX */ #define FLOW_FILE_PRESSURE 30 /* % of c->nofile */

+/** + * struct flow_sidx - ID for one side of a specific flow + * @side: Side referenced (0 or 1) + * @flow: Index of flow referenced + */ +typedef struct flow_sidx {

Implying my usual argument :) ...is there any advantage over using this simply as a struct?

So, usually I too would prefer to use a struct as a struct, without a typedef. The reason I'm doing differently here, is that I want to emphasise that for many purposes this can be treated like an index, in particular that it's small and trivially copyable. In particular it should be passed by value, passing by reference would be silly.

Hmm, that was exactly my "not hiding" point though. The day somebody adds here: char mood[RLIMIT_STACK_VAL + 1]; /* list of side emojis */ the typedef makes it still apparently okay to pass by value. If it's a struct, one surely has to check first.

...

That's kind of the opposite of what one tends to be conveying by reminding users that they're working with a struct.

I see, but it's probably a matter of taste (passing structs by value doesn't personally make me nervous). -- Stefano

David Gibson

1 Dec 1 Dec

1:10 a.m.

New subject: [PATCH v2 07/11] flow: Introduce 'sidx' type to represent one side of one flow

On Thu, Nov 30, 2023 at 10:21:16AM +0100, Stefano Brivio wrote:

...

On Thu, 30 Nov 2023 11:37:40 +1100 David Gibson wrote:

...
On Wed, Nov 29, 2023 at 03:32:32PM +0100, Stefano Brivio wrote:

...
On Mon, 27 Nov 2023 10:33:44 +1100 David Gibson wrote:

...
In a number of places, we use indices into the flow table to identify a specific flow. We also have cases where we need to identify a particular side of a particular flow, and we expect those to become more common as we generalise the flow table to cover more things.

To assist with that, introduces flow_sidx_t, an index type which identifies a specific side of a specific flow in the table.

Signed-off-by: David Gibson --- flow.h | 13 +++++++++++++ flow_table.h | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+)

diff --git a/flow.h b/flow.h index b6da516..3c90bbd 100644 --- a/flow.h +++ b/flow.h @@ -39,6 +39,19 @@ struct flow_common { #define FLOW_TABLE_PRESSURE 30 /* % of FLOW_MAX */ #define FLOW_FILE_PRESSURE 30 /* % of c->nofile */

+/** + * struct flow_sidx - ID for one side of a specific flow + * @side: Side referenced (0 or 1) + * @flow: Index of flow referenced + */ +typedef struct flow_sidx {

Implying my usual argument :) ...is there any advantage over using this simply as a struct?

So, usually I too would prefer to use a struct as a struct, without a typedef. The reason I'm doing differently here, is that I want to emphasise that for many purposes this can be treated like an index, in particular that it's small and trivially copyable. In particular it should be passed by value, passing by reference would be silly.

Hmm, that was exactly my "not hiding" point though. The day somebody adds here:

char mood[RLIMIT_STACK_VAL + 1]; /* list of side emojis */

the typedef makes it still apparently okay to pass by value. If it's a struct, one surely has to check first.

Right.. and that's kind of the point. If someone adds that, this struct is no longer doing the job intended for it, and a wider redesign is needed. That's why there's also the static_assert() verifying that flow_sidx_t fits in a u32.

...

...
That's kind of the opposite of what one tends to be conveying by reminding users that they're working with a struct.

I see, but it's probably a matter of taste (passing structs by value doesn't personally make me nervous).

-- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson

David Gibson

27 Nov 27 Nov

12:33 a.m.

New subject: [PATCH v2 08/11] tcp: Remove unneccessary bounds check in tcp_timer_handler()

In tcp_timer_handler() we use conn_at_idx() to interpret the flow index from the epoll reference. However, this will never be NULL - we always put a valid index into the epoll_ref. Simplify slightly based on this. Signed-off-by: David Gibson --- tcp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tcp.c b/tcp.c index f427047..512baf3 100644 --- a/tcp.c +++ b/tcp.c @@ -2759,10 +2759,10 @@ void tcp_listen_handler(struct ctx *c, union epoll_ref ref, */ void tcp_timer_handler(struct ctx *c, union epoll_ref ref) { - struct tcp_tap_conn *conn = conn_at_idx(ref.tcp.index); struct itimerspec check_armed = { { 0 }, { 0 } }; + struct tcp_tap_conn *conn = CONN(ref.tcp.index); - if (c->no_tcp || !conn) + if (c->no_tcp) return; /* We don't reset timers on ~ACK_FROM_TAP_DUE, ~ACK_TO_TAP_DUE. If the -- 2.43.0

Stefano Brivio

29 Nov 29 Nov

3:32 p.m.

New subject: [PATCH v2 08/11] tcp: Remove unneccessary bounds check in tcp_timer_handler()

On Mon, 27 Nov 2023 10:33:45 +1100 David Gibson wrote:

...

In tcp_timer_handler() we use conn_at_idx() to interpret the flow index from the epoll reference. However, this will never be NULL - we always put a valid index into the epoll_ref. Simplify slightly based on this.

Sorry, I missed this during review of v1. I have mixed feeling about this, and I don't think 11/11 changes anything in this regard: we have to trust the kernel, as there's no benefit to security in not doing so. At the same time, should we ever get an out-of-bounds index from the epoll event, we can fail gracefully here. Slightly worse, however: if we get a timer event for a connection that's already closed, we'll happily go and try to manipulate its timer (with or without the !conn check). All in all, I think the check is minimally useful, and we should have something more robust than that. So if this patch helps keeping things simple later in the series, I don't see an issue with that, but perhaps we should add back a more sensible set of checks later. The next patches all look good to me. -- Stefano

David Gibson

30 Nov 30 Nov

1:42 a.m.

New subject: [PATCH v2 08/11] tcp: Remove unneccessary bounds check in tcp_timer_handler()

On Wed, Nov 29, 2023 at 03:32:39PM +0100, Stefano Brivio wrote:

...

On Mon, 27 Nov 2023 10:33:45 +1100 David Gibson wrote:

...
In tcp_timer_handler() we use conn_at_idx() to interpret the flow index from the epoll reference. However, this will never be NULL - we always put a valid index into the epoll_ref. Simplify slightly based on this.

Sorry, I missed this during review of v1.

I have mixed feeling about this, and I don't think 11/11 changes anything in this regard: we have to trust the kernel, as there's no benefit to security in not doing so.

At the same time, should we ever get an out-of-bounds index from the epoll event, we can fail gracefully here. Slightly worse, however: if we get a timer event for a connection that's already closed, we'll happily go and try to manipulate its timer (with or without the !conn check).

So, as you note this only verifies that the index is theoretically possible. It doesn't check that it's a valid index in the current size of the connection table, it doesn't check if the connection is already closed and it can't check if it's a stale index for a different connection than the one originally intended, because the table has been compacted. Given all those potential failure modes, I don't see a lot of value in checking if the index is wildly out of bounds, which would require a kernel bug rather more extreme than those other possibilies.

...

All in all, I think the check is minimally useful, and we should have something more robust than that. So if this patch helps keeping things simple later in the series, I don't see an issue with that, but perhaps we should add back a more sensible set of checks later.

Perhaps, yes.

...

The next patches all look good to me.

-- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson

David Gibson

27 Nov 27 Nov

12:33 a.m.

New subject: [PATCH v2 09/11] flow,tcp: Generalise TCP epoll_ref to generic flows

TCP uses three different epoll object types: one for connected sockets, one for timers and one for listening sockets. Listening sockets really need information that's specific to TCP, so need their own epoll_ref field. Timers and connected sockets, however, only need the connection (flow) they're associated with. As we expand the use of the flow table, we expect that to be true for more epoll fds. So, rename the "TCP" epoll_ref field to be a "flow" epoll_ref field that can be used both for TCP and for other future cases. Signed-off-by: David Gibson --- passt.h | 6 +++--- tcp.c | 10 +++++----- tcp_splice.c | 4 ++-- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/passt.h b/passt.h index 0fce637..66a819f 100644 --- a/passt.h +++ b/passt.h @@ -76,8 +76,8 @@ enum epoll_type { * union epoll_ref - Breakdown of reference for epoll fd bookkeeping * @type: Type of fd (tells us what to do with events) * @fd: File descriptor number (implies < 2^24 total descriptors) - * @tcp: TCP-specific reference part (connected sockets) - * @tcp_listen: TCP-specific reference part (listening sockets) + * @flow: Index of the flow this fd is linked to + * @tcp_listen: TCP-specific reference part for listening sockets * @udp: UDP-specific reference part * @icmp: ICMP-specific reference part * @data: Data handled by protocol handlers @@ -90,7 +90,7 @@ union epoll_ref { #define FD_REF_MAX ((int)MAX_FROM_BITS(FD_REF_BITS)) int32_t fd:FD_REF_BITS; union { - union tcp_epoll_ref tcp; + uint32_t flow; union tcp_listen_epoll_ref tcp_listen; union udp_epoll_ref udp; union icmp_epoll_ref icmp; diff --git a/tcp.c b/tcp.c index 512baf3..e80a6aa 100644 --- a/tcp.c +++ b/tcp.c @@ -639,7 +639,7 @@ static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn) { int m = conn->in_epoll ? EPOLL_CTL_MOD : EPOLL_CTL_ADD; union epoll_ref ref = { .type = EPOLL_TYPE_TCP, .fd = conn->sock, - .tcp.index = FLOW_IDX(conn) }; + .flow = FLOW_IDX(conn) }; struct epoll_event ev = { .data.u64 = ref.u64 }; if (conn->events == CLOSED) { @@ -660,7 +660,7 @@ static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn) if (conn->timer != -1) { union epoll_ref ref_t = { .type = EPOLL_TYPE_TCP_TIMER, .fd = conn->sock, - .tcp.index = FLOW_IDX(conn) }; + .flow = FLOW_IDX(conn) }; struct epoll_event ev_t = { .data.u64 = ref_t.u64, .events = EPOLLIN | EPOLLET }; @@ -688,7 +688,7 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn) if (conn->timer == -1) { union epoll_ref ref = { .type = EPOLL_TYPE_TCP_TIMER, .fd = conn->sock, - .tcp.index = FLOW_IDX(conn) }; + .flow = FLOW_IDX(conn) }; struct epoll_event ev = { .data.u64 = ref.u64, .events = EPOLLIN | EPOLLET }; int fd; @@ -2760,7 +2760,7 @@ void tcp_listen_handler(struct ctx *c, union epoll_ref ref, void tcp_timer_handler(struct ctx *c, union epoll_ref ref) { struct itimerspec check_armed = { { 0 }, { 0 } }; - struct tcp_tap_conn *conn = CONN(ref.tcp.index); + struct tcp_tap_conn *conn = CONN(ref.flow); if (c->no_tcp) return; @@ -2874,7 +2874,7 @@ static void tcp_tap_sock_handler(struct ctx *c, struct tcp_tap_conn *conn, */ void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events) { - union flow *flow = flowtab + ref.tcp.index; + union flow *flow = FLOW(ref.flow); switch (flow->f.type) { case FLOW_TCP: diff --git a/tcp_splice.c b/tcp_splice.c index 28b91e1..5ebc4e5 100644 --- a/tcp_splice.c +++ b/tcp_splice.c @@ -128,8 +128,8 @@ static int tcp_splice_epoll_ctl(const struct ctx *c, { int m = conn->in_epoll ? EPOLL_CTL_MOD : EPOLL_CTL_ADD; union epoll_ref ref[SIDES] = { - { .type = EPOLL_TYPE_TCP, .fd = conn->s[0], .tcp.index = FLOW_IDX(conn) }, - { .type = EPOLL_TYPE_TCP, .fd = conn->s[1], .tcp.index = FLOW_IDX(conn) } + { .type = EPOLL_TYPE_TCP, .fd = conn->s[0], .flow = FLOW_IDX(conn) }, + { .type = EPOLL_TYPE_TCP, .fd = conn->s[1], .flow = FLOW_IDX(conn) } }; struct epoll_event ev[SIDES] = { { .data.u64 = ref[0].u64 }, { .data.u64 = ref[1].u64 } }; -- 2.43.0

David Gibson

12:33 a.m.

New subject: [PATCH v2 10/11] tcp_splice: Use unsigned to represent side

Currently, we use 'int' values to represent the "side" of a connection, which must always be 0 or 1. This turns out to be dangerous. In some cases we're going to want to put the side into a 1-bit bitfield. However, if that bitfield has type 'int', when we copy it out to a regular 'int' variable, it will be sign-extended and so have values 0 and -1, instead of 0 and 1. To avoid this, always use unsigned variables for the side. Signed-off-by: David Gibson --- tcp_splice.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tcp_splice.c b/tcp_splice.c index 5ebc4e5..9cec9c6 100644 --- a/tcp_splice.c +++ b/tcp_splice.c @@ -249,7 +249,7 @@ void tcp_splice_conn_update(const struct ctx *c, struct tcp_splice_conn *new) void tcp_splice_destroy(struct ctx *c, union flow *flow) { struct tcp_splice_conn *conn = &flow->tcp_splice; - int side; + unsigned side; for (side = 0; side < SIDES; side++) { if (conn->events & SPLICE_ESTABLISHED) { @@ -286,8 +286,8 @@ void tcp_splice_destroy(struct ctx *c, union flow *flow) static int tcp_splice_connect_finish(const struct ctx *c, struct tcp_splice_conn *conn) { + unsigned side; int i = 0; - int side; for (side = 0; side < SIDES; side++) { conn->pipe[side][0] = conn->pipe[side][1] = -1; @@ -490,7 +490,8 @@ void tcp_splice_sock_handler(struct ctx *c, struct tcp_splice_conn *conn, int s, uint32_t events) { uint8_t lowat_set_flag, lowat_act_flag; - int fromside, eof, never_read; + int eof, never_read; + unsigned fromside; if (conn->events == SPLICE_CLOSED) return; -- 2.43.0

David Gibson

12:33 a.m.

New subject: [PATCH v2 11/11] flow,tcp: Use epoll_ref type including flow and side

Currently TCP uses the 'flow' epoll_ref field for both connected sockets and timers, which consists of just the index of the relevant flow (connection). This is just fine for timers, for while it obviously works, it's subtly incomplete for sockets on spliced connections. In that case we want to know which side of the connection the event is occurring on as well as which connection. At present, we deduce that information by looking at the actual fd, and comparing it to the fds of the sockets on each side. When we use the flow table for more things, we expect more cases where something will need to know a specific side of a specific flow for an event, but nothing more. Therefore add a new 'flowside' epoll_ref field, with exactly that information. We use it for TCP connected sockets. This allows us to directly know the side for spliced connections. For "tap" connections, it's pretty meaningless, since the side is always the socket side. It still makes logical sense though, and it may become important for future flow table work. Signed-off-by: David Gibson --- flow.h | 2 +- passt.h | 2 ++ tcp.c | 11 ++++++++--- tcp_splice.c | 37 ++++++++++++------------------------- tcp_splice.h | 2 +- 5 files changed, 24 insertions(+), 30 deletions(-) diff --git a/flow.h b/flow.h index 3c90bbd..75e2115 100644 --- a/flow.h +++ b/flow.h @@ -45,7 +45,7 @@ struct flow_common { * @flow: Index of flow referenced */ typedef struct flow_sidx { - int side :1; + unsigned side :1; unsigned flow :FLOW_INDEX_BITS; } flow_sidx_t; static_assert(sizeof(flow_sidx_t) <= sizeof(uint32_t)); diff --git a/passt.h b/passt.h index 66a819f..33b493f 100644 --- a/passt.h +++ b/passt.h @@ -37,6 +37,7 @@ union epoll_ref; #include "pif.h" #include "packet.h" +#include "flow.h" #include "icmp.h" #include "port_fwd.h" #include "tcp.h" @@ -91,6 +92,7 @@ union epoll_ref { int32_t fd:FD_REF_BITS; union { uint32_t flow; + flow_sidx_t flowside; union tcp_listen_epoll_ref tcp_listen; union udp_epoll_ref udp; union icmp_epoll_ref icmp; diff --git a/tcp.c b/tcp.c index e80a6aa..24aba44 100644 --- a/tcp.c +++ b/tcp.c @@ -304,6 +304,10 @@ #include "tcp_conn.h" #include "flow_table.h" +/* Sides of a flow as we use them in "tap" connections */ +#define SOCKSIDE 0 +#define TAPSIDE 1 + #define TCP_FRAMES_MEM 128 #define TCP_FRAMES \ (c->mode == MODE_PASST ? TCP_FRAMES_MEM : 1) @@ -639,7 +643,7 @@ static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn) { int m = conn->in_epoll ? EPOLL_CTL_MOD : EPOLL_CTL_ADD; union epoll_ref ref = { .type = EPOLL_TYPE_TCP, .fd = conn->sock, - .flow = FLOW_IDX(conn) }; + .flowside = FLOW_SIDX(conn, SOCKSIDE) }; struct epoll_event ev = { .data.u64 = ref.u64 }; if (conn->events == CLOSED) { @@ -2874,14 +2878,15 @@ static void tcp_tap_sock_handler(struct ctx *c, struct tcp_tap_conn *conn, */ void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events) { - union flow *flow = FLOW(ref.flow); + union flow *flow = FLOW(ref.flowside.flow); switch (flow->f.type) { case FLOW_TCP: tcp_tap_sock_handler(c, &flow->tcp, events); break; case FLOW_TCP_SPLICE: - tcp_splice_sock_handler(c, &flow->tcp_splice, ref.fd, events); + tcp_splice_sock_handler(c, &flow->tcp_splice, ref.flowside.side, + events); break; default: die("Unexpected %s in tcp_sock_handler_compact()", diff --git a/tcp_splice.c b/tcp_splice.c index 9cec9c6..d5c351b 100644 --- a/tcp_splice.c +++ b/tcp_splice.c @@ -128,8 +128,10 @@ static int tcp_splice_epoll_ctl(const struct ctx *c, { int m = conn->in_epoll ? EPOLL_CTL_MOD : EPOLL_CTL_ADD; union epoll_ref ref[SIDES] = { - { .type = EPOLL_TYPE_TCP, .fd = conn->s[0], .flow = FLOW_IDX(conn) }, - { .type = EPOLL_TYPE_TCP, .fd = conn->s[1], .flow = FLOW_IDX(conn) } + { .type = EPOLL_TYPE_TCP, .fd = conn->s[0], + .flowside = FLOW_SIDX(conn, 0) }, + { .type = EPOLL_TYPE_TCP, .fd = conn->s[1], + .flowside = FLOW_SIDX(conn, 1) } }; struct epoll_event ev[SIDES] = { { .data.u64 = ref[0].u64 }, { .data.u64 = ref[1].u64 } }; @@ -481,13 +483,13 @@ bool tcp_splice_conn_from_sock(const struct ctx *c, * tcp_splice_sock_handler() - Handler for socket mapped to spliced connection * @c: Execution context * @conn: Connection state - * @s: Socket fd on which an event has occurred + * @side: Side of the connection on which an event has occurred * @events: epoll events bitmap * * #syscalls:pasta splice */ void tcp_splice_sock_handler(struct ctx *c, struct tcp_splice_conn *conn, - int s, uint32_t events) + int side, uint32_t events) { uint8_t lowat_set_flag, lowat_act_flag; int eof, never_read; @@ -507,30 +509,15 @@ void tcp_splice_sock_handler(struct ctx *c, struct tcp_splice_conn *conn, } if (events & EPOLLOUT) { - if (s == conn->s[0]) { - conn_event(c, conn, ~OUT_WAIT_0); - fromside = 1; - } else { - conn_event(c, conn, ~OUT_WAIT_1); - fromside = 0; - } + fromside = !side; + conn_event(c, conn, side == 0 ? ~OUT_WAIT_0 : ~OUT_WAIT_1); } else { - fromside = s == conn->s[0] ? 0 : 1; - } - - if (events & EPOLLRDHUP) { - if (s == conn->s[0]) - conn_event(c, conn, FIN_RCVD_0); - else - conn_event(c, conn, FIN_RCVD_1); + fromside = side; } - if (events & EPOLLHUP) { - if (s == conn->s[0]) - conn_event(c, conn, FIN_SENT_0); /* Fake, but implied */ - else - conn_event(c, conn, FIN_SENT_1); - } + if (events & EPOLLRDHUP) + /* For side 0 this is fake, but implied */ + conn_event(c, conn, side == 0 ? FIN_RCVD_0 : FIN_RCVD_1); swap: eof = 0; diff --git a/tcp_splice.h b/tcp_splice.h index dc486f1..aa85c7c 100644 --- a/tcp_splice.h +++ b/tcp_splice.h @@ -9,7 +9,7 @@ struct tcp_splice_conn; void tcp_splice_sock_handler(struct ctx *c, struct tcp_splice_conn *conn, - int s, uint32_t events); + int side, uint32_t events); bool tcp_splice_conn_from_sock(const struct ctx *c, union tcp_listen_epoll_ref ref, struct tcp_splice_conn *conn, int s, -- 2.43.0

624

Age (days ago)

629

Last active (days ago)

List overview

Download

17 comments

2 participants

participants (2)

David Gibson
Stefano Brivio

[PATCH v2 00/11] Introduce unified flow table, first steps

tags

participants (2)