This is a bit of a diversion from what I'm notionally working on at the
moment.  While thinking about what we'd need to use the IP_TRANSPARENT
socket option to broaden the cases where we can "splice", I noticed some
inelegancies in how we handle the pool of pre-opened sockets in the TCP
code, and well, here we are.

This makes a number of cleanups to the handling of these pools.  Most
notably, tcp_splice_connect() now has simpler semantics: it now always
runs in the init namespace, and is always given a pre-opened socket
(which could be in the guest ns).

Changes since v1:
 * Rebased
 * Improved wording of some commit messages

David Gibson (5):
  tcp: Make a helper to refill each socket pool
  tcp: Split init and ns cases for tcp_sock_refill()
  tcp: Move socket pool declarations around
  tcp: Split pool lookup from creating new sockets in tcp_conn_new_sock()
  tcp: Improve handling of fallback if socket pool is empty on new splice

 tcp.c        | 138 ++++++++++++++++++------------------------------
 tcp.h        |   2 -
 tcp_conn.h   |  12 ++++-
 tcp_splice.c | 141 ++++++++++++++++++++++++++-------------------------
 4 files changed, 132 insertions(+), 161 deletions(-)

-- 
2.39.1
tcp_sock_refill() contains two near-identical loops to refill the IPv4 and
IPv6 socket pools.  In addition, if we get an error on the IPv4 pool we
exit early and won't attempt to refill the IPv6 pool.

At least theoretically, the two pools are independent of each other and
there's value in filling up either one without the other, so there's no
strong reason to give up on one because the other failed.

Address both of these with a helper function 'tcp_sock_refill_pool()' to
refill a single given pool.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
 tcp.c | 63 +++++++++++++++++++++++++++++++----------------------------
 1 file changed, 33 insertions(+), 30 deletions(-)

diff --git a/tcp.c b/tcp.c
index 8424d8e..5d13244 100644
--- a/tcp.c
+++ b/tcp.c
@@ -3009,6 +3009,34 @@ static int tcp_ns_socks_init(void *arg)
 	return 0;
 }
 
+/**
+ * tcp_sock_refill_pool() - Refill one pool of pre-opened sockets
+ * @c:		Execution context
+ * @pool:	Pool of sockets to refill
+ * @af:		Address family to use
+ */
+static void tcp_sock_refill_pool(const struct ctx *c, int pool[], int af)
+{
+	int i;
+
+	for (i = 0; i < TCP_SOCK_POOL_SIZE; i++) {
+		int *s = &pool[i];
+
+		if (*s >= 0)
+			break;
+
+		*s = socket(af, SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP);
+		if (*s > SOCKET_MAX) {
+			close(*s);
+			*s = -1;
+			return;
+		}
+
+		if (*s >= 0)
+			tcp_sock_set_bufsize(c, *s);
+	}
+}
+
 /**
  * struct tcp_sock_refill_arg - Arguments for tcp_sock_refill()
  * @c:		Execution context
@@ -3028,7 +3056,7 @@ struct tcp_sock_refill_arg {
 static int tcp_sock_refill(void *arg)
 {
 	struct tcp_sock_refill_arg *a = (struct tcp_sock_refill_arg *)arg;
-	int i, *p4, *p6;
+	int *p4, *p6;
 
 	if (a->ns) {
 		ns_enter(a->c);
@@ -3039,36 +3067,11 @@ static int tcp_sock_refill(void *arg)
 		p6 = init_sock_pool6;
 	}
 
-	for (i = 0; a->c->ifi4 && i < TCP_SOCK_POOL_SIZE; i++, p4++) {
-		if (*p4 >= 0)
-			break;
-
-		*p4 = socket(AF_INET, SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP);
-		if (*p4 > SOCKET_MAX) {
-			close(*p4);
-			*p4 = -1;
-			return -EIO;
-		}
-
-		if (*p4 >= 0)
-			tcp_sock_set_bufsize(a->c, *p4);
-	}
-
-	for (i = 0; a->c->ifi6 && i < TCP_SOCK_POOL_SIZE; i++, p6++) {
-		if (*p6 >= 0)
-			break;
-
-		*p6 = socket(AF_INET6, SOCK_STREAM | SOCK_NONBLOCK,
-			     IPPROTO_TCP);
-		if (*p6 > SOCKET_MAX) {
-			close(*p6);
-			*p6 = -1;
-			return -EIO;
-		}
+	if (a->c->ifi4)
+		tcp_sock_refill_pool(a->c, p4, AF_INET);
 
-		if (*p6 >= 0)
-			tcp_sock_set_bufsize(a->c, *p6);
-	}
+	if (a->c->ifi6)
+		tcp_sock_refill_pool(a->c, p6, AF_INET6);
 
 	return 0;
 }
-- 
2.39.1
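A note on the loop shape in the new helper: the pool is consumed from the
front, so a refill only has to walk the leading empty slots and can stop as
soon as it meets a descriptor that is still valid, and any failure is
contained to the pool being refilled.  A minimal standalone sketch of that
idea follows; the names are illustrative and it drops the SOCKET_MAX cap
and buffer-size tuning that the real tcp_sock_refill_pool() keeps:

#define _GNU_SOURCE		/* for SOCK_NONBLOCK on older glibc */
#include <sys/socket.h>
#include <netinet/in.h>

#define POOL_SIZE 32		/* stands in for TCP_SOCK_POOL_SIZE */

/* Refill the leading empty (-1) slots of one pool, stopping at the first
 * slot that still holds a socket; a failure gives up on this pool only.
 */
static void pool_refill(int pool[POOL_SIZE], int af)
{
	int i;

	for (i = 0; i < POOL_SIZE; i++) {
		if (pool[i] >= 0)
			break;	/* rest of the pool is still populated */

		pool[i] = socket(af, SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP);
		if (pool[i] < 0)
			return;
	}
}

Called once per enabled address family, a failure while refilling the IPv4
pool no longer prevents the IPv6 pool from being topped up, which is the
behavioural fix described in the commit message.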
With the creation of the tcp_sock_refill_pool() helper, the ns==true and
ns==false paths for tcp_sock_refill() now have almost nothing in common.
Split the two versions into tcp_sock_refill_init() and tcp_sock_refill_ns()
functions.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
 tcp.c | 53 +++++++++++++++++++++++++---------------------------------
 1 file changed, 21 insertions(+), 32 deletions(-)

diff --git a/tcp.c b/tcp.c
index 5d13244..1d1b4ee 100644
--- a/tcp.c
+++ b/tcp.c
@@ -3038,40 +3038,33 @@ static void tcp_sock_refill_pool(const struct ctx *c, int pool[], int af)
 }
 
 /**
- * struct tcp_sock_refill_arg - Arguments for tcp_sock_refill()
+ * tcp_sock_refill_init() - Refill pools of pre-opened sockets in init ns
  * @c:		Execution context
- * @ns:		Set to refill pool of sockets created in namespace
  */
-struct tcp_sock_refill_arg {
-	struct ctx *c;
-	int ns;
-};
+static void tcp_sock_refill_init(const struct ctx *c)
+{
+	if (c->ifi4)
+		tcp_sock_refill_pool(c, init_sock_pool4, AF_INET);
+	if (c->ifi6)
+		tcp_sock_refill_pool(c, init_sock_pool6, AF_INET6);
+}
 
 /**
- * tcp_sock_refill() - Refill pool of pre-opened sockets
- * @arg:	See @tcp_sock_refill_arg
+ * tcp_sock_refill_ns() - Refill pools of pre-opened sockets in namespace
+ * @arg:	Execution context cast to void *
  *
  * Return: 0
  */
-static int tcp_sock_refill(void *arg)
+static int tcp_sock_refill_ns(void *arg)
 {
-	struct tcp_sock_refill_arg *a = (struct tcp_sock_refill_arg *)arg;
-	int *p4, *p6;
+	const struct ctx *c = (const struct ctx *)arg;
 
-	if (a->ns) {
-		ns_enter(a->c);
-		p4 = ns_sock_pool4;
-		p6 = ns_sock_pool6;
-	} else {
-		p4 = init_sock_pool4;
-		p6 = init_sock_pool6;
-	}
-
-	if (a->c->ifi4)
-		tcp_sock_refill_pool(a->c, p4, AF_INET);
+	ns_enter(c);
 
-	if (a->c->ifi6)
-		tcp_sock_refill_pool(a->c, p6, AF_INET6);
+	if (c->ifi4)
+		tcp_sock_refill_pool(c, ns_sock_pool4, AF_INET);
+	if (c->ifi6)
+		tcp_sock_refill_pool(c, ns_sock_pool6, AF_INET6);
 
 	return 0;
 }
@@ -3084,7 +3077,6 @@ static int tcp_sock_refill(void *arg)
  */
 int tcp_init(struct ctx *c)
 {
-	struct tcp_sock_refill_arg refill_arg = { c, 0 };
 	int i;
 #ifndef HAS_GETRANDOM
 	int dev_random = open("/dev/random", O_RDONLY);
@@ -3130,15 +3122,14 @@ int tcp_init(struct ctx *c)
 	memset(tcp_sock_init_ext, 0xff, sizeof(tcp_sock_init_ext));
 	memset(tcp_sock_ns, 0xff, sizeof(tcp_sock_ns));
 
-	tcp_sock_refill(&refill_arg);
+	tcp_sock_refill_init(c);
 
 	if (c->mode == MODE_PASTA) {
 		tcp_splice_init(c);
 
 		NS_CALL(tcp_ns_socks_init, c);
 
-		refill_arg.ns = 1;
-		NS_CALL(tcp_sock_refill, &refill_arg);
+		NS_CALL(tcp_sock_refill_ns, c);
 	}
 
 	return 0;
@@ -3258,7 +3249,6 @@ static int tcp_port_rebind(void *arg)
  */
 void tcp_timer(struct ctx *c, const struct timespec *ts)
 {
-	struct tcp_sock_refill_arg refill_arg = { c, 0 };
 	union tcp_conn *conn;
 
 	(void)ts;
@@ -3291,12 +3281,11 @@ void tcp_timer(struct ctx *c, const struct timespec *ts)
 		}
 	}
 
-	tcp_sock_refill(&refill_arg);
+	tcp_sock_refill_init(c);
 	if (c->mode == MODE_PASTA) {
-		refill_arg.ns = 1;
 		if ((c->ifi4 && ns_sock_pool4[TCP_SOCK_POOL_TSH] < 0) ||
 		    (c->ifi6 && ns_sock_pool6[TCP_SOCK_POOL_TSH] < 0))
-			NS_CALL(tcp_sock_refill, &refill_arg);
+			NS_CALL(tcp_sock_refill_ns, c);
 
 		tcp_splice_pipe_refill(c);
 	}
-- 
2.39.1
tcp_splice.c has some explicit extern declarations to access the socket
pools.  This is pretty dangerous - if we changed the type of these
variables in tcp.c, we'd have tcp.c and tcp_splice.c using the same memory
in different ways with no compiler error.  So, move the extern declarations
to tcp_conn.h so they're visible to both tcp.c and tcp_splice.c, but not
the rest of pasta.

In fact the pools for the guest namespace are necessarily only used by
tcp_splice.c - we have no sockets on the guest side if we're not splicing.
So move those declarations and the functions that deal exclusively with
them to tcp_splice.c.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
 tcp.c        | 41 ++++-------------------------------------
 tcp.h        |  2 --
 tcp_conn.h   | 10 ++++++++--
 tcp_splice.c | 50 +++++++++++++++++++++++++++++++++++++++++++-------
 4 files changed, 55 insertions(+), 48 deletions(-)

diff --git a/tcp.c b/tcp.c
index 1d1b4ee..2f40d62 100644
--- a/tcp.c
+++ b/tcp.c
@@ -369,8 +369,6 @@ struct tcp6_l2_head { /* For MSS6 macro: keep in sync with tcp6_l2_buf_t */
 #define FIN_TIMEOUT			60
 #define ACT_TIMEOUT			7200
 
-#define TCP_SOCK_POOL_TSH		16 /* Refill in ns if > x used */
-
 #define LOW_RTT_TABLE_SIZE		8
 #define LOW_RTT_THRESHOLD		10 /* us */
 
@@ -594,11 +592,9 @@ static inline struct tcp_tap_conn *conn_at_idx(int index)
 /* Table for lookup from remote address, local port, remote port */
 static struct tcp_tap_conn *tc_hash[TCP_HASH_TABLE_SIZE];
 
-/* Pools for pre-opened sockets */
+/* Pools for pre-opened sockets (in init) */
 int init_sock_pool4		[TCP_SOCK_POOL_SIZE];
 int init_sock_pool6		[TCP_SOCK_POOL_SIZE];
-int ns_sock_pool4		[TCP_SOCK_POOL_SIZE];
-int ns_sock_pool6		[TCP_SOCK_POOL_SIZE];
 
 /**
  * tcp_conn_epoll_events() - epoll events mask for given connection state
@@ -3015,7 +3011,7 @@ static int tcp_ns_socks_init(void *arg)
  * @pool:	Pool of sockets to refill
  * @af:		Address family to use
  */
-static void tcp_sock_refill_pool(const struct ctx *c, int pool[], int af)
+void tcp_sock_refill_pool(const struct ctx *c, int pool[], int af)
 {
 	int i;
 
@@ -3049,26 +3045,6 @@ static void tcp_sock_refill_init(const struct ctx *c)
 		tcp_sock_refill_pool(c, init_sock_pool6, AF_INET6);
 }
 
-/**
- * tcp_sock_refill_ns() - Refill pools of pre-opened sockets in namespace
- * @arg:	Execution context cast to void *
- *
- * Return: 0
- */
-static int tcp_sock_refill_ns(void *arg)
-{
-	const struct ctx *c = (const struct ctx *)arg;
-
-	ns_enter(c);
-
-	if (c->ifi4)
-		tcp_sock_refill_pool(c, ns_sock_pool4, AF_INET);
-	if (c->ifi6)
-		tcp_sock_refill_pool(c, ns_sock_pool6, AF_INET6);
-
-	return 0;
-}
-
 /**
  * tcp_init() - Get initial sequence, hash secret, initialise per-socket data
  * @c:		Execution context
@@ -3117,8 +3093,6 @@ int tcp_init(struct ctx *c)
 
 	memset(init_sock_pool4, 0xff, sizeof(init_sock_pool4));
 	memset(init_sock_pool6, 0xff, sizeof(init_sock_pool6));
-	memset(ns_sock_pool4, 0xff, sizeof(ns_sock_pool4));
-	memset(ns_sock_pool6, 0xff, sizeof(ns_sock_pool6));
 	memset(tcp_sock_init_ext, 0xff, sizeof(tcp_sock_init_ext));
 	memset(tcp_sock_ns, 0xff, sizeof(tcp_sock_ns));
 
@@ -3128,8 +3102,6 @@ int tcp_init(struct ctx *c)
 		tcp_splice_init(c);
 
 		NS_CALL(tcp_ns_socks_init, c);
-
-		NS_CALL(tcp_sock_refill_ns, c);
 	}
 
 	return 0;
@@ -3282,11 +3254,6 @@ void tcp_timer(struct ctx *c, const struct timespec *ts)
 	}
 
 	tcp_sock_refill_init(c);
-	if (c->mode == MODE_PASTA) {
-		if ((c->ifi4 && ns_sock_pool4[TCP_SOCK_POOL_TSH] < 0) ||
-		    (c->ifi6 && ns_sock_pool6[TCP_SOCK_POOL_TSH] < 0))
-			NS_CALL(tcp_sock_refill_ns, c);
-
-		tcp_splice_pipe_refill(c);
-	}
+	if (c->mode == MODE_PASTA)
+		tcp_splice_refill(c);
 }
diff --git a/tcp.h b/tcp.h
index 739b451..236038f 100644
--- a/tcp.h
+++ b/tcp.h
@@ -11,8 +11,6 @@
 #define TCP_CONN_INDEX_BITS	17 /* 128k */
 #define TCP_MAX_CONNS		(1 << TCP_CONN_INDEX_BITS)
 
-#define TCP_SOCK_POOL_SIZE	32
-
 struct ctx;
 
 void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events,
diff --git a/tcp_conn.h b/tcp_conn.h
index 70f4a7c..9951b0a 100644
--- a/tcp_conn.h
+++ b/tcp_conn.h
@@ -182,11 +182,17 @@ union tcp_conn {
 /* TCP connections */
 extern union tcp_conn tc[];
 
+/* Socket pools */
+#define TCP_SOCK_POOL_SIZE	32
+
+extern int init_sock_pool4	[TCP_SOCK_POOL_SIZE];
+extern int init_sock_pool6	[TCP_SOCK_POOL_SIZE];
+
 void tcp_splice_conn_update(struct ctx *c, struct tcp_splice_conn *new);
 void tcp_table_compact(struct ctx *c, union tcp_conn *hole);
 void tcp_splice_destroy(struct ctx *c, union tcp_conn *conn_union);
 void tcp_splice_timer(struct ctx *c, union tcp_conn *conn_union);
-void tcp_splice_pipe_refill(const struct ctx *c);
-
+void tcp_sock_refill_pool(const struct ctx *c, int pool[], int af);
+void tcp_splice_refill(const struct ctx *c);
 
 #endif /* TCP_CONN_H */
diff --git a/tcp_splice.c b/tcp_splice.c
index 1d624f1..09f0e3e 100644
--- a/tcp_splice.c
+++ b/tcp_splice.c
@@ -60,11 +60,11 @@
 #define TCP_SPLICE_CONN_PRESSURE	30	/* % of conn_count */
 #define TCP_SPLICE_FILE_PRESSURE	30	/* % of c->nofile */
 
-/* From tcp.c */
-extern int init_sock_pool4	[TCP_SOCK_POOL_SIZE];
-extern int init_sock_pool6	[TCP_SOCK_POOL_SIZE];
-extern int ns_sock_pool4	[TCP_SOCK_POOL_SIZE];
-extern int ns_sock_pool6	[TCP_SOCK_POOL_SIZE];
+/* Pools for pre-opened sockets (in namespace) */
+#define TCP_SOCK_POOL_TSH	16 /* Refill in ns if > x used */
+
+static int ns_sock_pool4	[TCP_SOCK_POOL_SIZE];
+static int ns_sock_pool6	[TCP_SOCK_POOL_SIZE];
 
 /* Pool of pre-opened pipes */
 static int splice_pipe_pool	[TCP_SPLICE_PIPE_POOL_SIZE][2][2];
@@ -782,7 +782,7 @@ smaller:
  * tcp_splice_pipe_refill() - Refill pool of pre-opened pipes
  * @c:		Execution context
  */
-void tcp_splice_pipe_refill(const struct ctx *c)
+static void tcp_splice_pipe_refill(const struct ctx *c)
 {
 	int i;
 
@@ -811,6 +811,39 @@ void tcp_splice_pipe_refill(const struct ctx *c)
 	}
 }
 
+/**
+ * tcp_sock_refill_ns() - Refill pools of pre-opened sockets in namespace
+ * @arg:	Execution context cast to void *
+ *
+ * Return: 0
+ */
+static int tcp_sock_refill_ns(void *arg)
+{
+	const struct ctx *c = (const struct ctx *)arg;
+
+	ns_enter(c);
+
+	if (c->ifi4)
+		tcp_sock_refill_pool(c, ns_sock_pool4, AF_INET);
+	if (c->ifi6)
+		tcp_sock_refill_pool(c, ns_sock_pool6, AF_INET6);
+
+	return 0;
+}
+
+/**
+ * tcp_splice_refill() - Refill pools of resources needed for splicing
+ * @c:		Execution context
+ */
+void tcp_splice_refill(const struct ctx *c)
+{
+	if ((c->ifi4 && ns_sock_pool4[TCP_SOCK_POOL_TSH] < 0) ||
+	    (c->ifi6 && ns_sock_pool6[TCP_SOCK_POOL_TSH] < 0))
+		NS_CALL(tcp_sock_refill_ns, c);
+
+	tcp_splice_pipe_refill(c);
+}
+
 /**
  * tcp_splice_init() - Initialise pipe pool and size
  * @c:		Execution context
@@ -819,7 +852,10 @@ void tcp_splice_init(struct ctx *c)
 {
 	memset(splice_pipe_pool, 0xff, sizeof(splice_pipe_pool));
 	tcp_set_pipe_size(c);
-	tcp_splice_pipe_refill(c);
+
+	memset(&ns_sock_pool4, 0xff, sizeof(ns_sock_pool4));
+	memset(&ns_sock_pool6, 0xff, sizeof(ns_sock_pool6));
+	NS_CALL(tcp_sock_refill_ns, c);
 }
 
 /**
-- 
2.39.1
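To make the hazard described in the commit message concrete: nothing in C
stops two translation units from declaring the same object with different
types, as long as each unit only sees its own declaration, and the mismatch
typically passes compile and link without a diagnostic (unless LTO or extra
link-time checks are enabled).  A contrived two-file sketch of the failure
mode the shared tcp_conn.h declarations now rule out; the file and symbol
names here are made up for illustration:

/* pools.c -- the defining translation unit */
int sock_pool[32];			/* the real object: 32 ints */

/* user.c -- a stale local declaration instead of a shared header */
extern long sock_pool[16];		/* wrong element type and size... */

long first_slot(void)
{
	return sock_pool[0];		/* ...yet this builds cleanly and
					 * silently reads 8 bytes from a
					 * 4-byte slot on LP64 targets */
}

Keeping the extern declarations next to their users in tcp_conn.h means any
future change to the pool arrays' type is checked by the compiler in every
file that touches them.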
tcp_conn_new_sock() first looks for a socket in a pre-opened pool, then if
that's empty creates a new socket in the init namespace.  Both parts of
this are duplicated in other places: the pool lookup logic is duplicated in
tcp_splice_new(), and the socket opening logic is duplicated in
tcp_sock_refill_pool().

Split the function into separate parts so we can remove both these
duplications.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
 tcp.c        | 53 +++++++++++++++++++++++++++-------------------------
 tcp_conn.h   |  1 +
 tcp_splice.c |  8 ++------
 3 files changed, 31 insertions(+), 31 deletions(-)

diff --git a/tcp.c b/tcp.c
index 2f40d62..a0e2e34 100644
--- a/tcp.c
+++ b/tcp.c
@@ -1858,24 +1858,35 @@ static void tcp_seq_init(const struct ctx *c, struct tcp_tap_conn *conn,
 }
 
 /**
- * tcp_conn_new_sock() - Get socket for new connection from pool or make new one
- * @c:		Execution context
- * @af:		Address family
+ * tcp_conn_pool_sock() - Get socket for new connection from pre-opened pool
+ * @pool:	Pool of pre-opened sockets
  *
- * Return: socket number if available, negative code if socket creation failed
+ * Return: socket number if available, negative code if pool is empty
  */
-static int tcp_conn_new_sock(const struct ctx *c, sa_family_t af)
+int tcp_conn_pool_sock(int pool[])
 {
-	int *p = af == AF_INET6 ? init_sock_pool6 : init_sock_pool4, i, s = -1;
+	int s = -1, i;
 
-	for (i = 0; i < TCP_SOCK_POOL_SIZE; i++, p++) {
-		SWAP(s, *p);
+	for (i = 0; i < TCP_SOCK_POOL_SIZE; i++) {
+		SWAP(s, pool[i]);
 		if (s >= 0)
-			break;
+			return s;
 	}
+	return -1;
+}
 
-	if (s < 0)
-		s = socket(af, SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP);
+/**
+ * tcp_conn_new_sock() - Open and prepare new socket for connection
+ * @c:		Execution context
+ * @af:		Address family
+ *
+ * Return: socket number on success, negative code if socket creation failed
+ */
+static int tcp_conn_new_sock(const struct ctx *c, sa_family_t af)
+{
+	int s;
+
+	s = socket(af, SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP);
 
 	if (s > SOCKET_MAX) {
 		close(s);
@@ -1936,6 +1947,7 @@ static void tcp_conn_from_tap(struct ctx *c, int af, const void *addr,
 			      const struct tcphdr *th, const char *opts,
 			      size_t optlen, const struct timespec *now)
 {
+	int *pool = af == AF_INET6 ? init_sock_pool6 : init_sock_pool4;
 	struct sockaddr_in addr4 = {
 		.sin_family = AF_INET,
 		.sin_port = th->dest,
@@ -1954,8 +1966,9 @@ static void tcp_conn_from_tap(struct ctx *c, int af, const void *addr,
 	if (c->tcp.conn_count >= TCP_MAX_CONNS)
 		return;
 
-	if ((s = tcp_conn_new_sock(c, af)) < 0)
-		return;
+	if ((s = tcp_conn_pool_sock(pool)) < 0)
+		if ((s = tcp_conn_new_sock(c, af)) < 0)
+			return;
 
 	if (!c->no_map_gw) {
 		if (af == AF_INET && IN4_ARE_ADDR_EQUAL(addr, &c->ip4.gw))
@@ -3016,20 +3029,10 @@ void tcp_sock_refill_pool(const struct ctx *c, int pool[], int af)
 	int i;
 
 	for (i = 0; i < TCP_SOCK_POOL_SIZE; i++) {
-		int *s = &pool[i];
-
-		if (*s >= 0)
+		if (pool[i] >= 0)
 			break;
 
-		*s = socket(af, SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP);
-		if (*s > SOCKET_MAX) {
-			close(*s);
-			*s = -1;
-			return;
-		}
-
-		if (*s >= 0)
-			tcp_sock_set_bufsize(c, *s);
+		pool[i] = tcp_conn_new_sock(c, af);
 	}
 }
 
diff --git a/tcp_conn.h b/tcp_conn.h
index 9951b0a..c807e8b 100644
--- a/tcp_conn.h
+++ b/tcp_conn.h
@@ -192,6 +192,7 @@ void tcp_splice_conn_update(struct ctx *c, struct tcp_splice_conn *new);
 void tcp_table_compact(struct ctx *c, union tcp_conn *hole);
 void tcp_splice_destroy(struct ctx *c, union tcp_conn *conn_union);
 void tcp_splice_timer(struct ctx *c, union tcp_conn *conn_union);
+int tcp_conn_pool_sock(int pool[]);
 void tcp_sock_refill_pool(const struct ctx *c, int pool[], int af);
 void tcp_splice_refill(const struct ctx *c);
 
diff --git a/tcp_splice.c b/tcp_splice.c
index 09f0e3e..3bf6368 100644
--- a/tcp_splice.c
+++ b/tcp_splice.c
@@ -451,18 +451,14 @@ static int tcp_splice_connect_ns(void *arg)
 static int tcp_splice_new(const struct ctx *c, struct tcp_splice_conn *conn,
 			  in_port_t port, int outbound)
 {
-	int *p, i, s = -1;
+	int *p, s = -1;
 
 	if (outbound)
 		p = CONN_V6(conn) ? init_sock_pool6 : init_sock_pool4;
 	else
 		p = CONN_V6(conn) ? ns_sock_pool6 : ns_sock_pool4;
 
-	for (i = 0; i < TCP_SOCK_POOL_SIZE; i++, p++) {
-		SWAP(s, *p);
-		if (s >= 0)
-			break;
-	}
+	s = tcp_conn_pool_sock(p);
 
 	/* No socket available in namespace: create a new one for connect() */
 	if (s < 0 && !outbound) {
-- 
2.39.1
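For readers not familiar with passt's SWAP() macro: the loop in
tcp_conn_pool_sock() exchanges the caller's s (initially -1) with each
slot, so taking a descriptor automatically leaves -1 behind and already
empty slots are skipped.  The net effect is equivalent to this plain-C
sketch; it is illustrative only, the real code keeps the SWAP() form:

#define POOL_SIZE 32		/* stands in for TCP_SOCK_POOL_SIZE */

/* Take the first available descriptor, marking its slot empty (-1);
 * return -1 if the whole pool has been used up.
 */
static int pool_take(int pool[POOL_SIZE])
{
	int i;

	for (i = 0; i < POOL_SIZE; i++) {
		if (pool[i] >= 0) {
			int s = pool[i];

			pool[i] = -1;	/* what SWAP(s, pool[i]) leaves behind */
			return s;
		}
	}
	return -1;		/* pool exhausted, caller falls back */
}

This take-from-the-front behaviour is what lets tcp_sock_refill_pool() stop
its refill at the first slot that still holds a valid descriptor.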
When creating a new spliced connection, we need to get a socket in the
other ns from the originating one.  To avoid excessive ns switches we
usually get these from a pool refilled on a timer.  However, if the pool
runs out we need a fallback.

Currently that's done by passing -1 as the socket to tcp_splice_connect()
and running it in the target ns.  This means that tcp_splice_connect()
itself needs to have different cases depending on whether it's given an
existing socket or not, which is a separate concern from what it's mostly
doing.  We change it to require a suitable open socket to be passed in, and
ensure in the caller that we have one.

This requires adding the fallback paths to the caller, tcp_splice_new().
We use slightly different approaches for a socket in the init ns versus the
guest ns.

This also means that we no longer need to run tcp_splice_connect() itself
in the guest ns, which allows us to remove a bunch of boilerplate code.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
 tcp.c        |  2 +-
 tcp_conn.h   |  1 +
 tcp_splice.c | 89 ++++++++++++++++++----------------------------------
 3 files changed, 32 insertions(+), 60 deletions(-)

diff --git a/tcp.c b/tcp.c
index a0e2e34..f028e01 100644
--- a/tcp.c
+++ b/tcp.c
@@ -1882,7 +1882,7 @@ int tcp_conn_pool_sock(int pool[])
  *
  * Return: socket number on success, negative code if socket creation failed
  */
-static int tcp_conn_new_sock(const struct ctx *c, sa_family_t af)
+int tcp_conn_new_sock(const struct ctx *c, sa_family_t af)
 {
 	int s;
 
diff --git a/tcp_conn.h b/tcp_conn.h
index c807e8b..a499f34 100644
--- a/tcp_conn.h
+++ b/tcp_conn.h
@@ -193,6 +193,7 @@ void tcp_splice_conn_update(struct ctx *c, struct tcp_splice_conn *new);
 void tcp_table_compact(struct ctx *c, union tcp_conn *hole);
 void tcp_splice_destroy(struct ctx *c, union tcp_conn *conn_union);
 void tcp_splice_timer(struct ctx *c, union tcp_conn *conn_union);
 int tcp_conn_pool_sock(int pool[]);
+int tcp_conn_new_sock(const struct ctx *c, sa_family_t af);
 void tcp_sock_refill_pool(const struct ctx *c, int pool[], int af);
 void tcp_splice_refill(const struct ctx *c);
 
diff --git a/tcp_splice.c b/tcp_splice.c
index 3bf6368..84f855e 100644
--- a/tcp_splice.c
+++ b/tcp_splice.c
@@ -87,6 +87,9 @@ static const char *tcp_splice_flag_str[] __attribute((__unused__)) = {
 	"RCVLOWAT_ACT_B", "CLOSING",
 };
 
+/* Forward declaration */
+static int tcp_sock_refill_ns(void *arg);
+
 /**
  * tcp_splice_conn_epoll_events() - epoll events masks for given state
  * @events:	Connection event flags
@@ -347,12 +350,8 @@ static int tcp_splice_connect_finish(const struct ctx *c,
  * Return: 0 for connect() succeeded or in progress, negative value on error
  */
 static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn,
-			      int s, in_port_t port)
+			      int sock_conn, in_port_t port)
 {
-	int sock_conn = (s >= 0) ? s : socket(CONN_V6(conn) ? AF_INET6 :
-							      AF_INET,
-					      SOCK_STREAM | SOCK_NONBLOCK,
-					      IPPROTO_TCP);
 	struct sockaddr_in6 addr6 = {
 		.sin6_family = AF_INET6,
 		.sin6_port = htons(port),
@@ -366,19 +365,8 @@ static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn,
 	const struct sockaddr *sa;
 	socklen_t sl;
 
-	if (sock_conn < 0)
-		return -errno;
-
-	if (sock_conn > SOCKET_MAX) {
-		close(sock_conn);
-		return -EIO;
-	}
-
 	conn->b = sock_conn;
 
-	if (s < 0)
-		tcp_sock_set_bufsize(c, conn->b);
-
 	if (setsockopt(conn->b, SOL_TCP, TCP_QUICKACK, &((int){ 1 }),
 		       sizeof(int))) {
 		trace("TCP (spliced): failed to set TCP_QUICKACK on socket %i",
@@ -409,36 +397,6 @@ static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn,
 	return 0;
 }
 
-/**
- * struct tcp_splice_connect_ns_arg - Arguments for tcp_splice_connect_ns()
- * @c:		Execution context
- * @conn:	Accepted inbound connection
- * @port:	Destination port, host order
- * @ret:	Return value of tcp_splice_connect_ns()
- */
-struct tcp_splice_connect_ns_arg {
-	const struct ctx *c;
-	struct tcp_splice_conn *conn;
-	in_port_t port;
-	int ret;
-};
-
-/**
- * tcp_splice_connect_ns() - Enter namespace and call tcp_splice_connect()
- * @arg:	See struct tcp_splice_connect_ns_arg
- *
- * Return: 0
- */
-static int tcp_splice_connect_ns(void *arg)
-{
-	struct tcp_splice_connect_ns_arg *a;
-
-	a = (struct tcp_splice_connect_ns_arg *)arg;
-	ns_enter(a->c);
-	a->ret = tcp_splice_connect(a->c, a->conn, -1, a->port);
-	return 0;
-}
-
 /**
  * tcp_splice_new() - Handle new spliced connection
  * @c:		Execution context
@@ -451,24 +409,37 @@ static int tcp_splice_connect_ns(void *arg)
 static int tcp_splice_new(const struct ctx *c, struct tcp_splice_conn *conn,
 			  in_port_t port, int outbound)
 {
-	int *p, s = -1;
-
-	if (outbound)
-		p = CONN_V6(conn) ? init_sock_pool6 : init_sock_pool4;
-	else
-		p = CONN_V6(conn) ? ns_sock_pool6 : ns_sock_pool4;
+	int s = -1;
+
+	/* If the pool is empty we take slightly different approaches
+	 * for init or ns sockets.  For init sockets we just open a
+	 * new one without refilling the pool to keep latency down.
+	 * For ns sockets, we're going to incur the latency of
+	 * entering the ns anyway, so we might as well refill the
+	 * pool.
+	 */
+	if (outbound) {
+		int *p = CONN_V6(conn) ? init_sock_pool6 : init_sock_pool4;
+		int af = CONN_V6(conn) ? AF_INET6 : AF_INET;
+
+		s = tcp_conn_pool_sock(p);
+		if (s < 0)
+			s = tcp_conn_new_sock(c, af);
+	} else {
+		int *p = CONN_V6(conn) ? ns_sock_pool6 : ns_sock_pool4;
 
-	s = tcp_conn_pool_sock(p);
+		/* If pool is empty, refill it first */
+		if (p[TCP_SOCK_POOL_SIZE-1] < 0)
+			NS_CALL(tcp_sock_refill_ns, c);
 
-	/* No socket available in namespace: create a new one for connect() */
-	if (s < 0 && !outbound) {
-		struct tcp_splice_connect_ns_arg ns_arg = { c, conn, port, 0 };
+		s = tcp_conn_pool_sock(p);
+	}
 
-		NS_CALL(tcp_splice_connect_ns, &ns_arg);
-		return ns_arg.ret;
+	if (s < 0) {
+		warn("Couldn't open connectable socket for splice (%d)", s);
+		return s;
 	}
 
-	/* Otherwise, the socket will connect on the side it was created on */
 	return tcp_splice_connect(c, conn, s, port);
 }
 
-- 
2.39.1
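One detail of the fallback worth spelling out: the namespace branch decides
whether a refill is needed by looking only at the last slot,
p[TCP_SOCK_POOL_SIZE-1].  That is enough because, barring refill failures,
descriptors are taken from the front of the pool and refilled from the
front, so the valid entries always sit at the tail and the last slot is the
last one to be emptied.  A small sketch of that check; the names here are
illustrative, not the passt code:

#define POOL_SIZE 32		/* stands in for TCP_SOCK_POOL_SIZE */

/* With take-from-the-front and refill-from-the-front semantics, the last
 * slot is emptied only once every other slot is, so it is a cheap proxy
 * for "this pool is completely exhausted".
 */
static int pool_is_empty(const int pool[POOL_SIZE])
{
	return pool[POOL_SIZE - 1] < 0;
}

The init-side branch never needs this: opening one more socket in the init
namespace is cheap, so it just falls back to tcp_conn_new_sock() directly.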
On Tue, 14 Feb 2023 10:48:18 +1100
David Gibson <david@gibson.dropbear.id.au> wrote:

> This is a bit of a diversion from what I'm notionally working on at the
> moment.  While thinking about what we'd need to use the IP_TRANSPARENT
> socket option to broaden the cases where we can "splice", I noticed some
> inelegancies in how we handle the pool of pre-opened sockets in the TCP
> code, and well, here we are.
> 
> This makes a number of cleanups to the handling of these pools.  Most
> notably, tcp_splice_connect() now has simpler semantics: it now always
> runs in the init namespace, and is always given a pre-opened socket
> (which could be in the guest ns).
> 
> Changes since v1:
>  * Rebased
>  * Improved wording of some commit messages
> 
> David Gibson (5):
>   tcp: Make a helper to refill each socket pool
>   tcp: Split init and ns cases for tcp_sock_refill()
>   tcp: Move socket pool declarations around
>   tcp: Split pool lookup from creating new sockets in tcp_conn_new_sock()
>   tcp: Improve handling of fallback if socket pool is empty on new splice

Applied too.

-- 
Stefano