To reduce latency, the TCP code maintains several pools of ready-to-connect TCP sockets. This patch series contains a number of improvements to error handling and reporting when we're unable to refill these pools, or unable to obtain a socket from these pools. David Gibson (6): treewide: Use sa_family_t for address family variables tcp: Don't stop refilling socket pool if we find a filled entry tcp: Stop on first error when refilling socket pools tcp, tcp_splice: Issue warnings if unable to refill socket pool tcp, tcp_splice: Helpers for getting sockets from the pools tcp: Don't store errnos in socket pool icmp.c | 6 ++--- icmp.h | 4 +-- inany.h | 3 ++- tcp.c | 73 ++++++++++++++++++++++++++++++++++++++++------------ tcp.h | 2 +- tcp_conn.h | 4 +-- tcp_splice.c | 71 +++++++++++++++++++++++++++++++------------------- udp.c | 2 +- udp.h | 2 +- util.c | 2 +- util.h | 2 +- 11 files changed, 114 insertions(+), 57 deletions(-) -- 2.43.2
Sometimes we use sa_family_t for variables and parameters containing a socket address family, other times we use a plain int. Since sa_family_t is what's actually used in struct sockaddr and friends, standardise on that. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- icmp.c | 6 +++--- icmp.h | 4 ++-- inany.h | 3 ++- tcp.c | 12 ++++++------ tcp.h | 2 +- tcp_conn.h | 2 +- tcp_splice.c | 2 +- udp.c | 2 +- udp.h | 2 +- util.c | 2 +- util.h | 2 +- 11 files changed, 20 insertions(+), 19 deletions(-) diff --git a/icmp.c b/icmp.c index 9434fc5a..faa38c81 100644 --- a/icmp.c +++ b/icmp.c @@ -62,7 +62,7 @@ static struct icmp_id_sock icmp_id_map[IP_VERSIONS][ICMP_NUM_IDS]; * @af: Address family (AF_INET or AF_INET6) * @ref: epoll reference */ -void icmp_sock_handler(const struct ctx *c, int af, union epoll_ref ref) +void icmp_sock_handler(const struct ctx *c, sa_family_t af, union epoll_ref ref) { struct icmp_id_sock *const id_sock = af == AF_INET ? &icmp_id_map[V4][ref.icmp.id] : &icmp_id_map[V6][ref.icmp.id]; @@ -156,7 +156,7 @@ static void icmp_ping_close(const struct ctx *c, struct icmp_id_sock *id_sock) * Return: Newly opened ping socket fd, or -1 on failure */ static int icmp_ping_new(const struct ctx *c, struct icmp_id_sock *id_sock, - int af, uint16_t id) + sa_family_t af, uint16_t id) { uint8_t proto = af == AF_INET ? IPPROTO_ICMP : IPPROTO_ICMPV6; const char *const pname = af == AF_INET ? 
"ICMP" : "ICMPv6"; @@ -209,7 +209,7 @@ cancel: * * Return: count of consumed packets (always 1, even if malformed) */ -int icmp_tap_handler(const struct ctx *c, uint8_t pif, int af, +int icmp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, const void *saddr, const void *daddr, const struct pool *p, const struct timespec *now) { diff --git a/icmp.h b/icmp.h index 00835971..263101db 100644 --- a/icmp.h +++ b/icmp.h @@ -10,8 +10,8 @@ struct ctx; -void icmp_sock_handler(const struct ctx *c, int af, union epoll_ref ref); -int icmp_tap_handler(const struct ctx *c, uint8_t pif, int af, +void icmp_sock_handler(const struct ctx *c, sa_family_t af, union epoll_ref ref); +int icmp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, const void *saddr, const void *daddr, const struct pool *p, const struct timespec *now); void icmp_timer(const struct ctx *c, const struct timespec *now); diff --git a/inany.h b/inany.h index 90e98f16..fe652ff7 100644 --- a/inany.h +++ b/inany.h @@ -60,7 +60,8 @@ static inline bool inany_equals(const union inany_addr *a, * @af: Address family of @addr * @addr: struct in_addr (IPv4) or struct in6_addr (IPv6) */ -static inline void inany_from_af(union inany_addr *aa, int af, const void *addr) +static inline void inany_from_af(union inany_addr *aa, + sa_family_t af, const void *addr) { if (af == AF_INET6) { aa->a6 = *((struct in6_addr *)addr); diff --git a/tcp.c b/tcp.c index 2ab443d5..74a3e310 100644 --- a/tcp.c +++ b/tcp.c @@ -1262,7 +1262,7 @@ static void tcp_hash_remove(const struct ctx *c, * Return: connection pointer, if found, -ENOENT otherwise */ static struct tcp_tap_conn *tcp_hash_lookup(const struct ctx *c, - int af, const void *faddr, + sa_family_t af, const void *faddr, in_port_t eport, in_port_t fport) { union inany_addr aany; @@ -1904,8 +1904,8 @@ static void tcp_bind_outbound(const struct ctx *c, int s, sa_family_t af) * @optlen: Bytes in options: caller MUST ensure available length * @now: Current timestamp */ 
-static void tcp_conn_from_tap(struct ctx *c, - int af, const void *saddr, const void *daddr, +static void tcp_conn_from_tap(struct ctx *c, sa_family_t af, + const void *saddr, const void *daddr, const struct tcphdr *th, const char *opts, size_t optlen, const struct timespec *now) { @@ -2479,7 +2479,7 @@ static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_tap_conn *conn, * * Return: count of consumed packets */ -int tcp_tap_handler(struct ctx *c, uint8_t pif, int af, +int tcp_tap_handler(struct ctx *c, uint8_t pif, sa_family_t af, const void *saddr, const void *daddr, const struct pool *p, int idx, const struct timespec *now) { @@ -2856,7 +2856,7 @@ void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events) * * Return: fd for the new listening socket, negative error code on failure */ -static int tcp_sock_init_af(const struct ctx *c, int af, in_port_t port, +static int tcp_sock_init_af(const struct ctx *c, sa_family_t af, in_port_t port, const void *addr, const char *ifname) { union tcp_listen_epoll_ref tref = { @@ -3008,7 +3008,7 @@ static int tcp_ns_socks_init(void *arg) * @pool: Pool of sockets to refill * @af: Address family to use */ -void tcp_sock_refill_pool(const struct ctx *c, int pool[], int af) +void tcp_sock_refill_pool(const struct ctx *c, int pool[], sa_family_t af) { int i; diff --git a/tcp.h b/tcp.h index b9f546d3..875006ed 100644 --- a/tcp.h +++ b/tcp.h @@ -14,7 +14,7 @@ void tcp_timer_handler(struct ctx *c, union epoll_ref ref); void tcp_listen_handler(struct ctx *c, union epoll_ref ref, const struct timespec *now); void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events); -int tcp_tap_handler(struct ctx *c, uint8_t pif, int af, +int tcp_tap_handler(struct ctx *c, uint8_t pif, sa_family_t af, const void *saddr, const void *daddr, const struct pool *p, int idx, const struct timespec *now); int tcp_sock_init(const struct ctx *c, sa_family_t af, const void *addr, diff --git a/tcp_conn.h b/tcp_conn.h index 
a5f5cfe0..20c7cb8b 100644 --- a/tcp_conn.h +++ b/tcp_conn.h @@ -160,7 +160,7 @@ bool tcp_splice_flow_defer(union flow *flow); void tcp_splice_timer(const struct ctx *c, union flow *flow); int tcp_conn_pool_sock(int pool[]); int tcp_conn_new_sock(const struct ctx *c, sa_family_t af); -void tcp_sock_refill_pool(const struct ctx *c, int pool[], int af); +void tcp_sock_refill_pool(const struct ctx *c, int pool[], sa_family_t af); void tcp_splice_refill(const struct ctx *c); #endif /* TCP_CONN_H */ diff --git a/tcp_splice.c b/tcp_splice.c index 26d32065..cc9745e8 100644 --- a/tcp_splice.c +++ b/tcp_splice.c @@ -399,7 +399,7 @@ static int tcp_splice_new(const struct ctx *c, struct tcp_splice_conn *conn, */ if (pif == PIF_SPLICE) { int *p = CONN_V6(conn) ? init_sock_pool6 : init_sock_pool4; - int af = CONN_V6(conn) ? AF_INET6 : AF_INET; + sa_family_t af = CONN_V6(conn) ? AF_INET6 : AF_INET; s = tcp_conn_pool_sock(p); if (s < 0) diff --git a/udp.c b/udp.c index 933f24b8..cce475ae 100644 --- a/udp.c +++ b/udp.c @@ -814,7 +814,7 @@ void udp_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events, * #syscalls sendmmsg */ int udp_tap_handler(struct ctx *c, uint8_t pif, - int af, const void *saddr, const void *daddr, + sa_family_t af, const void *saddr, const void *daddr, const struct pool *p, int idx, const struct timespec *now) { struct mmsghdr mm[UIO_MAXIOV]; diff --git a/udp.h b/udp.h index 087e4820..f3d5d6d2 100644 --- a/udp.h +++ b/udp.h @@ -11,7 +11,7 @@ void udp_portmap_clear(void); void udp_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events, const struct timespec *now); -int udp_tap_handler(struct ctx *c, uint8_t pif, int af, +int udp_tap_handler(struct ctx *c, uint8_t pif, sa_family_t af, const void *saddr, const void *daddr, const struct pool *p, int idx, const struct timespec *now); int udp_sock_init(const struct ctx *c, int ns, sa_family_t af, diff --git a/util.c b/util.c index 21b35ff9..8acce233 100644 --- a/util.c +++ b/util.c 
@@ -97,7 +97,7 @@ found: * * Return: newly created socket, negative error code on failure */ -int sock_l4(const struct ctx *c, int af, uint8_t proto, +int sock_l4(const struct ctx *c, sa_family_t af, uint8_t proto, const void *bind_addr, const char *ifname, uint16_t port, uint32_t data) { diff --git a/util.h b/util.h index d2320f8c..e0df26c6 100644 --- a/util.h +++ b/util.h @@ -212,7 +212,7 @@ struct ipv6_opt_hdr { __attribute__ ((weak)) int ffsl(long int i) { return __builtin_ffsl(i); } char *ipv6_l4hdr(const struct pool *p, int idx, size_t offset, uint8_t *proto, size_t *dlen); -int sock_l4(const struct ctx *c, int af, uint8_t proto, +int sock_l4(const struct ctx *c, sa_family_t af, uint8_t proto, const void *bind_addr, const char *ifname, uint16_t port, uint32_t data); void sock_probe_mem(struct ctx *c); -- 2.43.2
Currently tcp_sock_refill_pool() stops as soon as it finds an entry in the pool with a valid fd. This appears to make sense: we always use fds from the front of the pool, so if we find a filled one, the rest of the pool should be filled as well. However, that's not quite correct. If a previous refill hit errors trying to open new sockets, it could leave gaps between blocks of valid fds. We're going to add some changes that could make that more likely. So, for robustness, instead skip over the filled entry but still try to refill the rest of the array. We expect simply iterating over the pool to be of small cost compared to even a single system call, so this shouldn't have much impact. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- tcp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tcp.c b/tcp.c index 74a3e310..9eec9f3a 100644 --- a/tcp.c +++ b/tcp.c @@ -3014,7 +3014,7 @@ void tcp_sock_refill_pool(const struct ctx *c, int pool[], sa_family_t af) for (i = 0; i < TCP_SOCK_POOL_SIZE; i++) { if (pool[i] >= 0) - break; + continue; pool[i] = tcp_conn_new_sock(c, af); } -- 2.43.2
On Mon, 19 Feb 2024 18:56:47 +1100 David Gibson <david(a)gibson.dropbear.id.au> wrote:Currently tcp_sock_refill_pool() stops as soon as it finds an entry in the pool with a valid fd. This appears to makes sense: we always use fds from the front of the pool, so if we find a filled one, the rest of the pool should be filled as well. However, that's not quite correct. If a previous refill hit errors trying to open new sockets, it could leave gaps between blocks of valid fds. We're going to add some changes that could make that more likely.Uh oh, good catch, I didn't think of the possibility that with 3/6 we could otherwise stop in the middle of a block of "empty" slots, with filled slots occurring after them. -- Stefano
On Wed, Feb 21, 2024 at 10:08:49PM +0100, Stefano Brivio wrote:On Mon, 19 Feb 2024 18:56:47 +1100 David Gibson <david(a)gibson.dropbear.id.au> wrote:Right. The consequences aren't particularly dire if we get this wrong: we don't fill the pool as full as we should, but we should still have fds to work with. -- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibsonCurrently tcp_sock_refill_pool() stops as soon as it finds an entry in the pool with a valid fd. This appears to makes sense: we always use fds from the front of the pool, so if we find a filled one, the rest of the pool should be filled as well. However, that's not quite correct. If a previous refill hit errors trying to open new sockets, it could leave gaps between blocks of valid fds. We're going to add some changes that could make that more likely.Uh oh, good catch, I didn't think of the possibility that with 3/6 we could otherwise stop in the middle of a block of "empty" slots, with filled slots occurring after them.
Currently if we get an error opening a new socket while refilling a socket pool, we carry on to the next slot and try again. This isn't very useful, since by far the most likely cause of an error is some sort of resource exhaustion. Trying again will probably just hit the same error, and maybe even make things worse. So, instead stop on the first error while refilling the pool, making do with however many sockets we managed to open before the error. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- tcp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tcp.c b/tcp.c index 9eec9f3a..d49210bc 100644 --- a/tcp.c +++ b/tcp.c @@ -3016,7 +3016,8 @@ void tcp_sock_refill_pool(const struct ctx *c, int pool[], sa_family_t af) if (pool[i] >= 0) continue; - pool[i] = tcp_conn_new_sock(c, af); + if ((pool[i] = tcp_conn_new_sock(c, af)) < 0) + break; } } -- 2.43.2
Currently if tcp_sock_refill_pool() is unable to fill all the slots in the pool, it will silently exit. This might lead to a later attempt to get fds from the pool to fail at which point it will be harder to tell what originally went wrong. Instead add warnings if we're unable to refill any of the socket pools when requested. We have tcp_sock_refill_pool() return an error and report it in the callers, because those callers have more context allowing for a more useful message. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- tcp.c | 24 ++++++++++++++++++------ tcp_conn.h | 2 +- tcp_splice.c | 16 ++++++++++++---- 3 files changed, 31 insertions(+), 11 deletions(-) diff --git a/tcp.c b/tcp.c index d49210bc..ad56ffc3 100644 --- a/tcp.c +++ b/tcp.c @@ -3007,8 +3007,10 @@ static int tcp_ns_socks_init(void *arg) * @c: Execution context * @pool: Pool of sockets to refill * @af: Address family to use + * + * Return: 0 on success, -ve error code if there was at least one error */ -void tcp_sock_refill_pool(const struct ctx *c, int pool[], sa_family_t af) +int tcp_sock_refill_pool(const struct ctx *c, int pool[], sa_family_t af) { int i; @@ -3017,8 +3019,10 @@ void tcp_sock_refill_pool(const struct ctx *c, int pool[], sa_family_t af) continue; if ((pool[i] = tcp_conn_new_sock(c, af)) < 0) - break; + return pool[i]; } + + return 0; } /** @@ -3027,10 +3031,18 @@ void tcp_sock_refill_pool(const struct ctx *c, int pool[], sa_family_t af) */ static void tcp_sock_refill_init(const struct ctx *c) { - if (c->ifi4) - tcp_sock_refill_pool(c, init_sock_pool4, AF_INET); - if (c->ifi6) - tcp_sock_refill_pool(c, init_sock_pool6, AF_INET6); + if (c->ifi4) { + int rc = tcp_sock_refill_pool(c, init_sock_pool4, AF_INET); + if (rc < 0) + warn("TCP: Error refilling IPv4 host socket pool: %s", + strerror(-rc)); + } + if (c->ifi6) { + int rc = tcp_sock_refill_pool(c, init_sock_pool6, AF_INET6); + if (rc < 0) + warn("TCP: Error refilling IPv6 host socket pool: %s", + strerror(-rc)); + 
} } /** diff --git a/tcp_conn.h b/tcp_conn.h index 20c7cb8b..92d4807a 100644 --- a/tcp_conn.h +++ b/tcp_conn.h @@ -160,7 +160,7 @@ bool tcp_splice_flow_defer(union flow *flow); void tcp_splice_timer(const struct ctx *c, union flow *flow); int tcp_conn_pool_sock(int pool[]); int tcp_conn_new_sock(const struct ctx *c, sa_family_t af); -void tcp_sock_refill_pool(const struct ctx *c, int pool[], sa_family_t af); +int tcp_sock_refill_pool(const struct ctx *c, int pool[], sa_family_t af); void tcp_splice_refill(const struct ctx *c); #endif /* TCP_CONN_H */ diff --git a/tcp_splice.c b/tcp_splice.c index cc9745e8..ee68029b 100644 --- a/tcp_splice.c +++ b/tcp_splice.c @@ -710,10 +710,18 @@ static int tcp_sock_refill_ns(void *arg) ns_enter(c); - if (c->ifi4) - tcp_sock_refill_pool(c, ns_sock_pool4, AF_INET); - if (c->ifi6) - tcp_sock_refill_pool(c, ns_sock_pool6, AF_INET6); + if (c->ifi4) { + int rc = tcp_sock_refill_pool(c, ns_sock_pool4, AF_INET); + if (rc < 0) + warn("TCP: Error refilling IPv4 ns socket pool: %s", + strerror(-rc)); + } + if (c->ifi6) { + int rc = tcp_sock_refill_pool(c, ns_sock_pool6, AF_INET6); + if (rc < 0) + warn("TCP: Error refilling IPv6 ns socket pool: %s", + strerror(-rc)); + } return 0; } -- 2.43.2
On Mon, 19 Feb 2024 18:56:49 +1100 David Gibson <david(a)gibson.dropbear.id.au> wrote:Currently if tcp_sock_refill_pool() is unable to fill all the slots in the pool, it will silently exit. This might lead to a later attempt to get fds from the pool to fail at which point it will be harder to tell what originally went wrong. Instead add warnings if we're unable to refill any of the socket pools when requested. We have tcp_sock_refill_pool() return an error and report it in the callers, because those callers have more context allowing for a more useful message. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- tcp.c | 24 ++++++++++++++++++------ tcp_conn.h | 2 +- tcp_splice.c | 16 ++++++++++++---- 3 files changed, 31 insertions(+), 11 deletions(-) diff --git a/tcp.c b/tcp.c index d49210bc..ad56ffc3 100644 --- a/tcp.c +++ b/tcp.c @@ -3007,8 +3007,10 @@ static int tcp_ns_socks_init(void *arg) * @c: Execution context * @pool: Pool of sockets to refill * @af: Address family to use + * + * Return: 0 on success, -ve error code if there was at least one errorIs -ve an abbreviation for something or just a typo? It sounds like the voltage of the emitter in a BJT transistor. Must be a typo, the rest of the patch looks good to me, I can also fix this up while applying. -- Stefano
On Wed, Feb 21, 2024 at 10:09:10PM +0100, Stefano Brivio wrote:On Mon, 19 Feb 2024 18:56:49 +1100 David Gibson <david(a)gibson.dropbear.id.au> wrote:Oh, it's supposed to be an abbreviation for "negative".Currently if tcp_sock_refill_pool() is unable to fill all the slots in the pool, it will silently exit. This might lead to a later attempt to get fds from the pool to fail at which point it will be harder to tell what originally went wrong. Instead add warnings if we're unable to refill any of the socket pools when requested. We have tcp_sock_refill_pool() return an error and report it in the callers, because those callers have more context allowing for a more useful message. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- tcp.c | 24 ++++++++++++++++++------ tcp_conn.h | 2 +- tcp_splice.c | 16 ++++++++++++---- 3 files changed, 31 insertions(+), 11 deletions(-) diff --git a/tcp.c b/tcp.c index d49210bc..ad56ffc3 100644 --- a/tcp.c +++ b/tcp.c @@ -3007,8 +3007,10 @@ static int tcp_ns_socks_init(void *arg) * @c: Execution context * @pool: Pool of sockets to refill * @af: Address family to use + * + * Return: 0 on success, -ve error code if there was at least one errorIs -ve an abbreviation for something or just a typo? It sounds like the voltage of the emitter in a BJT transistor.Must be a typo, the rest of the patch looks good to me, I can also fix this up while applying.Go ahead, please. -- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson
On Thu, 22 Feb 2024 08:44:48 +1100 David Gibson <david(a)gibson.dropbear.id.au> wrote:On Wed, Feb 21, 2024 at 10:09:10PM +0100, Stefano Brivio wrote:Funny, I just found https://en.wiktionary.org/wiki/-ve#English, but I never used "-ve" like that. Well, for consistency, I would change this to "negative" anyway. -- StefanoOn Mon, 19 Feb 2024 18:56:49 +1100 David Gibson <david(a)gibson.dropbear.id.au> wrote:Oh, it's supposed to be an abbreviation for "negative".Currently if tcp_sock_refill_pool() is unable to fill all the slots in the pool, it will silently exit. This might lead to a later attempt to get fds from the pool to fail at which point it will be harder to tell what originally went wrong. Instead add warnings if we're unable to refill any of the socket pools when requested. We have tcp_sock_refill_pool() return an error and report it in the callers, because those callers have more context allowing for a more useful message. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- tcp.c | 24 ++++++++++++++++++------ tcp_conn.h | 2 +- tcp_splice.c | 16 ++++++++++++---- 3 files changed, 31 insertions(+), 11 deletions(-) diff --git a/tcp.c b/tcp.c index d49210bc..ad56ffc3 100644 --- a/tcp.c +++ b/tcp.c @@ -3007,8 +3007,10 @@ static int tcp_ns_socks_init(void *arg) * @c: Execution context * @pool: Pool of sockets to refill * @af: Address family to use + * + * Return: 0 on success, -ve error code if there was at least one errorIs -ve an abbreviation for something or just a typo? It sounds like the voltage of the emitter in a BJT transistor.
We maintain pools of ready-to-connect sockets in both the original and (for pasta) guest namespace to reduce latency when starting new TCP connections. If we exhaust those pools we have to take a higher latency path to get a new socket. Currently we open-code that fallback in the places we need it. To improve clarity encapsulate that into helper functions. While we're at it, give those helpers clearer error reporting. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- tcp.c | 34 +++++++++++++++++++++++++++----- tcp_conn.h | 2 +- tcp_splice.c | 55 ++++++++++++++++++++++++++++++---------------------- 3 files changed, 62 insertions(+), 29 deletions(-) diff --git a/tcp.c b/tcp.c index ad56ffc3..34e32641 100644 --- a/tcp.c +++ b/tcp.c @@ -1792,7 +1792,7 @@ int tcp_conn_pool_sock(int pool[]) * * Return: socket number on success, negative code if socket creation failed */ -int tcp_conn_new_sock(const struct ctx *c, sa_family_t af) +static int tcp_conn_new_sock(const struct ctx *c, sa_family_t af) { int s; @@ -1811,6 +1811,32 @@ int tcp_conn_new_sock(const struct ctx *c, sa_family_t af) return s; } +/** + * tcp_conn_sock() - Obtain a connectable socket in the host/init namespace + * @c: Execution context + * @af: Address family (AF_INET or AF_INET6) + * + * Return: Socket fd on success, -errno on failure + */ +int tcp_conn_sock(const struct ctx *c, sa_family_t af) +{ + int *pool = af == AF_INET6 ? init_sock_pool6 : init_sock_pool4; + int s; + + if ((s = tcp_conn_pool_sock(pool)) >= 0) + return s; + + /* If the pool is empty we just open a new one without refilling the + * pool to keep latency down. 
+ */ + if ((s = tcp_conn_new_sock(c, af)) >= 0) + return s; + + err("TCP: Unable to open socket for new connection: %s", + strerror(-s)); + return -1; +} + /** * tcp_conn_tap_mss() - Get MSS value advertised by tap/guest * @conn: Connection pointer @@ -1909,7 +1935,6 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af, const struct tcphdr *th, const char *opts, size_t optlen, const struct timespec *now) { - int *pool = af == AF_INET6 ? init_sock_pool6 : init_sock_pool4; struct sockaddr_in addr4 = { .sin_family = AF_INET, .sin_port = th->dest, @@ -1931,9 +1956,8 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af, if (!(flow = flow_alloc())) return; - if ((s = tcp_conn_pool_sock(pool)) < 0) - if ((s = tcp_conn_new_sock(c, af)) < 0) - goto cancel; + if ((s = tcp_conn_sock(c, af)) < 0) + goto cancel; if (!c->no_map_gw) { if (af == AF_INET && IN4_ARE_ADDR_EQUAL(daddr, &c->ip4.gw)) diff --git a/tcp_conn.h b/tcp_conn.h index 92d4807a..d280b222 100644 --- a/tcp_conn.h +++ b/tcp_conn.h @@ -159,7 +159,7 @@ bool tcp_flow_defer(union flow *flow); bool tcp_splice_flow_defer(union flow *flow); void tcp_splice_timer(const struct ctx *c, union flow *flow); int tcp_conn_pool_sock(int pool[]); -int tcp_conn_new_sock(const struct ctx *c, sa_family_t af); +int tcp_conn_sock(const struct ctx *c, sa_family_t af); int tcp_sock_refill_pool(const struct ctx *c, int pool[], sa_family_t af); void tcp_splice_refill(const struct ctx *c); diff --git a/tcp_splice.c b/tcp_splice.c index ee68029b..5b38a82d 100644 --- a/tcp_splice.c +++ b/tcp_splice.c @@ -376,6 +376,34 @@ static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn, return 0; } +/** + * tcp_conn_sock_ns() - Obtain a connectable socket in the namespace + * @c: Execution context + * @af: Address family (AF_INET or AF_INET6) + * + * Return: Socket fd in the namespace on success, -errno on failure + */ +static int tcp_conn_sock_ns(const struct ctx *c, sa_family_t af) +{ + int *p = af == 
AF_INET6 ? ns_sock_pool6 : ns_sock_pool4; + int s; + + if ((s = tcp_conn_pool_sock(p)) >= 0) + return s; + + /* If the pool is empty we have to incur the latency of entering the ns. + * Therefore, we might as well refill the whole pool while we're at it. + * This differs from tcp_conn_sock(). + */ + NS_CALL(tcp_sock_refill_ns, c); + + if ((s = tcp_conn_pool_sock(p)) >= 0) + return s; + + err("TCP: No available ns sockets for new connection"); + return -1; +} + /** * tcp_splice_new() - Handle new spliced connection * @c: Execution context @@ -388,38 +416,19 @@ static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn, static int tcp_splice_new(const struct ctx *c, struct tcp_splice_conn *conn, in_port_t port, uint8_t pif) { + sa_family_t af = CONN_V6(conn) ? AF_INET6 : AF_INET; int s = -1; - /* If the pool is empty we take slightly different approaches - * for init or ns sockets. For init sockets we just open a - * new one without refilling the pool to keep latency down. - * For ns sockets, we're going to incur the latency of - * entering the ns anyway, so we might as well refill the - * pool. - */ if (pif == PIF_SPLICE) { - int *p = CONN_V6(conn) ? init_sock_pool6 : init_sock_pool4; - sa_family_t af = CONN_V6(conn) ? AF_INET6 : AF_INET; - - s = tcp_conn_pool_sock(p); - if (s < 0) - s = tcp_conn_new_sock(c, af); + s = tcp_conn_sock(c, af); } else { - int *p = CONN_V6(conn) ? ns_sock_pool6 : ns_sock_pool4; - ASSERT(pif == PIF_HOST); - /* If pool is empty, refill it first */ - if (p[TCP_SOCK_POOL_SIZE-1] < 0) - NS_CALL(tcp_sock_refill_ns, c); - - s = tcp_conn_pool_sock(p); + s = tcp_conn_sock_ns(c, af); } - if (s < 0) { - warn("Couldn't open connectable socket for splice (%d)", s); + if (s < 0) return s; - } return tcp_splice_connect(c, conn, s, port); } -- 2.43.2
If tcp_sock_refill_pool() gets an error opening new sockets, it stores the negative errno of that error in the socket pool. This isn't especially useful: * It's inconsistent with the initial state of the pool (all -1) * It's inconsistent with the state of an entry that was valid and was then consumed (also -1) * By the time we did anything with this error code, it's now far removed from the situation in which the error occurred, making it difficult to report usefully We now have error reporting closer to when failures happen on the refill paths, so just leave a pool slot we can't fill as -1. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- tcp.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tcp.c b/tcp.c index 34e32641..27865691 100644 --- a/tcp.c +++ b/tcp.c @@ -3039,11 +3039,13 @@ int tcp_sock_refill_pool(const struct ctx *c, int pool[], sa_family_t af) int i; for (i = 0; i < TCP_SOCK_POOL_SIZE; i++) { + int fd; if (pool[i] >= 0) continue; - if ((pool[i] = tcp_conn_new_sock(c, af)) < 0) - return pool[i]; + if ((fd = tcp_conn_new_sock(c, af)) < 0) + return fd; + pool[i] = fd; } return 0; -- 2.43.2
On Mon, 19 Feb 2024 18:56:51 +1100 David Gibson <david(a)gibson.dropbear.id.au> wrote:If tcp_sock_refill_pool() gets an error opening new sockets, it stores the negative errno of that error in the socket pool. This isn't especially useful: * It's inconsistent with the initial state of the pool (all -1) * It's inconsistent with the state of an entry that was valid and was then consumed (also -1) * By the time we did anything with this error code, it's now far removed from the situation in which the error occurred, making it difficult to report usefully We now have error reporting closer to when failures happen on the refill paths, so just leave a pool slot we can't fill as -1. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- tcp.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tcp.c b/tcp.c index 34e32641..27865691 100644 --- a/tcp.c +++ b/tcp.c @@ -3039,11 +3039,13 @@ int tcp_sock_refill_pool(const struct ctx *c, int pool[], sa_family_t af) int i; for (i = 0; i < TCP_SOCK_POOL_SIZE; i++) { + int fd;Usual newline between declaration and code. Rest of the series looks good to me, same as 4/6, if you agree I can fix it up, or respin, you choose. -- Stefano
On Wed, Feb 21, 2024 at 10:09:34PM +0100, Stefano Brivio wrote:On Mon, 19 Feb 2024 18:56:51 +1100 David Gibson <david(a)gibson.dropbear.id.au> wrote:Oops. Go ahead and fix on merge, please. -- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibsonIf tcp_sock_refill_pool() gets an error opening new sockets, it stores the negative errno of that error in the socket pool. This isn't especially useful: * It's inconsistent with the initial state of the pool (all -1) * It's inconsistent with the state of an entry that was valid and was then consumed (also -1) * By the time we did anything with this error code, it's now far removed from the situation in which the error occurred, making it difficult to report usefully We now have error reporting closer to when failures happen on the refill paths, so just leave a pool slot we can't fill as -1. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- tcp.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tcp.c b/tcp.c index 34e32641..27865691 100644 --- a/tcp.c +++ b/tcp.c @@ -3039,11 +3039,13 @@ int tcp_sock_refill_pool(const struct ctx *c, int pool[], sa_family_t af) int i; for (i = 0; i < TCP_SOCK_POOL_SIZE; i++) { + int fd;Usual newline between declaration and code. Rest of the series looks good to me, same as 4/6, if you agree I can fix it up, or respin, you choose.
On Mon, 19 Feb 2024 18:56:45 +1100 David Gibson <david(a)gibson.dropbear.id.au> wrote:To reduce latency, the TCP code maintains several pools of ready to connect TCP sockets. This patch series contains a number of improvements to improve error handling and reporting when we're unable to refill these pools, or unable to obtain a socket from these pools. David Gibson (6): treewide: Use sa_family_t for address family variables tcp: Don't stop refilling socket pool if we find a filled entry tcp: Stop on first error when refilling socket pools tcp, tcp_splice: Issue warnings if unable to refill socket pool tcp, tcp_splice: Helpers for getting sockets from the pools tcp: Don't store errnos in socket poolApplied (with minor changes as mentioned for 4/6 and 6/6). -- Stefano