Here's another batch of cleanups and tweaks in preparation for the flow table. This set focuses on improved helpers for handling addresses, particularly in the TCP splice path. David Gibson (16): treewide: Use sa_family_t for address family variables tcp, udp: Don't precompute port remappings in epoll references flow: Add helper to determine a flow's protocol tcp_splice: Simplify clean up logic inany: Helper to test for IPv4 or IPv6 loopback address tcp, tcp_splice: Helpers for getting sockets from the pools tcp_splice: More specific variable names in new splice path tcp_splice: Fix incorrect parameter comment for tcp_splice_connect() tcp_splice: Merge tcp_splice_new() into its caller tcp_splice: Improve error reporting on connect path inany: Add inany_ntop() helper tcp_splice: Improve logic deciding when to splice util: Provide global constants for IPv4 loopback and unspecified address inany: Introduce union sockaddr_inany tcp, tcp_splice: Better construction of IPv4 or IPv6 sockaddrs inany: Extend inany_from_af to easily set unspecified addresses Makefile | 6 +- flow.c | 7 ++ flow.h | 4 + icmp.c | 24 +++--- icmp.h | 4 +- inany.c | 34 ++++++++ inany.h | 96 ++++++++++++++++++---- tcp.c | 106 ++++++++++++------------ tcp.h | 4 +- tcp_conn.h | 4 +- tcp_splice.c | 223 ++++++++++++++++++++++++++------------------------- tcp_splice.h | 5 +- udp.c | 21 ++--- udp.h | 2 +- util.c | 5 +- util.h | 4 +- 16 files changed, 335 insertions(+), 214 deletions(-) create mode 100644 inany.c -- 2.43.0
Sometimes we use sa_family_t for variables and parameters containing a socket address family, other times we use a plain int. Since sa_family_t is what's actually used in struct sockaddr and friends, standardise on that. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> # Conflicts: # tcp_conn.h --- icmp.c | 6 +++--- icmp.h | 4 ++-- inany.h | 3 ++- tcp.c | 12 ++++++------ tcp.h | 2 +- tcp_conn.h | 2 +- tcp_splice.c | 2 +- udp.c | 2 +- udp.h | 2 +- util.c | 2 +- util.h | 2 +- 11 files changed, 20 insertions(+), 19 deletions(-) diff --git a/icmp.c b/icmp.c index 9434fc5a..faa38c81 100644 --- a/icmp.c +++ b/icmp.c @@ -62,7 +62,7 @@ static struct icmp_id_sock icmp_id_map[IP_VERSIONS][ICMP_NUM_IDS]; * @af: Address family (AF_INET or AF_INET6) * @ref: epoll reference */ -void icmp_sock_handler(const struct ctx *c, int af, union epoll_ref ref) +void icmp_sock_handler(const struct ctx *c, sa_family_t af, union epoll_ref ref) { struct icmp_id_sock *const id_sock = af == AF_INET ? &icmp_id_map[V4][ref.icmp.id] : &icmp_id_map[V6][ref.icmp.id]; @@ -156,7 +156,7 @@ static void icmp_ping_close(const struct ctx *c, struct icmp_id_sock *id_sock) * Return: Newly opened ping socket fd, or -1 on failure */ static int icmp_ping_new(const struct ctx *c, struct icmp_id_sock *id_sock, - int af, uint16_t id) + sa_family_t af, uint16_t id) { uint8_t proto = af == AF_INET ? IPPROTO_ICMP : IPPROTO_ICMPV6; const char *const pname = af == AF_INET ? "ICMP" : "ICMPv6"; @@ -209,7 +209,7 @@ cancel: * * Return: count of consumed packets (always 1, even if malformed) */ -int icmp_tap_handler(const struct ctx *c, uint8_t pif, int af, +int icmp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, const void *saddr, const void *daddr, const struct pool *p, const struct timespec *now) { diff --git a/icmp.h b/icmp.h index 00835971..263101db 100644 --- a/icmp.h +++ b/icmp.h @@ -10,8 +10,8 @@ struct ctx; -void icmp_sock_handler(const struct ctx *c, int af, union epoll_ref ref); -int icmp_tap_handler(const struct ctx *c, uint8_t pif, int af, +void icmp_sock_handler(const struct ctx *c, sa_family_t af, union epoll_ref ref); +int icmp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, const void *saddr, const void *daddr, const struct pool *p, const struct timespec *now); void icmp_timer(const struct ctx *c, const struct timespec *now); diff --git a/inany.h b/inany.h index 90e98f16..fe652ff7 100644 --- a/inany.h +++ b/inany.h @@ -60,7 +60,8 @@ static inline bool inany_equals(const union inany_addr *a, * @af: Address family of @addr * @addr: struct in_addr (IPv4) or struct in6_addr (IPv6) */ -static inline void inany_from_af(union inany_addr *aa, int af, const void *addr) +static inline void inany_from_af(union inany_addr *aa, + sa_family_t af, const void *addr) { if (af == AF_INET6) { aa->a6 = *((struct in6_addr *)addr); diff --git a/tcp.c b/tcp.c index 905d26f6..fdf56713 100644 --- a/tcp.c +++ b/tcp.c @@ -1262,7 +1262,7 @@ static void tcp_hash_remove(const struct ctx *c, * Return: connection pointer, if found, -ENOENT otherwise */ static struct tcp_tap_conn *tcp_hash_lookup(const struct ctx *c, - int af, const void *faddr, + sa_family_t af, const void *faddr, in_port_t eport, in_port_t fport) { union inany_addr aany; @@ -1904,8 +1904,8 @@ static void tcp_bind_outbound(const struct ctx *c, int s, sa_family_t af) * @optlen: Bytes in options: caller MUST ensure available length * @now: Current timestamp */ -static void tcp_conn_from_tap(struct ctx *c, - int af, const void *saddr, const void *daddr, +static void tcp_conn_from_tap(struct ctx *c, sa_family_t af, + const void *saddr, const void *daddr, const struct tcphdr *th, const char *opts, size_t optlen, const struct timespec *now) { @@ -2480,7 +2480,7 @@ static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_tap_conn *conn, * * Return: count of consumed packets */ -int tcp_tap_handler(struct ctx *c, uint8_t pif, int af, +int tcp_tap_handler(struct ctx *c, uint8_t pif, sa_family_t af, const void *saddr, const void *daddr, const struct pool *p, int idx, const struct timespec *now) { @@ -2857,7 +2857,7 @@ void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events) * * Return: fd for the new listening socket, negative error code on failure */ -static int tcp_sock_init_af(const struct ctx *c, int af, in_port_t port, +static int tcp_sock_init_af(const struct ctx *c, sa_family_t af, in_port_t port, const void *addr, const char *ifname) { union tcp_listen_epoll_ref tref = { @@ -3009,7 +3009,7 @@ static int tcp_ns_socks_init(void *arg) * @pool: Pool of sockets to refill * @af: Address family to use */ -void tcp_sock_refill_pool(const struct ctx *c, int pool[], int af) +void tcp_sock_refill_pool(const struct ctx *c, int pool[], sa_family_t af) { int i; diff --git a/tcp.h b/tcp.h index b9f546d3..875006ed 100644 --- a/tcp.h +++ b/tcp.h @@ -14,7 +14,7 @@ void tcp_timer_handler(struct ctx *c, union epoll_ref ref); void tcp_listen_handler(struct ctx *c, union epoll_ref ref, const struct timespec *now); void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events); -int tcp_tap_handler(struct ctx *c, uint8_t pif, int af, +int tcp_tap_handler(struct ctx *c, uint8_t pif, sa_family_t af, const void *saddr, const void *daddr, const struct pool *p, int idx, const struct timespec *now); int tcp_sock_init(const struct ctx *c, sa_family_t af, const void *addr, diff --git a/tcp_conn.h b/tcp_conn.h index a5f5cfe0..20c7cb8b 100644 --- a/tcp_conn.h +++ b/tcp_conn.h @@ -160,7 +160,7 @@ bool tcp_splice_flow_defer(union flow *flow); void tcp_splice_timer(const struct ctx *c, union flow *flow); int tcp_conn_pool_sock(int pool[]); int tcp_conn_new_sock(const struct ctx *c, sa_family_t af); -void tcp_sock_refill_pool(const struct ctx *c, int pool[], int af); +void tcp_sock_refill_pool(const struct ctx *c, int pool[], sa_family_t af); void tcp_splice_refill(const struct ctx *c); #endif /* TCP_CONN_H */ diff --git a/tcp_splice.c b/tcp_splice.c index 26d32065..cc9745e8 100644 --- a/tcp_splice.c +++ b/tcp_splice.c @@ -399,7 +399,7 @@ static int tcp_splice_new(const struct ctx *c, struct tcp_splice_conn *conn, */ if (pif == PIF_SPLICE) { int *p = CONN_V6(conn) ? init_sock_pool6 : init_sock_pool4; - int af = CONN_V6(conn) ? AF_INET6 : AF_INET; + sa_family_t af = CONN_V6(conn) ? AF_INET6 : AF_INET; s = tcp_conn_pool_sock(p); if (s < 0) diff --git a/udp.c b/udp.c index b5b8f8a7..c839e269 100644 --- a/udp.c +++ b/udp.c @@ -814,7 +814,7 @@ void udp_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events, * #syscalls sendmmsg */ int udp_tap_handler(struct ctx *c, uint8_t pif, - int af, const void *saddr, const void *daddr, + sa_family_t af, const void *saddr, const void *daddr, const struct pool *p, int idx, const struct timespec *now) { struct mmsghdr mm[UIO_MAXIOV]; diff --git a/udp.h b/udp.h index 087e4820..f3d5d6d2 100644 --- a/udp.h +++ b/udp.h @@ -11,7 +11,7 @@ void udp_portmap_clear(void); void udp_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events, const struct timespec *now); -int udp_tap_handler(struct ctx *c, uint8_t pif, int af, +int udp_tap_handler(struct ctx *c, uint8_t pif, sa_family_t af, const void *saddr, const void *daddr, const struct pool *p, int idx, const struct timespec *now); int udp_sock_init(const struct ctx *c, int ns, sa_family_t af, diff --git a/util.c b/util.c index 21b35ff9..8acce233 100644 --- a/util.c +++ b/util.c @@ -97,7 +97,7 @@ found: * * Return: newly created socket, negative error code on failure */ -int sock_l4(const struct ctx *c, int af, uint8_t proto, +int sock_l4(const struct ctx *c, sa_family_t af, uint8_t proto, const void *bind_addr, const char *ifname, uint16_t port, uint32_t data) { diff --git a/util.h b/util.h index d2320f8c..e0df26c6 100644 --- a/util.h +++ b/util.h @@ -212,7 +212,7 @@ struct ipv6_opt_hdr { __attribute__ ((weak)) int ffsl(long int i) { return __builtin_ffsl(i); } char *ipv6_l4hdr(const struct pool *p, int idx, size_t offset, uint8_t *proto, size_t *dlen); -int sock_l4(const struct ctx *c, int af, uint8_t proto, +int sock_l4(const struct ctx *c, sa_family_t af, uint8_t proto, const void *bind_addr, const char *ifname, uint16_t port, uint32_t data); void sock_probe_mem(struct ctx *c); -- 2.43.0
The epoll references for both TCP listening sockets and UDP sockets includes a port number. This gives the destination port that traffic to that socket will be sent to on the other side. That will usually be the same as the socket's bound port, but might not if the -t, -u, -T or -U options are given with different original and forwarded port numbers. As we move towards a more flexible forwarding model for passt, it's going to become possible for that destination port to vary depending on more things (for example the source or destination address). So, it will no longer make sense to have a fixed value for a listening socket. Change to simpler semantics where this field in the reference gives the bound port of the socket. We apply the translations to the correct destination port later on, when we're actually forwarding. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- tcp.c | 8 ++++---- tcp.h | 2 +- tcp_splice.c | 4 ++++ udp.c | 14 ++++++++------ 4 files changed, 17 insertions(+), 11 deletions(-) diff --git a/tcp.c b/tcp.c index fdf56713..cd833728 100644 --- a/tcp.c +++ b/tcp.c @@ -2676,7 +2676,7 @@ static void tcp_tap_conn_from_sock(struct ctx *c, conn_event(c, conn, SOCK_ACCEPTED); inany_from_sockaddr(&conn->faddr, &conn->fport, sa); - conn->eport = ref.port; + conn->eport = ref.port + c->tcp.fwd_in.delta[ref.port]; tcp_snat_inbound(c, &conn->faddr); @@ -2861,7 +2861,7 @@ static int tcp_sock_init_af(const struct ctx *c, sa_family_t af, in_port_t port, const void *addr, const char *ifname) { union tcp_listen_epoll_ref tref = { - .port = port + c->tcp.fwd_in.delta[port], + .port = port, .pif = PIF_HOST, }; int s; @@ -2923,7 +2923,7 @@ int tcp_sock_init(const struct ctx *c, sa_family_t af, const void *addr, static void tcp_ns_sock_init4(const struct ctx *c, in_port_t port) { union tcp_listen_epoll_ref tref = { - .port = port + c->tcp.fwd_out.delta[port], + .port = port, .pif = PIF_SPLICE, }; struct in_addr loopback = IN4ADDR_LOOPBACK_INIT; @@ -2949,7 +2949,7 @@ static void tcp_ns_sock_init4(const struct ctx *c, in_port_t port) static void tcp_ns_sock_init6(const struct ctx *c, in_port_t port) { union tcp_listen_epoll_ref tref = { - .port = port + c->tcp.fwd_out.delta[port], + .port = port, .pif = PIF_SPLICE, }; int s; diff --git a/tcp.h b/tcp.h index 875006ed..5e6756d4 100644 --- a/tcp.h +++ b/tcp.h @@ -37,7 +37,7 @@ union tcp_epoll_ref { /** * union tcp_listen_epoll_ref - epoll reference portion for TCP listening - * @port: Port number we're forwarding *to* (listening port plus delta) + * @port: Bound port number of the socket * @pif: pif in which the socket is listening * @u32: Opaque u32 value of reference */ diff --git a/tcp_splice.c b/tcp_splice.c index cc9745e8..b8d64eba 100644 --- a/tcp_splice.c +++ b/tcp_splice.c @@ -401,6 +401,8 @@ static int tcp_splice_new(const struct ctx *c, struct tcp_splice_conn *conn, int *p = CONN_V6(conn) ? init_sock_pool6 : init_sock_pool4; sa_family_t af = CONN_V6(conn) ? AF_INET6 : AF_INET; + port += c->tcp.fwd_out.delta[port]; + s = tcp_conn_pool_sock(p); if (s < 0) s = tcp_conn_new_sock(c, af); @@ -409,6 +411,8 @@ static int tcp_splice_new(const struct ctx *c, struct tcp_splice_conn *conn, ASSERT(pif == PIF_HOST); + port += c->tcp.fwd_in.delta[port]; + /* If pool is empty, refill it first */ if (p[TCP_SOCK_POOL_SIZE-1] < 0) NS_CALL(tcp_sock_refill_ns, c); diff --git a/udp.c b/udp.c index c839e269..02cb7889 100644 --- a/udp.c +++ b/udp.c @@ -762,6 +762,11 @@ void udp_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events, if (c->no_udp || !(events & EPOLLIN)) return; + if (ref.udp.pif == PIF_SPLICE) + dstport += c->udp.fwd_out.f.delta[dstport]; + else if (ref.udp.pif == PIF_HOST) + dstport += c->udp.fwd_in.f.delta[dstport]; + if (v6) { mmh_recv = udp6_l2_mh_sock; udp6_localname.sin6_port = htons(dstport); @@ -989,16 +994,13 @@ int udp_sock_init(const struct ctx *c, int ns, sa_family_t af, const void *addr, const char *ifname, in_port_t port) { union udp_epoll_ref uref = { .splice = (c->mode == MODE_PASTA), - .orig = true }; + .orig = true, .port = port }; int s, r4 = FD_REF_MAX + 1, r6 = FD_REF_MAX + 1; - if (ns) { + if (ns) uref.pif = PIF_SPLICE; - uref.port = (in_port_t)(port + c->udp.fwd_out.f.delta[port]); - } else { + else uref.pif = PIF_HOST; - uref.port = (in_port_t)(port + c->udp.fwd_in.f.delta[port]); - } if ((af == AF_INET || af == AF_UNSPEC) && c->ifi4) { uref.v6 = 0; -- 2.43.0
Each flow already has a type field. This implies the protocol the flow represents, but also has more information: we have two ways to represent TCP flows, "tap" and "spliced". In order to generalise some of the flow mechanics, we'll need to determine a flow's protocol in terms of the IP (L4) protocol number. Introduce a constant table and helper macro to derive this from the flow type. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- flow.c | 7 +++++++ flow.h | 4 ++++ 2 files changed, 11 insertions(+) diff --git a/flow.c b/flow.c index 5e94a7a9..beb9749c 100644 --- a/flow.c +++ b/flow.c @@ -25,6 +25,13 @@ const char *flow_type_str[] = { static_assert(ARRAY_SIZE(flow_type_str) == FLOW_NUM_TYPES, "flow_type_str[] doesn't match enum flow_type"); +const uint8_t flow_proto[] = { + [FLOW_TCP] = IPPROTO_TCP, + [FLOW_TCP_SPLICE] = IPPROTO_TCP, +}; +static_assert(ARRAY_SIZE(flow_proto) == FLOW_NUM_TYPES, + "flow_proto[] doesn't match enum flow_type"); + /* Global Flow Table */ /** diff --git a/flow.h b/flow.h index 48a0ab4b..e9b3ce3e 100644 --- a/flow.h +++ b/flow.h @@ -27,6 +27,10 @@ extern const char *flow_type_str[]; #define FLOW_TYPE(f) \ ((f)->type < FLOW_NUM_TYPES ? flow_type_str[(f)->type] : "?") +extern const uint8_t flow_proto[]; +#define FLOW_PROTO(f) \ + ((f)->type < FLOW_NUM_TYPES ? flow_proto[(f)->type] : 0) + /** * struct flow_common - Common fields for packet flows * @type: Type of packet flow -- 2.43.0
Currently tcp_splice_flow_defer() contains specific logic to determine if we're far enough initialised that we need to close pipes and/or sockets. This is potentially fragile if we change something about the order in which we do things. We can simplify this by initialising the pipe and socket fields to -1 very early, then close()ing them if and only if they're non negative. This lets us remove a special case cleanup if our connect() fails. This will already trigger a CLOSING event, and the socket fd in question is populated in the connection structure. Thus we can let the new cleanup logic handle it rather than requiring an explicit close(). Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- tcp_splice.c | 27 +++++++++++---------------- 1 file changed, 11 insertions(+), 16 deletions(-) diff --git a/tcp_splice.c b/tcp_splice.c index b8d64eba..76258e89 100644 --- a/tcp_splice.c +++ b/tcp_splice.c @@ -246,16 +246,14 @@ bool tcp_splice_flow_defer(union flow *flow) return false; for (side = 0; side < SIDES; side++) { - if (conn->events & SPLICE_ESTABLISHED) { - /* Flushing might need to block: don't recycle them. */ - if (conn->pipe[side][0] != -1) { - close(conn->pipe[side][0]); - close(conn->pipe[side][1]); - conn->pipe[side][0] = conn->pipe[side][1] = -1; - } + /* Flushing might need to block: don't recycle them. */ + if (conn->pipe[side][0] >= 0) { + close(conn->pipe[side][0]); + close(conn->pipe[side][1]); + conn->pipe[side][0] = conn->pipe[side][1] = -1; } - if (side == 0 || conn->events & SPLICE_CONNECT) { + if (conn->s[side] >= 0) { close(conn->s[side]); conn->s[side] = -1; } @@ -284,8 +282,6 @@ static int tcp_splice_connect_finish(const struct ctx *c, int i = 0; for (side = 0; side < SIDES; side++) { - conn->pipe[side][0] = conn->pipe[side][1] = -1; - for (; i < TCP_SPLICE_PIPE_POOL_SIZE; i++) { if (splice_pipe_pool[i][0] >= 0) { SWAP(conn->pipe[side][0], @@ -361,12 +357,8 @@ static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn, } if (connect(conn->s[1], sa, sl)) { - if (errno != EINPROGRESS) { - int ret = -errno; - - close(sock_conn); - return ret; - } + if (errno != EINPROGRESS) + return -errno; conn_event(c, conn, SPLICE_CONNECT); } else { conn_event(c, conn, SPLICE_ESTABLISHED); @@ -468,6 +460,9 @@ bool tcp_splice_conn_from_sock(const struct ctx *c, conn->f.type = FLOW_TCP_SPLICE; conn->s[0] = s; + conn->s[1] = -1; + conn->pipe[0][0] = conn->pipe[0][1] = -1; + conn->pipe[1][0] = conn->pipe[1][1] = -1; if (tcp_splice_new(c, conn, ref.port, ref.pif)) conn_flag(c, conn, CLOSING); -- 2.43.0
tcp_splice_conn_from_sock() needs to check if an inany is either the IPv6 loopback address (::) or an IPv4 loopback address (127.0.0.0/8). We may have other places that also want to check this in future, so simplify it with an inany_is_loopback() helper. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- inany.h | 13 +++++++++++++ tcp_splice.c | 14 +++----------- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/inany.h b/inany.h index fe652ff7..d758f7ba 100644 --- a/inany.h +++ b/inany.h @@ -55,6 +55,19 @@ static inline bool inany_equals(const union inany_addr *a, return IN6_ARE_ADDR_EQUAL(&a->a6, &b->a6); } +/** inany_is_loopback - Check if address is loopback + * @a: IPv[46] address + * + * Return: true if @a is either ::1 or in 127.0.0.1/8 + */ +static inline bool inany_is_loopback(const union inany_addr *a) +{ + const struct in_addr *v4; + + return IN6_IS_ADDR_LOOPBACK(&a->a6) || + ((v4 = inany_v4(a)) && IN4_IS_ADDR_LOOPBACK(v4)); +} + /** inany_from_af - Set IPv[46] address from IPv4 or IPv6 address * @aa: Pointer to store IPv[46] address * @af: Address family of @addr diff --git a/tcp_splice.c b/tcp_splice.c index 76258e89..fbce4879 100644 --- a/tcp_splice.c +++ b/tcp_splice.c @@ -436,24 +436,16 @@ bool tcp_splice_conn_from_sock(const struct ctx *c, struct tcp_splice_conn *conn, int s, const struct sockaddr *sa) { - const struct in_addr *a4; union inany_addr aany; in_port_t port; ASSERT(c->mode == MODE_PASTA); inany_from_sockaddr(&aany, &port, sa); - a4 = inany_v4(&aany); + if (!inany_is_loopback(&aany)) + return false; - if (a4) { - if (!IN4_IS_ADDR_LOOPBACK(a4)) - return false; - conn->flags = 0; - } else { - if (!IN6_IS_ADDR_LOOPBACK(&aany.a6)) - return false; - conn->flags = SPLICE_V6; - } + conn->flags = inany_v4(&aany) ? 0 : SPLICE_V6; if (setsockopt(s, SOL_TCP, TCP_QUICKACK, &((int){ 1 }), sizeof(int))) flow_trace(conn, "failed to set TCP_QUICKACK on %i", s); -- 2.43.0
We maintain pools of ready-to-connect sockets in both the original and (for pasta) guest namespace to reduce latency when starting new TCP connections. If we exhaust those pools we have to take a higher latency path to get a new socket. Currently we open-code that fallback in the places we need it. To improve clarity encapsulate that into helper functions. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- tcp.c | 29 ++++++++++++++++++++++++----- tcp_conn.h | 2 +- tcp_splice.c | 46 +++++++++++++++++++++++++--------------------- 3 files changed, 50 insertions(+), 27 deletions(-) diff --git a/tcp.c b/tcp.c index cd833728..8daefe99 100644 --- a/tcp.c +++ b/tcp.c @@ -1792,7 +1792,7 @@ int tcp_conn_pool_sock(int pool[]) * * Return: socket number on success, negative code if socket creation failed */ -int tcp_conn_new_sock(const struct ctx *c, sa_family_t af) +static int tcp_conn_new_sock(const struct ctx *c, sa_family_t af) { int s; @@ -1811,6 +1811,27 @@ int tcp_conn_new_sock(const struct ctx *c, sa_family_t af) return s; } +/** + * tcp_conn_sock() - Obtain a connectable socket in the host/init namespace + * @c: Execution context + * @af: Address family (AF_INET or AF_INET6) + * + * Return: Socket fd on success, -errno on failure + */ +int tcp_conn_sock(const struct ctx *c, sa_family_t af) +{ + int *pool = af == AF_INET6 ? init_sock_pool6 : init_sock_pool4; + int s; + + if ((s = tcp_conn_pool_sock(pool)) >= 0) + return s; + + /* If the pool is empty we just open a new one without refilling the + * pool to keep latency down. + */ + return tcp_conn_new_sock(c, af); +} + /** * tcp_conn_tap_mss() - Get MSS value advertised by tap/guest * @conn: Connection pointer @@ -1909,7 +1930,6 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af, const struct tcphdr *th, const char *opts, size_t optlen, const struct timespec *now) { - int *pool = af == AF_INET6 ? init_sock_pool6 : init_sock_pool4; struct sockaddr_in addr4 = { .sin_family = AF_INET, .sin_port = th->dest, @@ -1931,9 +1951,8 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af, if (!(flow = flow_alloc())) return; - if ((s = tcp_conn_pool_sock(pool)) < 0) - if ((s = tcp_conn_new_sock(c, af)) < 0) - goto cancel; + if ((s = tcp_conn_sock(c, af)) < 0) + goto cancel; if (!c->no_map_gw) { if (af == AF_INET && IN4_ARE_ADDR_EQUAL(daddr, &c->ip4.gw)) diff --git a/tcp_conn.h b/tcp_conn.h index 20c7cb8b..e55edafe 100644 --- a/tcp_conn.h +++ b/tcp_conn.h @@ -159,7 +159,7 @@ bool tcp_flow_defer(union flow *flow); bool tcp_splice_flow_defer(union flow *flow); void tcp_splice_timer(const struct ctx *c, union flow *flow); int tcp_conn_pool_sock(int pool[]); -int tcp_conn_new_sock(const struct ctx *c, sa_family_t af); +int tcp_conn_sock(const struct ctx *c, sa_family_t af); void tcp_sock_refill_pool(const struct ctx *c, int pool[], sa_family_t af); void tcp_splice_refill(const struct ctx *c); diff --git a/tcp_splice.c b/tcp_splice.c index fbce4879..16be0c59 100644 --- a/tcp_splice.c +++ b/tcp_splice.c @@ -90,7 +90,7 @@ static const char *tcp_splice_flag_str[] __attribute((__unused__)) = { }; /* Forward declaration */ -static int tcp_sock_refill_ns(void *arg); +static int tcp_conn_sock_ns(const struct ctx *c, sa_family_t af); /** * tcp_splice_conn_epoll_events() - epoll events masks for given state @@ -380,36 +380,19 @@ static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn, static int tcp_splice_new(const struct ctx *c, struct tcp_splice_conn *conn, in_port_t port, uint8_t pif) { + sa_family_t af = CONN_V6(conn) ? AF_INET6 : AF_INET; int s = -1; - /* If the pool is empty we take slightly different approaches - * for init or ns sockets. For init sockets we just open a - * new one without refilling the pool to keep latency down. - * For ns sockets, we're going to incur the latency of - * entering the ns anyway, so we might as well refill the - * pool. - */ if (pif == PIF_SPLICE) { - int *p = CONN_V6(conn) ? init_sock_pool6 : init_sock_pool4; - sa_family_t af = CONN_V6(conn) ? AF_INET6 : AF_INET; - port += c->tcp.fwd_out.delta[port]; - s = tcp_conn_pool_sock(p); - if (s < 0) - s = tcp_conn_new_sock(c, af); + s = tcp_conn_sock(c, af); } else { - int *p = CONN_V6(conn) ? ns_sock_pool6 : ns_sock_pool4; - ASSERT(pif == PIF_HOST); port += c->tcp.fwd_in.delta[port]; - /* If pool is empty, refill it first */ - if (p[TCP_SOCK_POOL_SIZE-1] < 0) - NS_CALL(tcp_sock_refill_ns, c); - - s = tcp_conn_pool_sock(p); + s = tcp_conn_sock_ns(c, af); } if (s < 0) { @@ -709,6 +692,27 @@ static int tcp_sock_refill_ns(void *arg) return 0; } +/** + * tcp_conn_sock_ns() - Obtain a connectable socket in the namespace + * @c: Execution context + * @af: Address family (AF_INET or AF_INET6) + * + * Return: Socket fd in the namespace on success, -errno on failure + */ +static int tcp_conn_sock_ns(const struct ctx *c, sa_family_t af) +{ + int *p = af == AF_INET6 ? ns_sock_pool6 : ns_sock_pool4; + + /* If the pool is empty we have to incur the latency of entering the ns. + * Therefore, we might as well refill the whole pool while we're at it, + * which differs from tcp_conn_sock(). + */ + if (p[TCP_SOCK_POOL_SIZE-1] < 0) + NS_CALL(tcp_sock_refill_ns, c); + + return tcp_conn_pool_sock(p); +} + /** * tcp_splice_refill() - Refill pools of resources needed for splicing * @c: Execution context -- 2.43.0
In tcp_splice_conn_from_sock(), the 'port' variable stores the source port of the connection on the originating side. In tcp_splice_new(), called directly from it, the 'port' parameter gives the _destination_ port of the originating connection and is then updated to the destination port of the connection on the other side. Similarly, in tcp_splice_conn_from_sock(), 's' is the fd of the accetped socket (on side 0), whereas in tcp_splice_new(), 's' is the fd of the connecting socket (side 1). I, for one, find having the same variable name with different meanings in such close proximity in the flow of control pretty confusing. Alter the names for greater specificity and clarity. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- tcp_splice.c | 40 ++++++++++++++++++++-------------------- tcp_splice.h | 2 +- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/tcp_splice.c b/tcp_splice.c index 16be0c59..bdabe411 100644 --- a/tcp_splice.c +++ b/tcp_splice.c @@ -378,29 +378,29 @@ static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn, * Return: return code from connect() */ static int tcp_splice_new(const struct ctx *c, struct tcp_splice_conn *conn, - in_port_t port, uint8_t pif) + in_port_t dstport, uint8_t pif) { sa_family_t af = CONN_V6(conn) ? AF_INET6 : AF_INET; - int s = -1; + int s1; if (pif == PIF_SPLICE) { - port += c->tcp.fwd_out.delta[port]; + dstport += c->tcp.fwd_out.delta[dstport]; - s = tcp_conn_sock(c, af); + s1 = tcp_conn_sock(c, af); } else { ASSERT(pif == PIF_HOST); - port += c->tcp.fwd_in.delta[port]; + dstport += c->tcp.fwd_in.delta[dstport]; - s = tcp_conn_sock_ns(c, af); + s1 = tcp_conn_sock_ns(c, af); } - if (s < 0) { - warn("Couldn't open connectable socket for splice (%d)", s); - return s; + if (s1 < 0) { + warn("Couldn't open connectable socket for splice (%d)", s1); + return s1; } - return tcp_splice_connect(c, conn, s, port); + return tcp_splice_connect(c, conn, s1, dstport); } /** @@ -408,7 +408,7 @@ static int tcp_splice_new(const struct ctx *c, struct tcp_splice_conn *conn, * @c: Execution context * @ref: epoll reference of listening socket * @conn: connection structure to initialize - * @s: Accepted socket + * @s0: Accepted (side 0) socket * @sa: Peer address of connection * * Return: true if able to create a spliced connection, false otherwise @@ -416,25 +416,25 @@ static int tcp_splice_new(const struct ctx *c, struct tcp_splice_conn *conn, */ bool tcp_splice_conn_from_sock(const struct ctx *c, union tcp_listen_epoll_ref ref, - struct tcp_splice_conn *conn, int s, + struct tcp_splice_conn *conn, int s0, const struct sockaddr *sa) { - union inany_addr aany; - in_port_t port; + union inany_addr src; + in_port_t srcport; ASSERT(c->mode == MODE_PASTA); - inany_from_sockaddr(&aany, &port, sa); - if (!inany_is_loopback(&aany)) + inany_from_sockaddr(&src, &srcport, sa); + if (!inany_is_loopback(&src)) return false; - conn->flags = inany_v4(&aany) ? 0 : SPLICE_V6; + conn->flags = inany_v4(&src) ? 0 : SPLICE_V6; - if (setsockopt(s, SOL_TCP, TCP_QUICKACK, &((int){ 1 }), sizeof(int))) - flow_trace(conn, "failed to set TCP_QUICKACK on %i", s); + if (setsockopt(s0, SOL_TCP, TCP_QUICKACK, &((int){ 1 }), sizeof(int))) + flow_trace(conn, "failed to set TCP_QUICKACK on %i", s0); conn->f.type = FLOW_TCP_SPLICE; - conn->s[0] = s; + conn->s[0] = s0; conn->s[1] = -1; conn->pipe[0][0] = conn->pipe[0][1] = -1; conn->pipe[1][0] = conn->pipe[1][1] = -1; diff --git a/tcp_splice.h b/tcp_splice.h index 18193e4e..d3600807 100644 --- a/tcp_splice.h +++ b/tcp_splice.h @@ -12,7 +12,7 @@ void tcp_splice_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events); bool tcp_splice_conn_from_sock(const struct ctx *c, union tcp_listen_epoll_ref ref, - struct tcp_splice_conn *conn, int s, + struct tcp_splice_conn *conn, int s0, const struct sockaddr *sa); void tcp_splice_init(struct ctx *c); -- 2.43.0
Both the name and description are wrong. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- tcp_splice.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tcp_splice.c b/tcp_splice.c index bdabe411..eee98a30 100644 --- a/tcp_splice.c +++ b/tcp_splice.c @@ -319,7 +319,7 @@ static int tcp_splice_connect_finish(const struct ctx *c, * tcp_splice_connect() - Create and connect socket for new spliced connection * @c: Execution context * @conn: Connection pointer - * @s: Accepted socket + * @sock_conn: Connectable socket * @port: Destination port, host order * * Return: 0 for connect() succeeded or in progress, negative value on error -- 2.43.0
The only caller of tcp_splice_new() is tcp_splice_conn_from_sock(). Both are quite short, and the division of responsibilities between the two isn't particularly obvious. Simplify by merging the former into the latter. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- tcp_splice.c | 60 ++++++++++++++++++++-------------------------------- 1 file changed, 23 insertions(+), 37 deletions(-) diff --git a/tcp_splice.c b/tcp_splice.c index eee98a30..30b4d58a 100644 --- a/tcp_splice.c +++ b/tcp_splice.c @@ -368,41 +368,6 @@ static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn, return 0; } -/** - * tcp_splice_new() - Handle new spliced connection - * @c: Execution context - * @conn: Connection pointer - * @port: Destination port, host order - * @pif: Originating pif of the splice - * - * Return: return code from connect() - */ -static int tcp_splice_new(const struct ctx *c, struct tcp_splice_conn *conn, - in_port_t dstport, uint8_t pif) -{ - sa_family_t af = CONN_V6(conn) ? AF_INET6 : AF_INET; - int s1; - - if (pif == PIF_SPLICE) { - dstport += c->tcp.fwd_out.delta[dstport]; - - s1 = tcp_conn_sock(c, af); - } else { - ASSERT(pif == PIF_HOST); - - dstport += c->tcp.fwd_in.delta[dstport]; - - s1 = tcp_conn_sock_ns(c, af); - } - - if (s1 < 0) { - warn("Couldn't open connectable socket for splice (%d)", s1); - return s1; - } - - return tcp_splice_connect(c, conn, s1, dstport); -} - /** * tcp_splice_conn_from_sock() - Attempt to init state for a spliced connection * @c: Execution context @@ -419,8 +384,10 @@ bool tcp_splice_conn_from_sock(const struct ctx *c, struct tcp_splice_conn *conn, int s0, const struct sockaddr *sa) { + in_port_t srcport, dstport = ref.port; union inany_addr src; - in_port_t srcport; + sa_family_t af; + int s1; ASSERT(c->mode == MODE_PASTA); @@ -429,6 +396,7 @@ bool tcp_splice_conn_from_sock(const struct ctx *c, return false; conn->flags = inany_v4(&src) ? 0 : SPLICE_V6; + af = CONN_V6(conn) ? AF_INET6 : AF_INET; if (setsockopt(s0, SOL_TCP, TCP_QUICKACK, &((int){ 1 }), sizeof(int))) flow_trace(conn, "failed to set TCP_QUICKACK on %i", s0); @@ -439,7 +407,25 @@ bool tcp_splice_conn_from_sock(const struct ctx *c, conn->pipe[0][0] = conn->pipe[0][1] = -1; conn->pipe[1][0] = conn->pipe[1][1] = -1; - if (tcp_splice_new(c, conn, ref.port, ref.pif)) + if (ref.pif == PIF_SPLICE) { + dstport += c->tcp.fwd_out.delta[dstport]; + + s1 = tcp_conn_sock(c, af); + } else { + ASSERT(ref.pif == PIF_HOST); + + dstport += c->tcp.fwd_in.delta[dstport]; + + s1 = tcp_conn_sock_ns(c, af); + } + + if (s1 < 0) { + warn("Couldn't open connectable socket for splice (%d)", s1); + conn_flag(c, conn, CLOSING); + return true; + } + + if (tcp_splice_connect(c, conn, s1, dstport)) conn_flag(c, conn, CLOSING); return true; -- 2.43.0
This makes a number of changes to improve error reporting while connecting a new spliced socket: * We use flow_err() and similar functions so all messages include info on which specific flow was affected * We use strerror() to interpret raw error values * We now report errors on connection (at "trace" level, since this would allow spamming the logs) * We also look up and report some details on EPOLLERR events, which can include connection errors, since we use a non-blocking connect(). Again we use "trace" level since this can spam the logs. * Use the 'goto fail' idiom in tcp_splice_conn_from_sock to combine some common handling Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- tcp_splice.c | 36 +++++++++++++++++++++++++++++------- 1 file changed, 29 insertions(+), 7 deletions(-) diff --git a/tcp_splice.c b/tcp_splice.c index 30b4d58a..abd698d4 100644 --- a/tcp_splice.c +++ b/tcp_splice.c @@ -387,7 +387,7 @@ bool tcp_splice_conn_from_sock(const struct ctx *c, in_port_t srcport, dstport = ref.port; union inany_addr src; sa_family_t af; - int s1; + int s1, rc; ASSERT(c->mode == MODE_PASTA); @@ -420,15 +420,25 @@ bool tcp_splice_conn_from_sock(const struct ctx *c, } if (s1 < 0) { - warn("Couldn't open connectable socket for splice (%d)", s1); - conn_flag(c, conn, CLOSING); - return true; + flow_err(conn, + "Couldn't open connectable socket for splice: %s", + strerror(-s1)); + goto fail; } - if (tcp_splice_connect(c, conn, s1, dstport)) - conn_flag(c, conn, CLOSING); + if ((rc = tcp_splice_connect(c, conn, s1, dstport))) { + flow_trace(conn, "Couldn't connect socket for splice: %s", + strerror(-rc)); + goto fail; + } return true; + +fail: + /* This is *supposed* to be a spliced connection, but something went + * wrong */ + conn_flag(c, conn, CLOSING); + return true; } /** @@ -452,8 +462,20 @@ void tcp_splice_sock_handler(struct ctx *c, union epoll_ref ref, if (conn->events == SPLICE_CLOSED) return; - if (events & EPOLLERR) + if (events & EPOLLERR) { + int err, rc; + socklen_t sl = sizeof(err); + + rc = getsockopt(ref.fd, SOL_SOCKET, SO_ERROR, &err, &sl); + if (rc) + flow_err(conn, "Error retrieving SO_ERROR: %s", + strerror(errno)); + else + flow_trace(conn, "Error event on socket: %s", + strerror(err)); + goto close; + } if (conn->events == SPLICE_CONNECT) { if (!(events & EPOLLOUT)) -- 2.43.0
Add this helper to format an inany into either IPv4 or IPv6 text format as appropriate. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- Makefile | 6 +++--- inany.c | 35 +++++++++++++++++++++++++++++++++++ inany.h | 4 ++++ 3 files changed, 42 insertions(+), 3 deletions(-) create mode 100644 inany.c diff --git a/Makefile b/Makefile index af4fa87e..1c709229 100644 --- a/Makefile +++ b/Makefile @@ -45,9 +45,9 @@ FLAGS += -DVERSION=\"$(VERSION)\" FLAGS += -DDUAL_STACK_SOCKETS=$(DUAL_STACK_SOCKETS) PASST_SRCS = arch.c arp.c checksum.c conf.c dhcp.c dhcpv6.c flow.c icmp.c \ - igmp.c isolation.c lineread.c log.c mld.c ndp.c netlink.c packet.c \ - passt.c pasta.c pcap.c pif.c port_fwd.c tap.c tcp.c tcp_splice.c udp.c \ - util.c + igmp.c inany.c isolation.c lineread.c log.c mld.c ndp.c netlink.c \ + packet.c passt.c pasta.c pcap.c pif.c port_fwd.c tap.c tcp.c \ + tcp_splice.c udp.c util.c QRAP_SRCS = qrap.c SRCS = $(PASST_SRCS) $(QRAP_SRCS) diff --git a/inany.c b/inany.c new file mode 100644 index 00000000..edf0b055 --- /dev/null +++ b/inany.c @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later + * Copyright Red Hat + * Author: David Gibson <david(a)gibson.dropbear.id.au> + * + * inany.c - Types and helpers for handling addresses which could be + * IPv6 or IPv4 (encoded as IPv4-mapped IPv6 addresses) + */ + +#include <stdlib.h> +#include <stdbool.h> +#include <assert.h> +#include <netinet/in.h> +#include <arpa/inet.h> + +#include "util.h" +#include "siphash.h" +#include "inany.h" + +/** inany_ntop - Convert an IPv[46] address to text format + * @src: IPv[46] address + * @dst: output buffer, minimum INANY_ADDRSTRLEN bytes + * @size: size of buffer at @dst + * + * Return: On success, a non-null pointer to @dst, NULL on failure + */ +/* cppcheck-suppress unusedFunction */ +const char *inany_ntop(const union inany_addr *src, char *dst, socklen_t size) +{ + const struct in_addr *v4 = inany_v4(src); + + if (v4) + return inet_ntop(AF_INET, v4, dst, size); + + return inet_ntop(AF_INET6, &src->a6, dst, size); +} diff --git a/inany.h b/inany.h index d758f7ba..d08d1f33 100644 --- a/inany.h +++ b/inany.h @@ -125,4 +125,8 @@ static inline void inany_siphash_feed(struct siphash_state *state, siphash_feed(state, (uint64_t)aa->u32[2] << 32 | aa->u32[3]); } +#define INANY_ADDRSTRLEN MAX(INET_ADDRSTRLEN, INET6_ADDRSTRLEN) + +const char *inany_ntop(const union inany_addr *src, char *dst, socklen_t size); + #endif /* INANY_H */ -- 2.43.0
This makes several tweaks to improve the logic which decides whether we're able to use the splice method for a new connection. * Rather than only calling tcp_splice_conn_from_sock() in pasta mode, we check for pasta mode within it, better localising the checks. * Previously if we got a connection from a non-loopback address we'd always fall back to the "tap" path, even if the connection was on a socket in the namespace. If we did get a non-loopback address on a namespace socket, something has gone wrong and the "tap" path certainly won't be able to handle it. Report the error and close, rather than passing it along to tap. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- inany.c | 1 - tcp.c | 3 +-- tcp_splice.c | 49 ++++++++++++++++++++++++++++++++----------------- 3 files changed, 33 insertions(+), 20 deletions(-) diff --git a/inany.c b/inany.c index edf0b055..eaf2755d 100644 --- a/inany.c +++ b/inany.c @@ -23,7 +23,6 @@ * * Return: On success, a non-null pointer to @dst, NULL on failure */ -/* cppcheck-suppress unusedFunction */ const char *inany_ntop(const union inany_addr *src, char *dst, socklen_t size) { const struct in_addr *v4 = inany_v4(src); diff --git a/tcp.c b/tcp.c index 8daefe99..052bf7cb 100644 --- a/tcp.c +++ b/tcp.c @@ -2733,8 +2733,7 @@ void tcp_listen_handler(struct ctx *c, union epoll_ref ref, if (s < 0) goto cancel; - if (c->mode == MODE_PASTA && - tcp_splice_conn_from_sock(c, ref.tcp_listen, &flow->tcp_splice, + if (tcp_splice_conn_from_sock(c, ref.tcp_listen, &flow->tcp_splice, s, (struct sockaddr *)&sa)) return; diff --git a/tcp_splice.c b/tcp_splice.c index abd698d4..3b438313 100644 --- a/tcp_splice.c +++ b/tcp_splice.c @@ -389,36 +389,51 @@ bool tcp_splice_conn_from_sock(const struct ctx *c, sa_family_t af; int s1, rc; - ASSERT(c->mode == MODE_PASTA); + if (c->mode != MODE_PASTA) + return false; inany_from_sockaddr(&src, &srcport, sa); - if (!inany_is_loopback(&src)) - return false; + af = inany_v4(&src) ? AF_INET : AF_INET6; - conn->flags = inany_v4(&src) ? 0 : SPLICE_V6; - af = CONN_V6(conn) ? AF_INET6 : AF_INET; + switch (ref.pif) { + case PIF_SPLICE: + if (!inany_is_loopback(&src)) { + char str[INANY_ADDRSTRLEN]; - if (setsockopt(s0, SOL_TCP, TCP_QUICKACK, &((int){ 1 }), sizeof(int))) - flow_trace(conn, "failed to set TCP_QUICKACK on %i", s0); + flow_err(conn, "Bad source address %s for splice, closing", + inany_ntop(&src, str, sizeof(str))); - conn->f.type = FLOW_TCP_SPLICE; - conn->s[0] = s0; - conn->s[1] = -1; - conn->pipe[0][0] = conn->pipe[0][1] = -1; - conn->pipe[1][0] = conn->pipe[1][1] = -1; + /* We *don't* want to fall back to tap */ + flow_alloc_cancel((union flow *)conn); + return true; + } - if (ref.pif == PIF_SPLICE) { dstport += c->tcp.fwd_out.delta[dstport]; - s1 = tcp_conn_sock(c, af); - } else { - ASSERT(ref.pif == PIF_HOST); + break; - dstport += c->tcp.fwd_in.delta[dstport]; + case PIF_HOST: + if (!inany_is_loopback(&src)) + return false; + dstport += c->tcp.fwd_in.delta[dstport]; s1 = tcp_conn_sock_ns(c, af); + break; + + default: + return false; } + conn->f.type = FLOW_TCP_SPLICE; + conn->s[0] = s0; + conn->s[1] = -1; + conn->pipe[0][0] = conn->pipe[0][1] = -1; + conn->pipe[1][0] = conn->pipe[1][1] = -1; + conn->flags = af == AF_INET ? 0 : SPLICE_V6; + + if (setsockopt(s0, SOL_TCP, TCP_QUICKACK, &((int){ 1 }), sizeof(int))) + flow_trace(conn, "failed to set TCP_QUICKACK on %i", s0); + if (s1 < 0) { flow_err(conn, "Couldn't open connectable socket for splice: %s", -- 2.43.0
For various reasons, the standard library doesn't provide IPv4 loopback and unspecified addresses as struct in_addr constants, the way it does in6addr_loopback and in6addr_any for IPv6. This lack means that in several places we initialise a local with a macro just so that we can take the address of an IPv4 loopback or unspecified address. Provide our own in4addr_any and in4addr_loopback global constants to make this a bit simpler. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- tcp.c | 4 ++-- udp.c | 5 ++--- util.c | 3 +++ util.h | 2 ++ 4 files changed, 9 insertions(+), 5 deletions(-) diff --git a/tcp.c b/tcp.c index 052bf7cb..bb914bff 100644 --- a/tcp.c +++ b/tcp.c @@ -2944,12 +2944,12 @@ static void tcp_ns_sock_init4(const struct ctx *c, in_port_t port) .port = port, .pif = PIF_SPLICE, }; - struct in_addr loopback = IN4ADDR_LOOPBACK_INIT; int s; ASSERT(c->mode == MODE_PASTA); - s = sock_l4(c, AF_INET, IPPROTO_TCP, &loopback, NULL, port, tref.u32); + s = sock_l4(c, AF_INET, IPPROTO_TCP, &in4addr_loopback, NULL, port, + tref.u32); if (s >= 0) tcp_sock_set_bufsize(c, s); else diff --git a/udp.c b/udp.c index 02cb7889..1104f374 100644 --- a/udp.c +++ b/udp.c @@ -1012,9 +1012,8 @@ int udp_sock_init(const struct ctx *c, int ns, sa_family_t af, udp_tap_map[V4][uref.port].sock = s < 0 ? -1 : s; udp_splice_init[V4][port].sock = s < 0 ? -1 : s; } else { - struct in_addr loopback = IN4ADDR_LOOPBACK_INIT; - - r4 = s = sock_l4(c, AF_INET, IPPROTO_UDP, &loopback, + r4 = s = sock_l4(c, AF_INET, IPPROTO_UDP, + &in4addr_loopback, ifname, port, uref.u32); udp_splice_ns[V4][port].sock = s < 0 ? -1 : s; } diff --git a/util.c b/util.c index 8acce233..f1f68a03 100644 --- a/util.c +++ b/util.c @@ -30,6 +30,9 @@ #include "packet.h" #include "log.h" +const struct in_addr in4addr_loopback = IN4ADDR_LOOPBACK_INIT; +const struct in_addr in4addr_any = IN4ADDR_ANY_INIT; + #define IPV6_NH_OPT(nh) \ ((nh) == 0 || (nh) == 43 || (nh) == 44 || (nh) == 50 || \ (nh) == 51 || (nh) == 60 || (nh) == 135 || (nh) == 139 || \ diff --git a/util.h b/util.h index e0df26c6..4a71904b 100644 --- a/util.h +++ b/util.h @@ -125,6 +125,8 @@ #define IN4ADDR_ANY_INIT \ { .s_addr = htonl_constant(INADDR_ANY) } +extern const struct in_addr in4addr_loopback; +extern const struct in_addr in4addr_any; #define NS_FN_STACK_SIZE (RLIMIT_STACK_VAL * 1024 / 8) int do_clone(int (*fn)(void *), char *stack_area, size_t stack_size, int flags, -- 2.43.0
There are a number of places where we want to handle either a sockaddr_in or a sockaddr_in6. In some of those we use a void *, which works ok and matches some standard library interfaces, but doesn't give a signature level hint that we're dealing with only sockaddr_in or sockaddr_in6, not (say) sockaddr_un or another type of socket address. Other places we use a sockaddr_storage, which also works, but has the same problem in addition to allocating more on the stack than we need to. Introduce union sockaddr_inany to explictly handle this case: it has variants for sockaddr_in and sockaddr_in6. Use it in a number of places where it's easy to do so. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- icmp.c | 18 ++++++------------ inany.h | 33 +++++++++++++++++++++------------ tcp.c | 11 +++++------ tcp_splice.c | 2 +- tcp_splice.h | 3 ++- 5 files changed, 35 insertions(+), 32 deletions(-) diff --git a/icmp.c b/icmp.c index faa38c81..fb2fcafc 100644 --- a/icmp.c +++ b/icmp.c @@ -36,6 +36,8 @@ #include "passt.h" #include "tap.h" #include "log.h" +#include "siphash.h" +#include "inany.h" #include "icmp.h" #define ICMP_ECHO_TIMEOUT 60 /* s, timeout for ICMP socket activity */ @@ -67,13 +69,9 @@ void icmp_sock_handler(const struct ctx *c, sa_family_t af, union epoll_ref ref) struct icmp_id_sock *const id_sock = af == AF_INET ? &icmp_id_map[V4][ref.icmp.id] : &icmp_id_map[V6][ref.icmp.id]; const char *const pname = af == AF_INET ? "ICMP" : "ICMPv6"; - char buf[USHRT_MAX]; - union { - struct sockaddr sa; - struct sockaddr_in sa4; - struct sockaddr_in6 sa6; - } sr; + union sockaddr_inany sr; socklen_t sl = sizeof(sr); + char buf[USHRT_MAX]; uint16_t seq; ssize_t n; @@ -86,7 +84,7 @@ void icmp_sock_handler(const struct ctx *c, sa_family_t af, union epoll_ref ref) pname, strerror(errno)); return; } - if (sr.sa.sa_family != af) + if (sr.sa_family != af) goto unexpected; if (af == AF_INET) { @@ -214,11 +212,7 @@ int icmp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, const struct pool *p, const struct timespec *now) { const char *const pname = af == AF_INET ? "ICMP" : "ICMPv6"; - union { - struct sockaddr sa; - struct sockaddr_in sa4; - struct sockaddr_in6 sa6; - } sa = { .sa.sa_family = af }; + union sockaddr_inany sa = { .sa_family = af }; const socklen_t sl = af == AF_INET ? sizeof(sa.sa4) : sizeof(sa.sa6); struct icmp_id_sock *id_sock; uint16_t id, seq; diff --git a/inany.h b/inany.h index d08d1f33..474e09d0 100644 --- a/inany.h +++ b/inany.h @@ -9,6 +9,8 @@ #ifndef INANY_H #define INANY_H +struct siphash_state; + /** union inany_addr - Represents either an IPv4 or IPv6 address * @a6: Address as an IPv6 address, may be IPv4-mapped * @v4mapped.zero: All zero-bits for an IPv4 address @@ -32,6 +34,19 @@ static_assert(sizeof(union inany_addr) == sizeof(struct in6_addr), static_assert(_Alignof(union inany_addr) == _Alignof(uint32_t), "union inany_addr has unexpected alignment"); +/** union sockaddr_inany - Either a sockaddr_in or a sockaddr_in6 + * @sa_family: Address family, AF_INET or AF_INET6 + * @sa: Plain struct sockaddr (useful to avoid casts) + * @sa4: IPv4 socket address, valid if sa_family == AF_INET + * @sa6: IPv6 socket address, valid if sa_family == AF_INET6 + */ +union sockaddr_inany { + sa_family_t sa_family; + struct sockaddr sa; + struct sockaddr_in sa4; + struct sockaddr_in6 sa6; +}; + /** inany_v4 - Extract IPv4 address, if present, from IPv[46] address * @addr: IPv4 or IPv6 address * @@ -91,23 +106,17 @@ static inline void inany_from_af(union inany_addr *aa, /** inany_from_sockaddr - Extract IPv[46] address and port number from sockaddr * @aa: Pointer to store IPv[46] address * @port: Pointer to store port number, host order - * @addr: struct sockaddr_in (IPv4) or struct sockaddr_in6 (IPv6) + * @addr: AF_INET or AF_INET6 socket address */ static inline void inany_from_sockaddr(union inany_addr *aa, in_port_t *port, - const void *addr) + const union sockaddr_inany *sa) { - const struct sockaddr *sa = (const struct sockaddr *)addr; - if (sa->sa_family == AF_INET6) { - struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *)sa; - - inany_from_af(aa, AF_INET6, &sa6->sin6_addr); - *port = ntohs(sa6->sin6_port); + inany_from_af(aa, AF_INET6, &sa->sa6.sin6_addr); + *port = ntohs(sa->sa6.sin6_port); } else if (sa->sa_family == AF_INET) { - struct sockaddr_in *sa4 = (struct sockaddr_in *)sa; - - inany_from_af(aa, AF_INET, &sa4->sin_addr); - *port = ntohs(sa4->sin_port); + inany_from_af(aa, AF_INET, &sa->sa4.sin_addr); + *port = ntohs(sa->sa4.sin_port); } else { /* Not valid to call with other address families */ ASSERT(0); diff --git a/tcp.c b/tcp.c index bb914bff..a52a1f84 100644 --- a/tcp.c +++ b/tcp.c @@ -2685,7 +2685,7 @@ static void tcp_snat_inbound(const struct ctx *c, union inany_addr *addr) static void tcp_tap_conn_from_sock(struct ctx *c, union tcp_listen_epoll_ref ref, struct tcp_tap_conn *conn, int s, - const struct sockaddr *sa, + const union sockaddr_inany *sa, const struct timespec *now) { conn->f.type = FLOW_TCP; @@ -2721,7 +2721,7 @@ static void tcp_tap_conn_from_sock(struct ctx *c, void tcp_listen_handler(struct ctx *c, union epoll_ref ref, const struct timespec *now) { - struct sockaddr_storage sa; + union sockaddr_inany sa; socklen_t sl = sizeof(sa); union flow *flow; int s; @@ -2729,16 +2729,15 @@ void tcp_listen_handler(struct ctx *c, union epoll_ref ref, if (c->no_tcp || !(flow = flow_alloc())) return; - s = accept4(ref.fd, (struct sockaddr *)&sa, &sl, SOCK_NONBLOCK); + s = accept4(ref.fd, &sa.sa, &sl, SOCK_NONBLOCK); if (s < 0) goto cancel; if (tcp_splice_conn_from_sock(c, ref.tcp_listen, &flow->tcp_splice, - s, (struct sockaddr *)&sa)) + s, &sa)) return; - tcp_tap_conn_from_sock(c, ref.tcp_listen, &flow->tcp, s, - (struct sockaddr *)&sa, now); + tcp_tap_conn_from_sock(c, ref.tcp_listen, &flow->tcp, s, &sa, now); return; cancel: diff --git a/tcp_splice.c b/tcp_splice.c index 3b438313..3a2c0781 100644 --- a/tcp_splice.c +++ b/tcp_splice.c @@ -382,7 +382,7 @@ static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn, bool tcp_splice_conn_from_sock(const struct ctx *c, union tcp_listen_epoll_ref ref, struct tcp_splice_conn *conn, int s0, - const struct sockaddr *sa) + const union sockaddr_inany *sa) { in_port_t srcport, dstport = ref.port; union inany_addr src; diff --git a/tcp_splice.h b/tcp_splice.h index d3600807..f6557cd4 100644 --- a/tcp_splice.h +++ b/tcp_splice.h @@ -7,13 +7,14 @@ #define TCP_SPLICE_H struct tcp_splice_conn; +union sockaddr_inany; void tcp_splice_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events); bool tcp_splice_conn_from_sock(const struct ctx *c, union tcp_listen_epoll_ref ref, struct tcp_splice_conn *conn, int s0, - const struct sockaddr *sa); + const union sockaddr_inany *sa); void tcp_splice_init(struct ctx *c); #endif /* TCP_SPLICE_H */ -- 2.43.0
In both tcp_conn_from_tap() and tcp_splice_connect() we need to construct a socket address for connect() which could be either IPv4 or IPv6. At the moment we initialise both a sockaddr_in and a sockaddr_in6 as locals, then set a pointer to one or the other. This is a little bit ugly. More importantly, though, in the case of tcp_conn_from_tap() initialising the sockaddr_in6 when we're actually passed an IPv4 address will access memory beyond the implied (struct in_addr) we're passed as daddr. In practice that will be a pointer into a packet buffer, so there will be enough valid memory to get 16 bytes of (garbage) IPv6 address that are then ignored. However, it's not a good look to access beyond what the parameters seem to imply is passed. We can clean up these cases using sockaddr_inany and a new helper sockaddr_inany_init(). Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- inany.h | 33 +++++++++++++++++++++++++++++++++ tcp.c | 37 +++++++++++-------------------------- tcp_splice.c | 27 ++++++++------------------- 3 files changed, 52 insertions(+), 45 deletions(-) diff --git a/inany.h b/inany.h index 474e09d0..063545b7 100644 --- a/inany.h +++ b/inany.h @@ -138,4 +138,37 @@ static inline void inany_siphash_feed(struct siphash_state *state, const char *inany_ntop(const union inany_addr *src, char *dst, socklen_t size); +/** sockaddr_inany_init - Construct a sockaddr_inany + * @sa: Pointer to sockaddr to fill in + * @sl: Relevant length of @sa after initialisation + * @af: Address family, AF_INET or AF_INET6 + * @addr: Address (either in_addr or in6_addr) + * @port: Port (host byte order) + * @scope: Scope ID for AF_INET6 (ignored for AF_INET) + */ +static inline void sockaddr_inany_init(union sockaddr_inany *sa, socklen_t *sl, + sa_family_t af, const void *addr, + in_port_t port, uint32_t scope) +{ + sa->sa_family = af; + switch (af) { + case AF_INET: + sa->sa4.sin_addr = *(const struct in_addr *)addr; + sa->sa4.sin_port = htons(port); + *sl = sizeof(sa->sa4); + break; + + case AF_INET6: + sa->sa6.sin6_addr = *(const struct in6_addr *)addr; + sa->sa6.sin6_port = htons(port); + sa->sa6.sin6_scope_id = scope; + sa->sa6.sin6_flowinfo = 0; + *sl = sizeof(sa->sa6); + break; + + default: + ASSERT(0); + } +} + #endif /* INANY_H */ diff --git a/tcp.c b/tcp.c index a52a1f84..6c9edbe1 100644 --- a/tcp.c +++ b/tcp.c @@ -1930,18 +1930,9 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af, const struct tcphdr *th, const char *opts, size_t optlen, const struct timespec *now) { - struct sockaddr_in addr4 = { - .sin_family = AF_INET, - .sin_port = th->dest, - .sin_addr = *(struct in_addr *)daddr, - }; - struct sockaddr_in6 addr6 = { - .sin6_family = AF_INET6, - .sin6_port = th->dest, - .sin6_addr = *(struct in6_addr *)daddr, - }; - const struct sockaddr *sa; + const void *host_daddr = daddr; struct tcp_tap_conn *conn; + union sockaddr_inany sa; union flow *flow; socklen_t sl; int s, mss; @@ -1956,12 +1947,12 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af, if (!c->no_map_gw) { if (af == AF_INET && IN4_ARE_ADDR_EQUAL(daddr, &c->ip4.gw)) - addr4.sin_addr.s_addr = htonl(INADDR_LOOPBACK); + host_daddr = &in4addr_loopback; if (af == AF_INET6 && IN6_ARE_ADDR_EQUAL(daddr, &c->ip6.gw)) - addr6.sin6_addr = in6addr_loopback; + host_daddr = &in6addr_loopback; } - if (af == AF_INET6 && IN6_IS_ADDR_LINKLOCAL(&addr6.sin6_addr)) { + if (af == AF_INET6 && IN6_IS_ADDR_LINKLOCAL(host_daddr)) { struct sockaddr_in6 addr6_ll = { .sin6_family = AF_INET6, .sin6_addr = c->ip6.addr_ll, @@ -1994,13 +1985,7 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af, inany_from_af(&conn->faddr, af, daddr); - if (af == AF_INET) { - sa = (struct sockaddr *)&addr4; - sl = sizeof(addr4); - } else { - sa = (struct sockaddr *)&addr6; - sl = sizeof(addr6); - } + sockaddr_inany_init(&sa, &sl, af, host_daddr, ntohs(th->dest), 0); conn->fport = ntohs(th->dest); conn->eport = ntohs(th->source); @@ -2014,19 +1999,19 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af, tcp_hash_insert(c, conn); - if (!bind(s, sa, sl)) { + if (!bind(s, &sa.sa, sl)) { tcp_rst(c, conn); /* Nobody is listening then */ return; } if (errno != EADDRNOTAVAIL && errno != EACCES) conn_flag(c, conn, LOCAL); - if ((af == AF_INET && !IN4_IS_ADDR_LOOPBACK(&addr4.sin_addr)) || - (af == AF_INET6 && !IN6_IS_ADDR_LOOPBACK(&addr6.sin6_addr) && - !IN6_IS_ADDR_LINKLOCAL(&addr6.sin6_addr))) + if ((af == AF_INET && !IN4_IS_ADDR_LOOPBACK(&sa.sa4.sin_addr)) || + (af == AF_INET6 && !IN6_IS_ADDR_LOOPBACK(&sa.sa6.sin6_addr) && + !IN6_IS_ADDR_LINKLOCAL(&sa.sa6.sin6_addr))) tcp_bind_outbound(c, s, af); - if (connect(s, sa, sl)) { + if (connect(s, &sa.sa, sl)) { if (errno != EINPROGRESS) { tcp_rst(c, conn); return; diff --git a/tcp_splice.c b/tcp_splice.c index 3a2c0781..20f56ac3 100644 --- a/tcp_splice.c +++ b/tcp_splice.c @@ -327,17 +327,7 @@ static int tcp_splice_connect_finish(const struct ctx *c, static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn, int sock_conn, in_port_t port) { - struct sockaddr_in6 addr6 = { - .sin6_family = AF_INET6, - .sin6_port = htons(port), - .sin6_addr = IN6ADDR_LOOPBACK_INIT, - }; - struct sockaddr_in addr4 = { - .sin_family = AF_INET, - .sin_port = htons(port), - .sin_addr = IN4ADDR_LOOPBACK_INIT, - }; - const struct sockaddr *sa; + union sockaddr_inany sa; socklen_t sl; conn->s[1] = sock_conn; @@ -348,15 +338,14 @@ static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn, conn->s[1]); } - if (CONN_V6(conn)) { - sa = (struct sockaddr *)&addr6; - sl = sizeof(addr6); - } else { - sa = (struct sockaddr *)&addr4; - sl = sizeof(addr4); - } + if (CONN_V6(conn)) + sockaddr_inany_init(&sa, &sl, + AF_INET6, &in6addr_loopback, port, 0); + else + sockaddr_inany_init(&sa, &sl, + AF_INET, &in4addr_loopback, port, 0); - if (connect(conn->s[1], sa, sl)) { + if (connect(conn->s[1], &sa.sa, sl)) { if (errno != EINPROGRESS) return -errno; conn_event(c, conn, SPLICE_CONNECT); -- 2.43.0
inany_from_af() can be used to initialise a union inany_addr from either an IPv4 or IPv6 address. If we want to set it to the unspecified IPv4 or IPv6 address we can do that, but need to explicitly pass the correct address version. Make this easier by allowing it to interpret a NULL address as the unspecified address of the appropriate family. We only have one trivial use for this at present, but it will have more uses in future. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- inany.h | 10 ++++++++-- tcp.c | 2 +- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/inany.h b/inany.h index 063545b7..70237821 100644 --- a/inany.h +++ b/inany.h @@ -92,11 +92,17 @@ static inline void inany_from_af(union inany_addr *aa, sa_family_t af, const void *addr) { if (af == AF_INET6) { - aa->a6 = *((struct in6_addr *)addr); + if (addr) + aa->a6 = *((struct in6_addr *)addr); + else + aa->a6 = in6addr_any; } else if (af == AF_INET) { memset(&aa->v4mapped.zero, 0, sizeof(aa->v4mapped.zero)); memset(&aa->v4mapped.one, 0xff, sizeof(aa->v4mapped.one)); - aa->v4mapped.a4 = *((struct in_addr *)addr); + if (addr) + aa->v4mapped.a4 = *((struct in_addr *)addr); + else + aa->v4mapped.a4 = in4addr_any; } else { /* Not valid to call with other address families */ ASSERT(0); diff --git a/tcp.c b/tcp.c index 6c9edbe1..1326584e 100644 --- a/tcp.c +++ b/tcp.c @@ -883,7 +883,7 @@ static void tcp_rtt_dst_check(const struct tcp_tap_conn *conn, low_rtt_dst[hole++] = conn->faddr; if (hole == LOW_RTT_TABLE_SIZE) hole = 0; - inany_from_af(low_rtt_dst + hole, AF_INET6, &in6addr_any); + inany_from_af(low_rtt_dst + hole, AF_INET6, NULL); #else (void)conn; (void)tinfo; -- 2.43.0
On Mon, Jan 29, 2024 at 03:35:41PM +1100, David Gibson wrote:Here's another batch of cleanups and tweaks in preparation for the flow table. This set focuses on improved helpers for handling addresses, particularly in the TCP splice path.Sorry Stefano, I jumped the gun a bit here. Feel free to do a review if you have the time, but don't apply yet. I have some changes that still need to be made here. -- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson