This is a first draft of the first part of implementing a more general flow table (connection tracking) as described at: https://pad.passt.top/p/NewForwardingModel This is by no means complete. So far it doesn't really do anything new, it just reorganizes the TCP connection table to be closer to the more general flow table. Still it's ready for preliminary review. David Gibson (8): tap: Don't clobber source address in tap6_handler() tap: Pass source address to protocol handler functions tcp: More precise terms for addresses and ports tcp, udp: Don't include destination address in partially precomputed csums tcp, udp: Don't pre-fill IPv4 destination address in headers tcp: Track guest-side correspondent address tcp, flow: Introduce struct demiflow tcp, flow: Perform TCP hash calculations based on demiflow structure flow.h | 66 +++++++++++++++ icmp.c | 12 ++- icmp.h | 3 +- passt.c | 10 +-- passt.h | 4 +- pasta.c | 2 +- siphash.c | 1 + tap.c | 29 ++++--- tcp.c | 227 ++++++++++++++++++++------------------------------- tcp.h | 5 +- tcp_conn.h | 9 +- tcp_splice.c | 2 + udp.c | 37 +++------ udp.h | 5 +- util.h | 4 +- 15 files changed, 209 insertions(+), 207 deletions(-) create mode 100644 flow.h -- 2.41.0
In tap6_handler() saddr is initialized to the IPv6 source address from the incoming packet. However part way through, but before organizing the packet into a "sequence" we set it unconditionally to the guest's assigned address. We don't do anything equivalent for IPv4. This doesn't make a lot of sense: if the guest is using a different source address it makes sense to consider these different sequences of packets and we shouldn't try to combine them together. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- tap.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/tap.c b/tap.c index a6a73d3..d8aa633 100644 --- a/tap.c +++ b/tap.c @@ -817,8 +817,6 @@ resume: continue; } - *saddr = c->ip6.addr; - if (proto != IPPROTO_TCP && proto != IPPROTO_UDP) { tap_packet_debug(NULL, ip6h, NULL, proto, NULL, 1); continue; -- 2.41.0
The tap code passes the IPv4 or IPv6 destination address of packets it receives to the protocol specific code. Currently that protocol code doesn't use the source address, but we want it to in future. So, in preparation, pass the IPv4/IPv6 source address of tap packets to those functions as well. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- icmp.c | 12 ++++++++---- icmp.h | 3 ++- tap.c | 19 +++++++++++-------- tcp.c | 28 +++++++++++++++++----------- tcp.h | 2 +- udp.c | 14 ++++++++------ udp.h | 2 +- 7 files changed, 48 insertions(+), 32 deletions(-) diff --git a/icmp.c b/icmp.c index 44f73c8..65e3ec9 100644 --- a/icmp.c +++ b/icmp.c @@ -138,17 +138,21 @@ void icmp_sock_handler(const struct ctx *c, union epoll_ref ref, * icmp_tap_handler() - Handle packets from tap * @c: Execution context * @af: Address family, AF_INET or AF_INET6 - * @addr: Destination address + * @saddr: Source address + * @daddr: Destination address * @p: Packet pool, single packet with ICMP/ICMPv6 header * @now: Current timestamp * * Return: count of consumed packets (always 1, even if malformed) */ -int icmp_tap_handler(const struct ctx *c, int af, const void *addr, +int icmp_tap_handler(const struct ctx *c, int af, + const void *saddr, const void *daddr, const struct pool *p, const struct timespec *now) { size_t plen; + (void)saddr; + if (af == AF_INET) { union icmp_epoll_ref iref = { .icmp.v6 = 0 }; struct sockaddr_in sa = { @@ -194,7 +198,7 @@ int icmp_tap_handler(const struct ctx *c, int af, const void *addr, icmp_id_map[V4][id].ts = now->tv_sec; bitmap_set(icmp_act[V4], id); - sa.sin_addr = *(struct in_addr *)addr; + sa.sin_addr = *(struct in_addr *)daddr; if (sendto(s, ih, sizeof(*ih) + plen, MSG_NOSIGNAL, (struct sockaddr *)&sa, sizeof(sa)) < 0) { debug("ICMP: failed to relay request to socket"); @@ -248,7 +252,7 @@ int icmp_tap_handler(const struct ctx *c, int af, const void *addr, icmp_id_map[V6][id].ts = now->tv_sec; bitmap_set(icmp_act[V6], id); - sa.sin6_addr = *(struct in6_addr *)addr; + sa.sin6_addr = *(struct in6_addr *)daddr; if (sendto(s, ih, sizeof(*ih) + plen, MSG_NOSIGNAL, (struct sockaddr *)&sa, sizeof(sa)) < 1) { debug("ICMPv6: failed to relay request to socket"); diff --git a/icmp.h b/icmp.h index f899e6d..231f6f0 100644 --- a/icmp.h +++ b/icmp.h @@ -12,7 +12,8 @@ struct ctx; void icmp_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events, const struct timespec *now); -int icmp_tap_handler(const struct ctx *c, int af, const void *addr, +int icmp_tap_handler(const struct ctx *c, + int af, const void *saddr, const void *daddr, const struct pool *p, const struct timespec *now); void icmp_timer(const struct ctx *c, const struct timespec *ts); void icmp_init(void); diff --git a/tap.c b/tap.c index d8aa633..5e1daf8 100644 --- a/tap.c +++ b/tap.c @@ -642,7 +642,8 @@ resume: tap_packet_debug(iph, NULL, NULL, 0, NULL, 1); packet_add(pkt, l4_len, l4h); - icmp_tap_handler(c, AF_INET, &iph->daddr, pkt, now); + icmp_tap_handler(c, AF_INET, &iph->saddr, &iph->daddr, + pkt, now); continue; } @@ -707,7 +708,6 @@ append: for (j = 0, seq = tap4_l4; j < seq_count; j++, seq++) { struct pool *p = (struct pool *)&seq->p; - struct in_addr *da = &seq->daddr; size_t n = p->count; tap_packet_debug(NULL, NULL, seq, 0, NULL, n); @@ -715,11 +715,13 @@ append: if (seq->protocol == IPPROTO_TCP) { if (c->no_tcp) continue; - while ((n -= tcp_tap_handler(c, AF_INET, da, p, now))); + while ((n -= tcp_tap_handler(c, AF_INET, &seq->saddr, + &seq->daddr, p, now))); } else if (seq->protocol == IPPROTO_UDP) { if (c->no_udp) continue; - while ((n -= udp_tap_handler(c, AF_INET, da, p, now))); + while ((n -= udp_tap_handler(c, AF_INET, &seq->saddr, + &seq->daddr, p, now))); } } @@ -800,7 +802,7 @@ resume: tap_packet_debug(NULL, ip6h, NULL, proto, NULL, 1); packet_add(pkt, l4_len, l4h); - icmp_tap_handler(c, AF_INET6, daddr, pkt, now); + icmp_tap_handler(c, AF_INET6, saddr, daddr, pkt, now); continue; } @@ -867,7 +869,6 @@ append: for (j = 0, seq = tap6_l4; j < seq_count; j++, seq++) { struct pool *p = (struct pool *)&seq->p; - struct in6_addr *da = &seq->daddr; size_t n = p->count; tap_packet_debug(NULL, NULL, NULL, seq->protocol, seq, n); @@ -875,11 +876,13 @@ append: if (seq->protocol == IPPROTO_TCP) { if (c->no_tcp) continue; - while ((n -= tcp_tap_handler(c, AF_INET6, da, p, now))); + while ((n -= tcp_tap_handler(c, AF_INET6, &seq->saddr, + &seq->daddr, p, now))); } else if (seq->protocol == IPPROTO_UDP) { if (c->no_udp) continue; - while ((n -= udp_tap_handler(c, AF_INET6, da, p, now))); + while ((n -= udp_tap_handler(c, AF_INET6, &seq->saddr, + &seq->daddr, p, now))); } } diff --git a/tcp.c b/tcp.c index 0ed9bfa..482c25b 100644 --- a/tcp.c +++ b/tcp.c @@ -2007,13 +2007,15 @@ static void tcp_bind_outbound(const struct ctx *c, int s, sa_family_t af) * tcp_conn_from_tap() - Handle connection request (SYN segment) from tap * @c: Execution context * @af: Address family, AF_INET or AF_INET6 - * @addr: Remote address, pointer to in_addr or in6_addr + * @saddr: Source address, pointer to in_addr or in6_addr + * @daddr: Destination address, pointer to in_addr or in6_addr * @th: TCP header from tap: caller MUST ensure it's there * @opts: Pointer to start of options * @optlen: Bytes in options: caller MUST ensure available length * @now: Current timestamp */ -static void tcp_conn_from_tap(struct ctx *c, int af, const void *addr, +static void tcp_conn_from_tap(struct ctx *c, + int af, const void *saddr, const void *daddr, const struct tcphdr *th, const char *opts, size_t optlen, const struct timespec *now) { @@ -2021,18 +2023,20 @@ static void tcp_conn_from_tap(struct ctx *c, int af, const void *addr, struct sockaddr_in addr4 = { .sin_family = AF_INET, .sin_port = th->dest, - .sin_addr = *(struct in_addr *)addr, + .sin_addr = *(struct in_addr *)daddr, }; struct sockaddr_in6 addr6 = { .sin6_family = AF_INET6, .sin6_port = th->dest, - .sin6_addr = *(struct in6_addr *)addr, + .sin6_addr = *(struct in6_addr *)daddr, }; const struct sockaddr *sa; struct tcp_tap_conn *conn; socklen_t sl; int s, mss; + (void)saddr; + if (c->tcp.conn_count >= TCP_MAX_CONNS) return; @@ -2041,9 +2045,9 @@ static void tcp_conn_from_tap(struct ctx *c, int af, const void *addr, return; if (!c->no_map_gw) { - if (af == AF_INET && IN4_ARE_ADDR_EQUAL(addr, &c->ip4.gw)) + if (af == AF_INET && IN4_ARE_ADDR_EQUAL(daddr, &c->ip4.gw)) addr4.sin_addr.s_addr = htonl(INADDR_LOOPBACK); - if (af == AF_INET6 && IN6_ARE_ADDR_EQUAL(addr, &c->ip6.gw)) + if (af == AF_INET6 && IN6_ARE_ADDR_EQUAL(daddr, &c->ip6.gw)) addr6.sin6_addr = in6addr_loopback; } @@ -2080,7 +2084,7 @@ static void tcp_conn_from_tap(struct ctx *c, int af, const void *addr, if (!(conn->wnd_from_tap = (htons(th->window) >> conn->ws_from_tap))) conn->wnd_from_tap = 1; - inany_from_af(&conn->addr, af, addr); + inany_from_af(&conn->addr, af, daddr); if (af == AF_INET) { sa = (struct sockaddr *)&addr4; @@ -2558,13 +2562,14 @@ static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_tap_conn *conn, * tcp_tap_handler() - Handle packets from tap and state transitions * @c: Execution context * @af: Address family, AF_INET or AF_INET6 - * @addr: Destination address + * @saddr: Source address + * @daddr: Destination address * @p: Pool of TCP packets, with TCP headers * @now: Current timestamp * * Return: count of consumed packets */ -int tcp_tap_handler(struct ctx *c, int af, const void *addr, +int tcp_tap_handler(struct ctx *c, int af, const void *saddr, const void *daddr, const struct pool *p, const struct timespec *now) { struct tcp_tap_conn *conn; @@ -2585,12 +2590,13 @@ int tcp_tap_handler(struct ctx *c, int af, const void *addr, optlen = MIN(optlen, ((1UL << 4) /* from doff width */ - 6) * 4UL); opts = packet_get(p, 0, sizeof(*th), optlen, NULL); - conn = tcp_hash_lookup(c, af, addr, htons(th->source), htons(th->dest)); + conn = tcp_hash_lookup(c, af, daddr, htons(th->source), htons(th->dest)); /* New connection from tap */ if (!conn) { if (opts && th->syn && !th->ack) - tcp_conn_from_tap(c, af, addr, th, opts, optlen, now); + tcp_conn_from_tap(c, af, saddr, daddr, th, + opts, optlen, now); return 1; } diff --git a/tcp.h b/tcp.h index a9e3425..66a73eb 100644 --- a/tcp.h +++ b/tcp.h @@ -15,7 +15,7 @@ struct ctx; void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events, const struct timespec *now); -int tcp_tap_handler(struct ctx *c, int af, const void *addr, +int tcp_tap_handler(struct ctx *c, int af, const void *saddr, const void *daddr, const struct pool *p, const struct timespec *now); int tcp_sock_init(const struct ctx *c, sa_family_t af, const void *addr, const char *ifname, in_port_t port); diff --git a/udp.c b/udp.c index 39c59d4..47b4a61 100644 --- a/udp.c +++ b/udp.c @@ -801,7 +801,8 @@ void udp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events, * udp_tap_handler() - Handle packets from tap * @c: Execution context * @af: Address family, AF_INET or AF_INET6 - * @addr: Destination address + * @saddr: Source address + * @daddr: Destination address * @p: Pool of UDP packets, with UDP headers * @now: Current timestamp * @@ -809,7 +810,7 @@ void udp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events, * * #syscalls sendmmsg */ -int udp_tap_handler(struct ctx *c, int af, const void *addr, +int udp_tap_handler(struct ctx *c, int af, const void *saddr, const void *daddr, const struct pool *p, const struct timespec *now) { struct mmsghdr mm[UIO_MAXIOV]; @@ -823,6 +824,7 @@ int udp_tap_handler(struct ctx *c, int af, const void *addr, socklen_t sl; (void)c; + (void)saddr; uh = packet_get(p, 0, 0, sizeof(*uh), NULL); if (!uh) @@ -838,7 +840,7 @@ int udp_tap_handler(struct ctx *c, int af, const void *addr, s_in = (struct sockaddr_in) { .sin_family = AF_INET, .sin_port = uh->dest, - .sin_addr = *(struct in_addr *)addr, + .sin_addr = *(struct in_addr *)daddr, }; sa = (struct sockaddr *)&s_in; @@ -883,17 +885,17 @@ int udp_tap_handler(struct ctx *c, int af, const void *addr, s_in6 = (struct sockaddr_in6) { .sin6_family = AF_INET6, .sin6_port = uh->dest, - .sin6_addr = *(struct in6_addr *)addr, + .sin6_addr = *(struct in6_addr *)daddr, }; const struct in6_addr *bind_addr = &in6addr_any; sa = (struct sockaddr *)&s_in6; sl = sizeof(s_in6); - if (IN6_ARE_ADDR_EQUAL(addr, &c->ip6.dns_match) && + if (IN6_ARE_ADDR_EQUAL(daddr, &c->ip6.dns_match) && ntohs(s_in6.sin6_port) == 53) { s_in6.sin6_addr = c->ip6.dns_host; - } else if (IN6_ARE_ADDR_EQUAL(addr, &c->ip6.gw) && + } else if (IN6_ARE_ADDR_EQUAL(daddr, &c->ip6.gw) && !c->no_map_gw) { if (!(udp_tap_map[V6][dst].flags & PORT_LOCAL) || (udp_tap_map[V6][dst].flags & PORT_LOOPBACK)) diff --git a/udp.h b/udp.h index e64ef18..060ae35 100644 --- a/udp.h +++ b/udp.h @@ -10,7 +10,7 @@ void udp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events, const struct timespec *now); -int udp_tap_handler(struct ctx *c, int af, const void *addr, +int udp_tap_handler(struct ctx *c, int af, const void *saddr, const void *daddr, const struct pool *p, const struct timespec *now); int udp_sock_init(const struct ctx *c, int ns, sa_family_t af, const void *addr, const char *ifname, in_port_t port); -- 2.41.0
In a number of places the comments and variable names we use to describe addresses and ports are ambiguous. It's not sufficient to describe a port as "tap-facing" or "socket-facing", because on both the tap side and the socket side there are two ports for the two ends of the connection. Similarly, "local" and "remote" aren't particularly helpful, because it's not necessarily clear whether we're talking from the point of view of the guest/namespace, the host, or passt itself. This patch makes a number of changes to be more precise about this. It introduces two new terms in aid of this: A "forwarding" address (or port) refers to an address which is local from the point of view of passt itself. That is a source address for traffic sent by passt, whether it's to the guest via the tap interface or to a host on the internet via a socket. The "correspondent" address (or port) is the reverse: a remote address from passt's point of view, the destination address for traffic sent by passt. Between them the "side" (either tap/guest-facing or sock/host-facing) and forwarding/correspondent unambiguously describes which address or port we're talking about. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- tcp.c | 93 +++++++++++++++++++++++++++--------------------------- tcp_conn.h | 12 +++---- 2 files changed, 53 insertions(+), 52 deletions(-) diff --git a/tcp.c b/tcp.c index 482c25b..c1bfc4f 100644 --- a/tcp.c +++ b/tcp.c @@ -401,7 +401,7 @@ struct tcp6_l2_head { /* For MSS6 macro: keep in sync with tcp6_l2_buf_t */ #define OPT_SACK 5 #define OPT_TS 8 -#define CONN_V4(conn) (!!inany_v4(&(conn)->addr)) +#define CONN_V4(conn) (!!inany_v4(&(conn)->faddr)) #define CONN_V6(conn) (!CONN_V4(conn)) #define CONN_IS_CLOSING(conn) \ ((conn->events & ESTABLISHED) && \ @@ -434,7 +434,9 @@ static const char *tcp_flag_str[] __attribute((__unused__)) = { static int tcp_sock_init_ext [NUM_PORTS][IP_VERSIONS]; static int tcp_sock_ns [NUM_PORTS][IP_VERSIONS]; -/* Table of destinations with very low RTT (assumed to be local), LRU */ +/* Table of guest side forwarding addresses with very low RTT (assumed + * to be local to the host), LRU + */ static union inany_addr low_rtt_dst[LOW_RTT_TABLE_SIZE]; /* Static buffers */ @@ -860,7 +862,7 @@ static int tcp_rtt_dst_low(const struct tcp_tap_conn *conn) int i; for (i = 0; i < LOW_RTT_TABLE_SIZE; i++) - if (inany_equals(&conn->addr, low_rtt_dst + i)) + if (inany_equals(&conn->faddr, low_rtt_dst + i)) return 1; return 0; @@ -882,7 +884,7 @@ static void tcp_rtt_dst_check(const struct tcp_tap_conn *conn, return; for (i = 0; i < LOW_RTT_TABLE_SIZE; i++) { - if (inany_equals(&conn->addr, low_rtt_dst + i)) + if (inany_equals(&conn->faddr, low_rtt_dst + i)) return; if (hole == -1 && IN6_IS_ADDR_UNSPECIFIED(low_rtt_dst + i)) hole = i; @@ -894,7 +896,7 @@ static void tcp_rtt_dst_check(const struct tcp_tap_conn *conn, if (hole == -1) return; - low_rtt_dst[hole++] = conn->addr; + low_rtt_dst[hole++] = conn->faddr; if (hole == LOW_RTT_TABLE_SIZE) hole = 0; inany_from_af(low_rtt_dst + hole, AF_INET6, &in6addr_any); @@ -1164,18 +1166,18 @@ static int tcp_opt_get(const char *opts, size_t len, uint8_t type_find, /** * tcp_hash_match() - Check if a connection entry matches address and ports * @conn: Connection entry to match against - * @addr: Remote address - * @tap_port: tap-facing port - * @sock_port: Socket-facing port + * @faddr: Guest side forwarding address + * @cport: Guest side correspondent port + * @fport: Guest side forwarding port * * Return: 1 on match, 0 otherwise */ static int tcp_hash_match(const struct tcp_tap_conn *conn, - const union inany_addr *addr, - in_port_t tap_port, in_port_t sock_port) + const union inany_addr *faddr, + in_port_t cport, in_port_t fport) { - if (inany_equals(&conn->addr, addr) && - conn->tap_port == tap_port && conn->sock_port == sock_port) + if (inany_equals(&conn->faddr, faddr) && + conn->cport == cport && conn->fport == fport) return 1; return 0; @@ -1184,21 +1186,21 @@ static int tcp_hash_match(const struct tcp_tap_conn *conn, /** * tcp_hash() - Calculate hash value for connection given address and ports * @c: Execution context - * @addr: Remote address - * @tap_port: tap-facing port - * @sock_port: Socket-facing port + * @faddr: Guest side forwarding address + * @cport: Guest side correspondent port + * @fport: Guest side forwarding port * * Return: hash value, already modulo size of the hash table */ -static unsigned int tcp_hash(const struct ctx *c, const union inany_addr *addr, - in_port_t tap_port, in_port_t sock_port) +static unsigned int tcp_hash(const struct ctx *c, const union inany_addr *faddr, + in_port_t cport, in_port_t fport) { struct { - union inany_addr addr; - in_port_t tap_port; - in_port_t sock_port; + union inany_addr faddr; + in_port_t cport; + in_port_t fport; } __attribute__((__packed__)) in = { - *addr, tap_port, sock_port + *faddr, cport, fport }; uint64_t b = 0; @@ -1217,7 +1219,7 @@ static unsigned int tcp_hash(const struct ctx *c, const union inany_addr *addr, static unsigned int tcp_conn_hash(const struct ctx *c, const struct tcp_tap_conn *conn) { - return tcp_hash(c, &conn->addr, conn->tap_port, conn->sock_port); + return tcp_hash(c, &conn->faddr, conn->cport, conn->fport); } /** @@ -1229,7 +1231,7 @@ static void tcp_hash_insert(const struct ctx *c, struct tcp_tap_conn *conn) { int b; - b = tcp_hash(c, &conn->addr, conn->tap_port, conn->sock_port); + b = tcp_hash(c, &conn->faddr, conn->cport, conn->fport); conn->next_index = tc_hash[b] ? CONN_IDX(tc_hash[b]) : -1; tc_hash[b] = conn; @@ -1298,25 +1300,24 @@ static void tcp_tap_conn_update(struct ctx *c, struct tcp_tap_conn *old, * tcp_hash_lookup() - Look up connection given remote address and ports * @c: Execution context * @af: Address family, AF_INET or AF_INET6 - * @addr: Remote address, pointer to in_addr or in6_addr - * @tap_port: tap-facing port - * @sock_port: Socket-facing port + * @faddr: Guest side forwarding address (guest remote address) + * @cport: Guest side correspondent port (guest local port) + * @fport: Guest side forwarding port (guest remote port) * * Return: connection pointer, if found, -ENOENT otherwise */ static struct tcp_tap_conn *tcp_hash_lookup(const struct ctx *c, - int af, const void *addr, - in_port_t tap_port, - in_port_t sock_port) + int af, const void *faddr, + in_port_t cport, in_port_t fport) { union inany_addr aany; struct tcp_tap_conn *conn; int b; - inany_from_af(&aany, af, addr); - b = tcp_hash(c, &aany, tap_port, sock_port); + inany_from_af(&aany, af, faddr); + b = tcp_hash(c, &aany, cport, fport); for (conn = tc_hash[b]; conn; conn = conn_at_idx(conn->next_index)) { - if (tcp_hash_match(conn, &aany, tap_port, sock_port)) + if (tcp_hash_match(conn, &aany, cport, fport)) return conn; } @@ -1449,13 +1450,13 @@ static size_t tcp_l2_buf_fill_headers(const struct ctx *c, void *p, size_t plen, const uint16_t *check, uint32_t seq) { - const struct in_addr *a4 = inany_v4(&conn->addr); + const struct in_addr *a4 = inany_v4(&conn->faddr); size_t ip_len, tlen; #define SET_TCP_HEADER_COMMON_V4_V6(b, conn, seq) \ do { \ - b->th.source = htons(conn->sock_port); \ - b->th.dest = htons(conn->tap_port); \ + b->th.source = htons(conn->fport); \ + b->th.dest = htons(conn->cport); \ b->th.seq = htonl(seq); \ b->th.ack_seq = htonl(conn->seq_ack_to_tap); \ if (conn->events & ESTABLISHED) { \ @@ -1491,7 +1492,7 @@ do { \ ip_len = plen + sizeof(struct ipv6hdr) + sizeof(struct tcphdr); b->ip6h.payload_len = htons(plen + sizeof(struct tcphdr)); - b->ip6h.saddr = conn->addr.a6; + b->ip6h.saddr = conn->faddr.a6; if (IN6_IS_ADDR_LINKLOCAL(&b->ip6h.saddr)) b->ip6h.daddr = c->ip6.addr_ll_seen; else @@ -1844,7 +1845,7 @@ static void tcp_clamp_window(const struct ctx *c, struct tcp_tap_conn *conn, /** * tcp_seq_init() - Calculate initial sequence number according to RFC 6528 * @c: Execution context - * @conn: TCP connection, with addr, sock_port and tap_port populated + * @conn: TCP connection, with faddr, fport and cport populated * @now: Current timestamp */ static void tcp_seq_init(const struct ctx *c, struct tcp_tap_conn *conn, @@ -1857,9 +1858,9 @@ static void tcp_seq_init(const struct ctx *c, struct tcp_tap_conn *conn, union inany_addr dst; in_port_t dstport; } __attribute__((__packed__)) in = { - .src = conn->addr, - .srcport = conn->tap_port, - .dstport = conn->sock_port, + .src = conn->faddr, + .srcport = conn->cport, + .dstport = conn->fport, }; uint32_t ns, seq = 0; @@ -2084,7 +2085,7 @@ static void tcp_conn_from_tap(struct ctx *c, if (!(conn->wnd_from_tap = (htons(th->window) >> conn->ws_from_tap))) conn->wnd_from_tap = 1; - inany_from_af(&conn->addr, af, daddr); + inany_from_af(&conn->faddr, af, daddr); if (af == AF_INET) { sa = (struct sockaddr *)&addr4; @@ -2094,8 +2095,8 @@ static void tcp_conn_from_tap(struct ctx *c, sl = sizeof(addr6); } - conn->sock_port = ntohs(th->dest); - conn->tap_port = ntohs(th->source); + conn->fport = ntohs(th->dest); + conn->cport = ntohs(th->source); conn->seq_init_from_tap = ntohl(th->seq); conn->seq_from_tap = conn->seq_init_from_tap + 1; @@ -2754,10 +2755,10 @@ static void tcp_tap_conn_from_sock(struct ctx *c, union epoll_ref ref, conn->ws_to_tap = conn->ws_from_tap = 0; conn_event(c, conn, SOCK_ACCEPTED); - inany_from_sockaddr(&conn->addr, &conn->sock_port, sa); - conn->tap_port = ref.r.p.tcp.tcp.index; + inany_from_sockaddr(&conn->faddr, &conn->fport, sa); + conn->cport = ref.r.p.tcp.tcp.index; - tcp_snat_inbound(c, &conn->addr); + tcp_snat_inbound(c, &conn->faddr); tcp_seq_init(c, conn, now); tcp_hash_insert(c, conn); diff --git a/tcp_conn.h b/tcp_conn.h index 9e2b1bf..ba2a1ef 100644 --- a/tcp_conn.h +++ b/tcp_conn.h @@ -35,9 +35,9 @@ extern const char *tcp_common_flag_str[]; * @ws_to_tap: Window scaling factor advertised to tap/guest * @sndbuf: Sending buffer in kernel, rounded to 2 ^ SNDBUF_BITS * @seq_dup_ack_approx: Last duplicate ACK number sent to tap - * @addr: Remote address (IPv4 or IPv6) - * @tap_port: Guest-facing tap port - * @sock_port: Remote, socket-facing port + * @faddr: Guest side forwarding address (guest's remote address) + * @cport: Guest side correspondent port (guest's local port) + * @fport: Guest side forwarding port (guest's remote port) * @wnd_from_tap: Last window size from tap, unscaled (as received) * @wnd_to_tap: Sending window advertised to tap, unscaled (as sent) * @seq_to_tap: Next sequence for packets to tap @@ -105,9 +105,9 @@ struct tcp_tap_conn { uint8_t seq_dup_ack_approx; - union inany_addr addr; - in_port_t tap_port; - in_port_t sock_port; + union inany_addr faddr; + in_port_t cport; + in_port_t fport; uint16_t wnd_from_tap; uint16_t wnd_to_tap; -- 2.41.0
We partially prepopulate IP and TCP header structures including, amongst other things the destination address, which for IPv4 is always the known address of the guest/namespace. We partially precompute both the IPv4 header checksum and the TCP checksum based on this. In future we're going to want more flexibility with controlling the destination for IPv4 (as we already do for IPv6), so this precomputed value gets in the way. Therefore remove the IPv4 destination from the precomputed checksum and fold it into the checksum update when we actually send a packet. Doing this means we no longer need to recompute those partial sums when the destination address changes ({tcp,udp}_update_l2_buf()) and instead the computation can be moved to compile time. This means while we perform slightly more computations on each packet, we slightly reduce the amount of memory we need to access. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- tcp.c | 61 ++++++++++++++++++++-------------------------------------- udp.c | 14 +++----------- util.h | 4 +++- 3 files changed, 27 insertions(+), 52 deletions(-) diff --git a/tcp.c b/tcp.c index c1bfc4f..c0bffb3 100644 --- a/tcp.c +++ b/tcp.c @@ -323,10 +323,8 @@ #define MSS_DEFAULT 536 struct tcp4_l2_head { /* For MSS4 macro: keep in sync with tcp4_l2_buf_t */ - uint32_t psum; - uint32_t tsum; #ifdef __AVX2__ - uint8_t pad[18]; + uint8_t pad[26]; #else uint8_t pad[2]; #endif @@ -443,8 +441,6 @@ static union inany_addr low_rtt_dst[LOW_RTT_TABLE_SIZE]; /** * tcp4_l2_buf_t - Pre-cooked IPv4 packet buffers for tap connections - * @psum: Partial IP header checksum (excluding tot_len and saddr) - * @tsum: Partial TCP header checksum (excluding length and saddr) * @pad: Align TCP header to 32 bytes, for AVX2 checksum calculation only * @taph: Tap-level headers (partially pre-filled) * @iph: Pre-filled IP header (except for tot_len and saddr) @@ -452,17 +448,15 @@ static union inany_addr low_rtt_dst[LOW_RTT_TABLE_SIZE]; * @data: Storage for TCP payload */ static struct tcp4_l2_buf_t { - uint32_t psum; /* 0 */ - uint32_t tsum; /* 4 */ #ifdef __AVX2__ - uint8_t pad[18]; /* 8, align th to 32 bytes */ + uint8_t pad[26]; /* 0, align th to 32 bytes */ #else - uint8_t pad[2]; /* align iph to 4 bytes 8 */ + uint8_t pad[2]; /* align iph to 4 bytes 0 */ #endif - struct tap_hdr taph; /* 26 10 */ - struct iphdr iph; /* 44 28 */ - struct tcphdr th; /* 64 48 */ - uint8_t data[MSS4]; /* 84 68 */ + struct tap_hdr taph; /* 26 2 */ + struct iphdr iph; /* 44 20 */ + struct tcphdr th; /* 64 40 */ + uint8_t data[MSS4]; /* 84 60 */ /* 65536 65532 */ #ifdef __AVX2__ } __attribute__ ((packed, aligned(32))) @@ -517,8 +511,6 @@ static struct iovec tcp_iov [UIO_MAXIOV]; /** * tcp4_l2_flags_buf_t - IPv4 packet buffers for segments without data (flags) - * @psum: Partial IP header checksum (excluding tot_len and saddr) - * @tsum: Partial TCP header checksum (excluding length and saddr) * @pad: Align TCP header to 32 bytes, for AVX2 checksum calculation only * @taph: Tap-level headers (partially pre-filled) * @iph: Pre-filled IP header (except for tot_len and saddr) @@ -526,16 +518,14 @@ static struct iovec tcp_iov [UIO_MAXIOV]; * @opts: Headroom for TCP options */ static struct tcp4_l2_flags_buf_t { - uint32_t psum; /* 0 */ - uint32_t tsum; /* 4 */ #ifdef __AVX2__ - uint8_t pad[18]; /* 8, align th to 32 bytes */ + uint8_t pad[26]; /* 0, align th to 32 bytes */ #else - uint8_t pad[2]; /* align iph to 4 bytes 8 */ + uint8_t pad[2]; /* align iph to 4 bytes 0 */ #endif - struct tap_hdr taph; /* 26 10 */ - struct iphdr iph; /* 44 28 */ - struct tcphdr th; /* 64 48 */ + struct tap_hdr taph; /* 26 2 */ + struct iphdr iph; /* 44 20 */ + struct tcphdr th; /* 64 40 */ char opts[OPT_MSS_LEN + OPT_WS_LEN + 1]; #ifdef __AVX2__ } __attribute__ ((packed, aligned(32))) @@ -955,11 +945,13 @@ void tcp_sock_set_bufsize(const struct ctx *c, int s) */ static void tcp_update_check_ip4(struct tcp4_l2_buf_t *buf) { - uint32_t sum = buf->psum; + uint32_t sum = L2_BUF_IP4_PSUM(IPPROTO_TCP); sum += buf->iph.tot_len; sum += (buf->iph.saddr >> 16) & 0xffff; sum += buf->iph.saddr & 0xffff; + sum += (buf->iph.daddr >> 16) & 0xffff; + sum += buf->iph.daddr & 0xffff; buf->iph.check = (uint16_t)~csum_fold(sum); } @@ -971,10 +963,12 @@ static void tcp_update_check_ip4(struct tcp4_l2_buf_t *buf) static void tcp_update_check_tcp4(struct tcp4_l2_buf_t *buf) { uint16_t tlen = ntohs(buf->iph.tot_len) - 20; - uint32_t sum = buf->tsum; + uint32_t sum = htons(IPPROTO_TCP); sum += (buf->iph.saddr >> 16) & 0xffff; sum += buf->iph.saddr & 0xffff; + sum += (buf->iph.daddr >> 16) & 0xffff; + sum += buf->iph.daddr & 0xffff; sum += htons(ntohs(buf->iph.tot_len) - 20); buf->th.check = 0; @@ -1025,20 +1019,6 @@ void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s, if (ip_da) { b4f->iph.daddr = b4->iph.daddr = ip_da->s_addr; - if (!i) { - b4f->iph.saddr = b4->iph.saddr = 0; - b4f->iph.tot_len = b4->iph.tot_len = 0; - b4f->iph.check = b4->iph.check = 0; - b4f->psum = b4->psum = sum_16b(&b4->iph, 20); - - b4->tsum = ((ip_da->s_addr >> 16) & 0xffff) + - (ip_da->s_addr & 0xffff) + - htons(IPPROTO_TCP); - b4f->tsum = b4->tsum; - } else { - b4f->psum = b4->psum = tcp4_l2_buf[0].psum; - b4f->tsum = b4->tsum = tcp4_l2_buf[0].tsum; - } } } } @@ -1047,15 +1027,16 @@ void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s, * tcp_sock4_iov_init() - Initialise scatter-gather L2 buffers for IPv4 sockets * @c: Execution context */ -static void tcp_sock4_iov_init(const struct ctx *c) +static void tcp_sock4_iov_init(struct ctx *c) { + struct iphdr iph = L2_BUF_IP4_INIT(IPPROTO_TCP); struct iovec *iov; int i; for (i = 0; i < ARRAY_SIZE(tcp4_l2_buf); i++) { tcp4_l2_buf[i] = (struct tcp4_l2_buf_t) { .taph = TAP_HDR_INIT(ETH_P_IP), - .iph = L2_BUF_IP4_INIT(IPPROTO_TCP), + .iph = iph, .th = { .doff = sizeof(struct tcphdr) / 4, .ack = 1 } }; } diff --git a/udp.c b/udp.c index 47b4a61..b82aea5 100644 --- a/udp.c +++ b/udp.c @@ -168,7 +168,6 @@ static uint8_t udp_act[IP_VERSIONS][UDP_ACT_TYPE_MAX][DIV_ROUND_UP(NUM_PORTS, 8) /** * udp4_l2_buf_t - Pre-cooked IPv4 packet buffers for tap connections * @s_in: Source socket address, filled in by recvmmsg() - * @psum: Partial IP header checksum (excluding tot_len and saddr) * @taph: Tap-level headers (partially pre-filled) * @iph: Pre-filled IP header (except for tot_len and saddr) * @uh: Headroom for UDP header @@ -176,7 +175,6 @@ static uint8_t udp_act[IP_VERSIONS][UDP_ACT_TYPE_MAX][DIV_ROUND_UP(NUM_PORTS, 8) */ static struct udp4_l2_buf_t { struct sockaddr_in s_in; - uint32_t psum; struct tap_hdr taph; struct iphdr iph; @@ -263,11 +261,13 @@ static void udp_invert_portmap(struct udp_port_fwd *fwd) */ static void udp_update_check4(struct udp4_l2_buf_t *buf) { - uint32_t sum = buf->psum; + uint32_t sum = L2_BUF_IP4_PSUM(IPPROTO_UDP); sum += buf->iph.tot_len; sum += (buf->iph.saddr >> 16) & 0xffff; sum += buf->iph.saddr & 0xffff; + sum += (buf->iph.daddr >> 16) & 0xffff; + sum += buf->iph.daddr & 0xffff; buf->iph.check = (uint16_t)~csum_fold(sum); } @@ -292,14 +292,6 @@ void udp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s, if (ip_da) { b4->iph.daddr = ip_da->s_addr; - if (!i) { - b4->iph.saddr = 0; - b4->iph.tot_len = 0; - b4->iph.check = 0; - b4->psum = sum_16b(&b4->iph, 20); - } else { - b4->psum = udp4_l2_buf[0].psum; - } } } } diff --git a/util.h b/util.h index 26892aa..a1c3829 100644 --- a/util.h +++ b/util.h @@ -141,11 +141,13 @@ int do_clone(int (*fn)(void *), char *stack_area, size_t stack_size, int flags, .tot_len = 0, \ .id = 0, \ .frag_off = 0, \ - .ttl = 255, \ + .ttl = 0xff, \ .protocol = (proto), \ .saddr = 0, \ .daddr = 0, \ } +#define L2_BUF_IP4_PSUM(proto) ((uint32_t)htons_constant(0x4500) + \ + (uint32_t)htons_constant(0xff00 | (proto))) #define L2_BUF_IP6_INIT(proto) \ { \ -- 2.41.0
Because packets sent on the tap interface will always be going to the guest/namespace, we more-or-less know what address they'll be going to. So we pre-fill this destination address in our header buffers for IPv4. We can't do the same for IPv6 because we could need either the global or link-local address for the guest. In future we're going to want more flexibility for the destination address, so this pre-filling will get in the way. Change the flow so we always fill in the IPv4 destination address for each packet, rather than prefilling it from proto_update_l2_buf(). In fact for TCP we already redundantly filled the destination for each packet anyway. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- passt.c | 10 ++++------ passt.h | 4 ++-- pasta.c | 2 +- tap.c | 8 +++----- tcp.c | 8 +------- tcp.h | 3 +-- udp.c | 9 ++------- udp.h | 3 +-- 8 files changed, 15 insertions(+), 32 deletions(-) diff --git a/passt.c b/passt.c index 3b9b36b..05672eb 100644 --- a/passt.c +++ b/passt.c @@ -135,13 +135,11 @@ static void timer_init(struct ctx *c, const struct timespec *now) * proto_update_l2_buf() - Update scatter-gather L2 buffers in protocol handlers * @eth_d: Ethernet destination address, NULL if unchanged * @eth_s: Ethernet source address, NULL if unchanged - * @ip_da: Pointer to IPv4 destination address, NULL if unchanged */ -void proto_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s, - const struct in_addr *ip_da) +void proto_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s) { - tcp_update_l2_buf(eth_d, eth_s, ip_da); - udp_update_l2_buf(eth_d, eth_s, ip_da); + tcp_update_l2_buf(eth_d, eth_s); + udp_update_l2_buf(eth_d, eth_s); } /** @@ -265,7 +263,7 @@ int main(int argc, char **argv) if (!c.no_icmp) icmp_init(); - proto_update_l2_buf(c.mac_guest, c.mac, &c.ip4.addr); + proto_update_l2_buf(c.mac_guest, c.mac); if (c.ifi4 && !c.no_dhcp) dhcp_init(); diff --git a/passt.h b/passt.h index 96fd27b..a40cbda 100644 --- a/passt.h +++ b/passt.h @@ -267,7 +267,7 @@ struct ctx { int low_rmem; }; -void proto_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s, - const struct in_addr *ip_da); +void proto_update_l2_buf(const unsigned char *eth_d, + const unsigned char *eth_s); #endif /* PASST_H */ diff --git a/pasta.c b/pasta.c index 8c85546..3b73cb2 100644 --- a/pasta.c +++ b/pasta.c @@ -298,7 +298,7 @@ void pasta_ns_conf(struct ctx *c) nl_link(1, c->pasta_ifi, c->mac_guest, 0, 0); } - proto_update_l2_buf(c->mac_guest, NULL, NULL); + proto_update_l2_buf(c->mac_guest, NULL); } /** diff --git a/tap.c b/tap.c index 5e1daf8..8024c4b 100644 --- a/tap.c +++ b/tap.c @@ -624,10 +624,8 @@ resume: l4_len = l3_len - hlen; - if (iph->saddr && c->ip4.addr_seen.s_addr != iph->saddr) { + if (iph->saddr && c->ip4.addr_seen.s_addr != iph->saddr) c->ip4.addr_seen.s_addr = iph->saddr; - proto_update_l2_buf(NULL, NULL, &c->ip4.addr_seen); - } l4h = packet_get(in, i, sizeof(*eh) + hlen, l4_len, NULL); if (!l4h) @@ -950,7 +948,7 @@ redo: if (memcmp(c->mac_guest, eh->h_source, ETH_ALEN)) { memcpy(c->mac_guest, eh->h_source, ETH_ALEN); - proto_update_l2_buf(c->mac_guest, NULL, NULL); + proto_update_l2_buf(c->mac_guest, NULL); } switch (ntohs(eh->h_proto)) { @@ -1010,7 +1008,7 @@ restart: if (memcmp(c->mac_guest, eh->h_source, ETH_ALEN)) { memcpy(c->mac_guest, eh->h_source, ETH_ALEN); - proto_update_l2_buf(c->mac_guest, NULL, NULL); + proto_update_l2_buf(c->mac_guest, NULL); } switch (ntohs(eh->h_proto)) { diff --git a/tcp.c b/tcp.c index c0bffb3..ac7ae60 100644 --- a/tcp.c +++ b/tcp.c @@ -999,10 +999,8 @@ static void tcp_update_check_tcp6(struct tcp6_l2_buf_t *buf) * tcp_update_l2_buf() - Update L2 buffers with Ethernet and IPv4 addresses * @eth_d: Ethernet destination address, NULL if unchanged * @eth_s: Ethernet source address, NULL if unchanged - * @ip_da: Pointer to IPv4 destination address, NULL if unchanged */ -void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s, - const struct in_addr *ip_da) +void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s) { int i; @@ -1016,10 +1014,6 @@ void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s, tap_update_mac(&b6->taph, eth_d, eth_s); tap_update_mac(&b4f->taph, eth_d, eth_s); tap_update_mac(&b6f->taph, eth_d, eth_s); - - if (ip_da) { - b4f->iph.daddr = b4->iph.daddr = ip_da->s_addr; - } } } diff --git a/tcp.h b/tcp.h index 66a73eb..97de89e 100644 --- a/tcp.h +++ b/tcp.h @@ -24,8 +24,7 @@ void tcp_timer(struct ctx *c, const struct timespec *ts); void tcp_defer_handler(struct ctx *c); void tcp_sock_set_bufsize(const struct ctx *c, int s); -void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s, - const struct in_addr *ip_da); +void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s); /** * union tcp_epoll_ref - epoll reference portion for TCP connections diff --git a/udp.c b/udp.c index b82aea5..3262842 100644 --- a/udp.c +++ b/udp.c @@ -276,10 +276,8 @@ static void udp_update_check4(struct udp4_l2_buf_t *buf) * udp_update_l2_buf() - Update L2 buffers with Ethernet and IPv4 addresses * @eth_d: Ethernet destination address, NULL if unchanged * @eth_s: Ethernet source address, NULL if unchanged - * @ip_da: Pointer to IPv4 destination address, NULL if unchanged */ -void udp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s, - const struct in_addr *ip_da) +void udp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s) { int i; @@ -289,10 +287,6 @@ void udp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s, tap_update_mac(&b4->taph, eth_d, eth_s); tap_update_mac(&b6->taph, eth_d, eth_s); - - if (ip_da) { - b4->iph.daddr = ip_da->s_addr; - } } } @@ -579,6 +573,7 @@ static size_t udp_update_hdr4(const struct ctx *c, int n, in_port_t dstport, ip_len = udp4_l2_mh_sock[n].msg_len + sizeof(b->iph) + sizeof(b->uh); b->iph.tot_len = htons(ip_len); + b->iph.daddr = c->ip4.addr_seen.s_addr; src_port = ntohs(b->s_in.sin_port); diff --git a/udp.h b/udp.h index 060ae35..a3599b4 100644 --- a/udp.h +++ b/udp.h @@ -16,8 +16,7 @@ int udp_sock_init(const struct ctx *c, int ns, sa_family_t af, const void *addr, const char *ifname, in_port_t port); int udp_init(struct ctx *c); void udp_timer(struct ctx *c, const struct timespec *ts); -void udp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s, - const struct in_addr *ip_da); +void udp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s); /** * union udp_epoll_ref - epoll reference portion for TCP connections -- 2.41.0
Currently the only address we explicitly track in the TCP connection structure is the tap side forwarding address - that is the remote address from the guest's point of view. The tap side correspondent address - the local address from the guest's point of view - is assumed to always be one of the handful of guest addresses we track as addr_seen (one each for IPv4, IPv6 global and IPv6 link-local). We want to generalize our forwarding model to allow the guest to have multiple addresses. As a start on this, track the tap-side correspondent address in the connection structure, only using one of the addr_seen variables when we start a new connection. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- tcp.c | 37 +++++++++++++++++-------------------- tcp_conn.h | 2 ++ 2 files changed, 19 insertions(+), 20 deletions(-) diff --git a/tcp.c b/tcp.c index ac7ae60..6c4d71e 100644 --- a/tcp.c +++ b/tcp.c @@ -1449,7 +1449,7 @@ do { \ ip_len = plen + sizeof(struct iphdr) + sizeof(struct tcphdr); b->iph.tot_len = htons(ip_len); b->iph.saddr = a4->s_addr; - b->iph.daddr = c->ip4.addr_seen.s_addr; + b->iph.daddr = inany_v4(&conn->caddr)->s_addr; if (check) b->iph.check = *check; @@ -1468,10 +1468,7 @@ do { \ b->ip6h.payload_len = htons(plen + sizeof(struct tcphdr)); b->ip6h.saddr = conn->faddr.a6; - if (IN6_IS_ADDR_LINKLOCAL(&b->ip6h.saddr)) - b->ip6h.daddr = c->ip6.addr_ll_seen; - else - b->ip6h.daddr = c->ip6.addr_seen; + b->ip6h.daddr = conn->caddr.a6; memset(b->ip6h.flow_lbl, 0, 3); @@ -1820,13 +1817,12 @@ static void tcp_clamp_window(const struct ctx *c, struct tcp_tap_conn *conn, /** * tcp_seq_init() - Calculate initial sequence number according to RFC 6528 * @c: Execution context - * @conn: TCP connection, with faddr, fport and cport populated + * @conn: TCP connection, with faddr, fport, caddr, cport populated * @now: Current timestamp */ static void tcp_seq_init(const struct ctx *c, struct tcp_tap_conn *conn, const struct timespec *now) { - union inany_addr aany; struct { union inany_addr src; in_port_t srcport; @@ -1835,16 +1831,11 @@ static void tcp_seq_init(const struct ctx *c, struct tcp_tap_conn *conn, } __attribute__((__packed__)) in = { .src = conn->faddr, .srcport = conn->cport, + .dst = conn->caddr, .dstport = conn->fport, }; uint32_t ns, seq = 0; - if (CONN_V4(conn)) - inany_from_af(&aany, AF_INET, &c->ip4.addr); - else - inany_from_af(&aany, AF_INET6, &c->ip6.addr); - in.dst = aany; - seq = siphash_36b((uint8_t *)&in, c->tcp.hash_secret); /* 32ns ticks, overflows 32 bits every 137s */ @@ -2011,8 +2002,6 @@ static void tcp_conn_from_tap(struct ctx *c, socklen_t sl; int s, mss; - (void)saddr; - if (c->tcp.conn_count >= TCP_MAX_CONNS) return; @@ -2061,6 +2050,9 @@ static void tcp_conn_from_tap(struct ctx *c, conn->wnd_from_tap = 1; inany_from_af(&conn->faddr, af, daddr); + inany_from_af(&conn->caddr, af, saddr); + conn->fport = ntohs(th->dest); + conn->cport = ntohs(th->source); if (af == AF_INET) { sa = (struct sockaddr *)&addr4; @@ -2070,9 +2062,6 @@ static void tcp_conn_from_tap(struct ctx *c, sl = sizeof(addr6); } - conn->fport = ntohs(th->dest); - conn->cport = ntohs(th->source); - conn->seq_init_from_tap = ntohl(th->seq); conn->seq_from_tap = conn->seq_init_from_tap + 1; conn->seq_ack_to_tap = conn->seq_from_tap; @@ -2731,10 +2720,18 @@ static void tcp_tap_conn_from_sock(struct ctx *c, union epoll_ref ref, conn_event(c, conn, SOCK_ACCEPTED); inany_from_sockaddr(&conn->faddr, &conn->fport, sa); - conn->cport = ref.r.p.tcp.tcp.index; - tcp_snat_inbound(c, &conn->faddr); + if (CONN_V4(conn)) { + inany_from_af(&conn->caddr, AF_INET, &c->ip4.addr_seen); + } else { + if (IN6_IS_ADDR_LINKLOCAL(&conn->faddr.a6)) + conn->caddr.a6 = c->ip6.addr_ll_seen; + else + conn->caddr.a6 = c->ip6.addr_seen; + } + conn->cport = ref.r.p.tcp.tcp.index; + tcp_seq_init(c, conn, now); tcp_hash_insert(c, conn); diff --git a/tcp_conn.h b/tcp_conn.h index ba2a1ef..9151c18 100644 --- a/tcp_conn.h +++ b/tcp_conn.h @@ -35,6 +35,7 @@ extern const char *tcp_common_flag_str[]; * @ws_to_tap: Window scaling factor advertised to tap/guest * @sndbuf: Sending buffer in kernel, rounded to 2 ^ SNDBUF_BITS * @seq_dup_ack_approx: Last duplicate ACK number sent to tap + * @caddr: Guest side correspondent address (guest's local address) * @faddr: Guest side forwarding address (guest's remote address) * @cport: Guest side correspondent port (guest's local port) * @fport: Guest side forwarding port (guest's remote port) @@ -105,6 +106,7 @@ struct tcp_tap_conn { uint8_t seq_dup_ack_approx; + union inany_addr caddr; union inany_addr faddr; in_port_t cport; in_port_t fport; -- 2.41.0
For TCP tap connections we keep track of both the IP address and port for each side of a connection as seen by the guest. We're planning to track similar information in a number of other places as well. To assist with this, create a new structure: struct demiflow to track both sides of a connection or other logical packet flow as seen from a single "side" of passt. Also add a small helper function for initializing this structure. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- flow.h | 41 ++++++++++++++++++++++++++++++++++++ tcp.c | 59 ++++++++++++++++++++++++++-------------------------- tcp_conn.h | 11 ++-------- tcp_splice.c | 1 + 4 files changed, 74 insertions(+), 38 deletions(-) create mode 100644 flow.h diff --git a/flow.h b/flow.h new file mode 100644 index 0000000..f7c0981 --- /dev/null +++ b/flow.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later + * Copyright Red Hat + * Author: David Gibson <david(a)gibson.dropbear.id.au> + * + * Tracking for logical "flows" of packets. + */ +#ifndef FLOW_H +#define FLOW_H + +/** + * struct demiflow - Describes a logical packet flow as seen from one "side" + * @caddr: Correspondent address (remote address from passt's PoV) + * @faddr: Forwarding address (local address from passt's PoV) + * @cport: Correspondent port + * @fport: Forwarding port + */ +struct demiflow { + union inany_addr faddr; + union inany_addr caddr; + in_port_t fport, cport; +}; + +/** demiflow_from_af - Initialize a demiflow from addresses + * @df: demiflow to initialize + * @af: Address family for @faddr and @caddr + * @faddr: Forwarding address (pointer to in_addr or in6_addr) + * @fport: Forwarding port + * @caddr: Correspondent address (pointer to in_addr or in6_addr) + * @cport: Correspondent port + */ +static inline void demiflow_from_af(struct demiflow *df, int af, + const void *faddr, in_port_t fport, + const void *caddr, in_port_t cport) +{ + inany_from_af(&df->faddr, af, faddr); + inany_from_af(&df->caddr, af, caddr); + df->fport = fport; + df->cport = cport; +} + +#endif /* FLOW_H */ diff --git a/tcp.c b/tcp.c index 6c4d71e..c1875c3 100644 --- a/tcp.c +++ b/tcp.c @@ -302,6 +302,7 @@ #include "tcp_splice.h" #include "log.h" #include "inany.h" +#include "flow.h" #include "tcp_conn.h" @@ -399,7 +400,7 @@ struct tcp6_l2_head { /* For MSS6 macro: keep in sync with tcp6_l2_buf_t */ #define OPT_SACK 5 #define OPT_TS 8 -#define CONN_V4(conn) (!!inany_v4(&(conn)->faddr)) +#define CONN_V4(conn) (!!inany_v4(&(conn)->tapflow.faddr)) #define CONN_V6(conn) (!CONN_V4(conn)) #define CONN_IS_CLOSING(conn) \ ((conn->events & ESTABLISHED) && \ @@ -852,7 +853,7 @@ static int tcp_rtt_dst_low(const struct tcp_tap_conn *conn) int i; for (i = 0; i < LOW_RTT_TABLE_SIZE; i++) - if (inany_equals(&conn->faddr, low_rtt_dst + i)) + if (inany_equals(&conn->tapflow.faddr, low_rtt_dst + i)) return 1; return 0; @@ -874,7 +875,7 @@ static void tcp_rtt_dst_check(const struct tcp_tap_conn *conn, return; for (i = 0; i < LOW_RTT_TABLE_SIZE; i++) { - if (inany_equals(&conn->faddr, low_rtt_dst + i)) + if (inany_equals(&conn->tapflow.faddr, low_rtt_dst + i)) return; if (hole == -1 && IN6_IS_ADDR_UNSPECIFIED(low_rtt_dst + i)) hole = i; @@ -886,7 +887,7 @@ static void tcp_rtt_dst_check(const struct tcp_tap_conn *conn, if (hole == -1) return; - low_rtt_dst[hole++] = conn->faddr; + low_rtt_dst[hole++] = conn->tapflow.faddr; if (hole == LOW_RTT_TABLE_SIZE) hole = 0; inany_from_af(low_rtt_dst + hole, AF_INET6, &in6addr_any); @@ -1151,8 +1152,8 @@ static int tcp_hash_match(const struct tcp_tap_conn *conn, const union inany_addr *faddr, in_port_t cport, in_port_t fport) { - if (inany_equals(&conn->faddr, faddr) && - conn->cport == cport && conn->fport == fport) + if (inany_equals(&conn->tapflow.faddr, faddr) && + conn->tapflow.cport == cport && conn->tapflow.fport == fport) return 1; return 0; @@ -1194,7 +1195,8 @@ static unsigned int tcp_hash(const struct ctx *c, const union inany_addr *faddr, static unsigned int tcp_conn_hash(const struct ctx *c, const struct tcp_tap_conn *conn) { - return tcp_hash(c, &conn->faddr, conn->cport, conn->fport); + return tcp_hash(c, &conn->tapflow.faddr, + conn->tapflow.cport, conn->tapflow.fport); } /** @@ -1206,7 +1208,8 @@ static void tcp_hash_insert(const struct ctx *c, struct tcp_tap_conn *conn) { int b; - b = tcp_hash(c, &conn->faddr, conn->cport, conn->fport); + b = tcp_hash(c, &conn->tapflow.faddr, + conn->tapflow.cport, conn->tapflow.fport); conn->next_index = tc_hash[b] ? CONN_IDX(tc_hash[b]) : -1; tc_hash[b] = conn; @@ -1425,13 +1428,13 @@ static size_t tcp_l2_buf_fill_headers(const struct ctx *c, void *p, size_t plen, const uint16_t *check, uint32_t seq) { - const struct in_addr *a4 = inany_v4(&conn->faddr); + const struct in_addr *a4 = inany_v4(&conn->tapflow.faddr); size_t ip_len, tlen; #define SET_TCP_HEADER_COMMON_V4_V6(b, conn, seq) \ do { \ - b->th.source = htons(conn->fport); \ - b->th.dest = htons(conn->cport); \ + b->th.source = htons(conn->tapflow.fport); \ + b->th.dest = htons(conn->tapflow.cport); \ b->th.seq = htonl(seq); \ b->th.ack_seq = htonl(conn->seq_ack_to_tap); \ if (conn->events & ESTABLISHED) { \ @@ -1449,7 +1452,7 @@ do { \ ip_len = plen + sizeof(struct iphdr) + sizeof(struct tcphdr); b->iph.tot_len = htons(ip_len); b->iph.saddr = a4->s_addr; - b->iph.daddr = inany_v4(&conn->caddr)->s_addr; + b->iph.daddr = inany_v4(&conn->tapflow.caddr)->s_addr; if (check) b->iph.check = *check; @@ -1467,8 +1470,8 @@ do { \ ip_len = plen + sizeof(struct ipv6hdr) + sizeof(struct tcphdr); b->ip6h.payload_len = htons(plen + sizeof(struct tcphdr)); - b->ip6h.saddr = conn->faddr.a6; - b->ip6h.daddr = conn->caddr.a6; + b->ip6h.saddr = conn->tapflow.faddr.a6; + b->ip6h.daddr = conn->tapflow.caddr.a6; memset(b->ip6h.flow_lbl, 0, 3); @@ -1829,10 +1832,10 @@ static void tcp_seq_init(const struct ctx *c, struct tcp_tap_conn *conn, union inany_addr dst; in_port_t dstport; } __attribute__((__packed__)) in = { - .src = conn->faddr, - .srcport = conn->cport, - .dst = conn->caddr, - .dstport = conn->fport, + .src = conn->tapflow.faddr, + .srcport = conn->tapflow.cport, + .dst = conn->tapflow.caddr, + .dstport = conn->tapflow.fport, }; uint32_t ns, seq = 0; @@ -2049,10 +2052,8 @@ static void tcp_conn_from_tap(struct ctx *c, if (!(conn->wnd_from_tap = (htons(th->window) >> conn->ws_from_tap))) conn->wnd_from_tap = 1; - inany_from_af(&conn->faddr, af, daddr); - inany_from_af(&conn->caddr, af, saddr); - conn->fport = ntohs(th->dest); - conn->cport = ntohs(th->source); + demiflow_from_af(&conn->tapflow, af, daddr, ntohs(th->dest), + saddr, ntohs(th->source)); if (af == AF_INET) { sa = (struct sockaddr *)&addr4; @@ -2719,18 +2720,18 @@ static void tcp_tap_conn_from_sock(struct ctx *c, union epoll_ref ref, conn->ws_to_tap = conn->ws_from_tap = 0; conn_event(c, conn, SOCK_ACCEPTED); - inany_from_sockaddr(&conn->faddr, &conn->fport, sa); - tcp_snat_inbound(c, &conn->faddr); + inany_from_sockaddr(&conn->tapflow.faddr, &conn->tapflow.fport, sa); + tcp_snat_inbound(c, &conn->tapflow.faddr); if (CONN_V4(conn)) { - inany_from_af(&conn->caddr, AF_INET, &c->ip4.addr_seen); + inany_from_af(&conn->tapflow.caddr, AF_INET, &c->ip4.addr_seen); } else { - if (IN6_IS_ADDR_LINKLOCAL(&conn->faddr.a6)) - conn->caddr.a6 = c->ip6.addr_ll_seen; + if (IN6_IS_ADDR_LINKLOCAL(&conn->tapflow.faddr.a6)) + conn->tapflow.caddr.a6 = c->ip6.addr_ll_seen; else - conn->caddr.a6 = c->ip6.addr_seen; + conn->tapflow.caddr.a6 = c->ip6.addr_seen; } - conn->cport = ref.r.p.tcp.tcp.index; + conn->tapflow.cport = ref.r.p.tcp.tcp.index; tcp_seq_init(c, conn, now); tcp_hash_insert(c, conn); diff --git a/tcp_conn.h b/tcp_conn.h index 9151c18..92d4637 100644 --- a/tcp_conn.h +++ b/tcp_conn.h @@ -35,10 +35,7 @@ extern const char *tcp_common_flag_str[]; * @ws_to_tap: Window scaling factor advertised to tap/guest * @sndbuf: Sending buffer in kernel, rounded to 2 ^ SNDBUF_BITS * @seq_dup_ack_approx: Last duplicate ACK number sent to tap - * @caddr: Guest side correspondent address (guest's local address) - * @faddr: Guest side forwarding address (guest's remote address) - * @cport: Guest side correspondent port (guest's local port) - * @fport: Guest side forwarding port (guest's remote port) + * @tapflow: Tap(guest)-side demiflow * @wnd_from_tap: Last window size from tap, unscaled (as received) * @wnd_to_tap: Sending window advertised to tap, unscaled (as sent) * @seq_to_tap: Next sequence for packets to tap @@ -105,11 +102,7 @@ struct tcp_tap_conn { uint8_t seq_dup_ack_approx; - - union inany_addr caddr; - union inany_addr faddr; - in_port_t cport; - in_port_t fport; + struct demiflow tapflow; uint16_t wnd_from_tap; uint16_t wnd_to_tap; diff --git a/tcp_splice.c b/tcp_splice.c index 71256b0..a1aeff7 100644 --- a/tcp_splice.c +++ b/tcp_splice.c @@ -53,6 +53,7 @@ #include "log.h" #include "tcp_splice.h" #include "inany.h" +#include "flow.h" #include "tcp_conn.h" -- 2.41.0
Currently we match TCP packets received on the tap connection to a TCP connection via a hash table based on the forwarding address and both ports. We hope in future to allow for multiple guest side addresses, which means we may need to distinguish based on the correspondent address as well. Extend the hash function to include this information. Since this now exactly corresponds to the contents of the guest-side demiflow, we can base our hash functions on that, rather than a group of individual parameters. We also put some of the helpers in flow.h, because we hope to be able to re-use the hashing logic for other cases in future as well. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- flow.h | 25 ++++++++++++++++++++ siphash.c | 1 + tcp.c | 65 +++++++++++++--------------------------------------- tcp_splice.c | 1 + 4 files changed, 43 insertions(+), 49 deletions(-) diff --git a/flow.h b/flow.h index f7c0981..bb8e314 100644 --- a/flow.h +++ b/flow.h @@ -38,4 +38,29 @@ static inline void demiflow_from_af(struct demiflow *df, int af, df->cport = cport; } +/** + * demiflow_eq() - Check if two demiflows are equal + * @left, @right: Demiflows to compare + * + * Return: true if equal, false otherwise + */ +static inline bool demiflow_eq(const struct demiflow *left, + const struct demiflow *right) +{ + return memcmp(left, right, sizeof(struct demiflow)) == 0; +} + +/** + * demiflow_hash() - Calculate hash value for a demiflow + * @df: Demiflow + * @k: Hash secret (128-bits as array of 2 64-bit words) + * + * Return: hash value + */ +static inline unsigned int demiflow_hash(const struct demiflow *df, + const uint64_t *k) +{ + return siphash_36b((uint8_t *)df, k); +} + #endif /* FLOW_H */ diff --git a/siphash.c b/siphash.c index e266e15..1f424d8 100644 --- a/siphash.c +++ b/siphash.c @@ -163,6 +163,7 @@ uint32_t siphash_12b(const uint8_t *in, const uint64_t *k) */ /* NOLINTNEXTLINE(clang-diagnostic-unknown-attributes) */ __attribute__((optimize("-fno-strict-aliasing"))) /* See siphash_8b() */ +/* cppcheck-suppress unusedFunction */ uint64_t siphash_20b(const uint8_t *in, const uint64_t *k) { uint32_t *in32 = (uint32_t *)in; diff --git a/tcp.c b/tcp.c index c1875c3..92aa956 100644 --- a/tcp.c +++ b/tcp.c @@ -1140,49 +1140,15 @@ static int tcp_opt_get(const char *opts, size_t len, uint8_t type_find, } /** - * tcp_hash_match() - Check if a connection entry matches address and ports - * @conn: Connection entry to match against - * @faddr: Guest side forwarding address - * @cport: Guest side correspondent port - * @fport: Guest side forwarding port - * - * Return: 1 on match, 0 otherwise - */ -static int tcp_hash_match(const struct tcp_tap_conn *conn, - const union inany_addr *faddr, - in_port_t cport, in_port_t fport) -{ - if (inany_equals(&conn->tapflow.faddr, faddr) && - conn->tapflow.cport == cport && conn->tapflow.fport == fport) - return 1; - - return 0; -} - -/** - * tcp_hash() - Calculate hash value for connection given address and ports + * tcp_hash() - Calculate hash value for a TCP guest-side demiflow * @c: Execution context - * @faddr: Guest side forwarding address - * @cport: Guest side correspondent port - * @fport: Guest side forwarding port + * @df: Guest side demiflow * * Return: hash value, already modulo size of the hash table */ -static unsigned int tcp_hash(const struct ctx *c, const union inany_addr *faddr, - in_port_t cport, in_port_t fport) +static unsigned int tcp_hash(const struct ctx *c, const struct demiflow *df) { - struct { - union inany_addr faddr; - in_port_t cport; - in_port_t fport; - } __attribute__((__packed__)) in = { - *faddr, cport, fport - }; - uint64_t b = 0; - - b = siphash_20b((uint8_t *)&in, c->tcp.hash_secret); - - return (unsigned int)(b % TCP_HASH_TABLE_SIZE); + return demiflow_hash(df, c->tcp.hash_secret) % TCP_HASH_TABLE_SIZE; } /** @@ -1195,8 +1161,7 @@ static unsigned int tcp_hash(const struct ctx *c, const union inany_addr *faddr, static unsigned int tcp_conn_hash(const struct ctx *c, const struct tcp_tap_conn *conn) { - return tcp_hash(c, &conn->tapflow.faddr, - conn->tapflow.cport, conn->tapflow.fport); + return tcp_hash(c, &conn->tapflow); } /** @@ -1208,8 +1173,7 @@ static void tcp_hash_insert(const struct ctx *c, struct tcp_tap_conn *conn) { int b; - b = tcp_hash(c, &conn->tapflow.faddr, - conn->tapflow.cport, conn->tapflow.fport); + b = tcp_hash(c, &conn->tapflow); conn->next_index = tc_hash[b] ? CONN_IDX(tc_hash[b]) : -1; tc_hash[b] = conn; @@ -1278,24 +1242,26 @@ static void tcp_tap_conn_update(struct ctx *c, struct tcp_tap_conn *old, * tcp_hash_lookup() - Look up connection given remote address and ports * @c: Execution context * @af: Address family, AF_INET or AF_INET6 + * @caddr: Guest side correspondent address (guest local address) * @faddr: Guest side forwarding address (guest remote address) * @cport: Guest side correspondent port (guest local port) * @fport: Guest side forwarding port (guest remote port) * * Return: connection pointer, if found, -ENOENT otherwise */ -static struct tcp_tap_conn *tcp_hash_lookup(const struct ctx *c, - int af, const void *faddr, +static struct tcp_tap_conn *tcp_hash_lookup(const struct ctx *c, int af, + const void *caddr, const void *faddr, in_port_t cport, in_port_t fport) { - union inany_addr aany; + struct demiflow df; struct tcp_tap_conn *conn; int b; - inany_from_af(&aany, af, faddr); - b = tcp_hash(c, &aany, cport, fport); + demiflow_from_af(&df, af, faddr, fport, caddr, cport); + + b = tcp_hash(c, &df); for (conn = tc_hash[b]; conn; conn = conn_at_idx(conn->next_index)) { - if (tcp_hash_match(conn, &aany, cport, fport)) + if (demiflow_eq(&conn->tapflow, &df)) return conn; } @@ -2556,7 +2522,8 @@ int tcp_tap_handler(struct ctx *c, int af, const void *saddr, const void *daddr, optlen = MIN(optlen, ((1UL << 4) /* from doff width */ - 6) * 4UL); opts = packet_get(p, 0, sizeof(*th), optlen, NULL); - conn = tcp_hash_lookup(c, af, daddr, htons(th->source), htons(th->dest)); + conn = tcp_hash_lookup(c, af, saddr, daddr, + htons(th->source), htons(th->dest)); /* New connection from tap */ if (!conn) { diff --git a/tcp_splice.c b/tcp_splice.c index a1aeff7..15abc0c 100644 --- a/tcp_splice.c +++ b/tcp_splice.c @@ -51,6 +51,7 @@ #include "util.h" #include "passt.h" #include "log.h" +#include "siphash.h" #include "tcp_splice.h" #include "inany.h" #include "flow.h" -- 2.41.0