We want to use the same sockets to listen on both IPv4 and IPv6 addresses (at least on Linux which allows this). This isn't too complicated conceptually, but to make it work we need to unify handling of v4 and v6 connections more than we currently have. This isn't really ready to go, but I'm posting for reference. It's divided into rather a lot of steps, because I kept hitting problems and not being sure exactly what change had broken things. Based on my earlier series making handling of IPv4 addresses endian safer. David Gibson (10): tcp: no v6 flag in ref tcp: Helper to encode IPv4-mapped IPv6 addresses tcp: Partially unify IPv4 and IPv6 paths in tcp_hash_match() tcp: Hash IPv4 and IPv4-mapped-IPv6 addresses the same tcp: Take tcp_hash_insert() address from struct tcp_conn tcp: Unify IPv4 and IPv6 paths for hashing and matching tcp: Remove ugly address union from struct tcp_conn tcp: Unify initial sequence numbers for IPv4 and IPv6 tcp: Have tcp_seq_init() take its parameters from struct tcp_conn tcp: Fix small error in tcp_seq_init() time handling siphash.c | 2 + tcp.c | 202 ++++++++++++++++++--------------------------------- tcp.h | 1 - tcp_splice.c | 13 ++-- util.c | 33 +++++++++ util.h | 2 + 6 files changed, 116 insertions(+), 137 deletions(-) -- 2.38.1
--- tcp.c | 9 ++++----- tcp.h | 1 - tcp_splice.c | 13 +++++++------ 3 files changed, 11 insertions(+), 12 deletions(-) diff --git a/tcp.c b/tcp.c index 713248f..3d48d6e 100644 --- a/tcp.c +++ b/tcp.c @@ -761,8 +761,7 @@ static int tcp_epoll_ctl(const struct ctx *c, struct tcp_conn *conn) { int m = (conn->flags & IN_EPOLL) ? EPOLL_CTL_MOD : EPOLL_CTL_ADD; union epoll_ref ref = { .r.proto = IPPROTO_TCP, .r.s = conn->sock, - .r.p.tcp.tcp.index = conn - tc, - .r.p.tcp.tcp.v6 = CONN_V6(conn) }; + .r.p.tcp.tcp.index = conn - tc, }; struct epoll_event ev = { .data.u64 = ref.u64 }; if (conn->events == CLOSED) { @@ -2857,6 +2856,7 @@ static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref, if (c->tcp.conn_count >= TCP_MAX_CONNS) return; + sa.ss_family = AF_UNSPEC; /* FIXME: stop clang-tidy complaining */ sl = sizeof(sa); s = accept4(ref.r.s, (struct sockaddr *)&sa, &sl, SOCK_NONBLOCK); if (s < 0) @@ -2868,7 +2868,7 @@ static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref, conn->ws_to_tap = conn->ws_from_tap = 0; conn_event(c, conn, SOCK_ACCEPTED); - if (ref.r.p.tcp.tcp.v6) { + if (sa.ss_family == AF_INET6) { struct sockaddr_in6 sa6; memcpy(&sa6, &sa, sizeof(sa6)); @@ -3147,8 +3147,7 @@ static void tcp_sock_init6(const struct ctx *c, int ns, const struct in6_addr *addr, const char *ifname, in_port_t port) { - union tcp_epoll_ref tref = { .tcp.listen = 1, .tcp.outbound = ns, - .tcp.v6 = 1 }; + union tcp_epoll_ref tref = { .tcp.listen = 1, .tcp.outbound = ns, }; bool spliced = false, tap = true; int s; diff --git a/tcp.h b/tcp.h index 3fabb5a..85ac750 100644 --- a/tcp.h +++ b/tcp.h @@ -45,7 +45,6 @@ union tcp_epoll_ref { uint32_t listen:1, splice:1, outbound:1, - v6:1, timer:1, index:20; } tcp; diff --git a/tcp_splice.c b/tcp_splice.c index 99c5fa7..1f26ff4 100644 --- a/tcp_splice.c +++ b/tcp_splice.c @@ -211,12 +211,10 @@ static int tcp_splice_epoll_ctl(const struct ctx *c, int m = (conn->flags & IN_EPOLL) ? EPOLL_CTL_MOD : EPOLL_CTL_ADD; union epoll_ref ref_a = { .r.proto = IPPROTO_TCP, .r.s = conn->a, .r.p.tcp.tcp.splice = 1, - .r.p.tcp.tcp.index = conn - tc, - .r.p.tcp.tcp.v6 = CONN_V6(conn) }; + .r.p.tcp.tcp.index = conn - tc, }; union epoll_ref ref_b = { .r.proto = IPPROTO_TCP, .r.s = conn->b, .r.p.tcp.tcp.splice = 1, - .r.p.tcp.tcp.index = conn - tc, - .r.p.tcp.tcp.v6 = CONN_V6(conn) }; + .r.p.tcp.tcp.index = conn - tc, }; struct epoll_event ev_a = { .data.u64 = ref_a.u64 }; struct epoll_event ev_b = { .data.u64 = ref_b.u64 }; uint32_t events_a, events_b; @@ -579,12 +577,15 @@ void tcp_sock_handler_splice(struct ctx *c, union epoll_ref ref, struct tcp_splice_conn *conn; if (ref.r.p.tcp.tcp.listen) { + struct sockaddr sa; + socklen_t sl = sizeof(sa); int s; if (c->tcp.splice_conn_count >= TCP_SPLICE_MAX_CONNS) return; - if ((s = accept4(ref.r.s, NULL, NULL, SOCK_NONBLOCK)) < 0) + sa.sa_family = AF_UNSPEC; /* FIXME: stop clang-tidy complaining */ + if ((s = accept4(ref.r.s, &sa, &sl, SOCK_NONBLOCK)) < 0) return; if (setsockopt(s, SOL_TCP, TCP_QUICKACK, &((int){ 1 }), @@ -595,7 +596,7 @@ void tcp_sock_handler_splice(struct ctx *c, union epoll_ref ref, conn = CONN(c->tcp.splice_conn_count++); conn->a = s; - conn->flags = ref.r.p.tcp.tcp.v6 ? SOCK_V6 : 0; + conn->flags = (sa.sa_family == AF_INET6) ? SOCK_V6 : 0; if (tcp_splice_new(c, conn, ref.r.p.tcp.tcp.index, ref.r.p.tcp.tcp.outbound)) -- 2.38.1
On Fri, 4 Nov 2022 19:43:24 +1100 David Gibson <david(a)gibson.dropbear.id.au> wrote:--- tcp.c | 9 ++++----- tcp.h | 1 - tcp_splice.c | 13 +++++++------ 3 files changed, 11 insertions(+), 12 deletions(-) diff --git a/tcp.c b/tcp.c index 713248f..3d48d6e 100644 --- a/tcp.c +++ b/tcp.c @@ -761,8 +761,7 @@ static int tcp_epoll_ctl(const struct ctx *c, struct tcp_conn *conn) { int m = (conn->flags & IN_EPOLL) ? EPOLL_CTL_MOD : EPOLL_CTL_ADD; union epoll_ref ref = { .r.proto = IPPROTO_TCP, .r.s = conn->sock, - .r.p.tcp.tcp.index = conn - tc, - .r.p.tcp.tcp.v6 = CONN_V6(conn) }; + .r.p.tcp.tcp.index = conn - tc, }; struct epoll_event ev = { .data.u64 = ref.u64 }; if (conn->events == CLOSED) { @@ -2857,6 +2856,7 @@ static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref, if (c->tcp.conn_count >= TCP_MAX_CONNS) return; + sa.ss_family = AF_UNSPEC; /* FIXME: stop clang-tidy complaining */No need for /* FIXME */ here in my opinion, valgrind should also hit this -- perhaps move to initialiser though.sl = sizeof(sa); s = accept4(ref.r.s, (struct sockaddr *)&sa, &sl, SOCK_NONBLOCK); if (s < 0) @@ -2868,7 +2868,7 @@ static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref, conn->ws_to_tap = conn->ws_from_tap = 0; conn_event(c, conn, SOCK_ACCEPTED); - if (ref.r.p.tcp.tcp.v6) { + if (sa.ss_family == AF_INET6) { struct sockaddr_in6 sa6; memcpy(&sa6, &sa, sizeof(sa6)); @@ -3147,8 +3147,7 @@ static void tcp_sock_init6(const struct ctx *c, int ns, const struct in6_addr *addr, const char *ifname, in_port_t port) { - union tcp_epoll_ref tref = { .tcp.listen = 1, .tcp.outbound = ns, - .tcp.v6 = 1 }; + union tcp_epoll_ref tref = { .tcp.listen = 1, .tcp.outbound = ns, };Undocumented style rule: no comma at the end, it's post-modern C anyway.bool spliced = false, tap = true; int s; diff --git a/tcp.h b/tcp.h index 3fabb5a..85ac750 100644 --- a/tcp.h +++ b/tcp.h @@ -45,7 +45,6 @@ union tcp_epoll_ref { uint32_t listen:1, splice:1, outbound:1, - v6:1,Don't forget to update the comment above.timer:1, index:20; } tcp; diff --git a/tcp_splice.c b/tcp_splice.c index 99c5fa7..1f26ff4 100644 --- a/tcp_splice.c +++ b/tcp_splice.c @@ -211,12 +211,10 @@ static int tcp_splice_epoll_ctl(const struct ctx *c, int m = (conn->flags & IN_EPOLL) ? EPOLL_CTL_MOD : EPOLL_CTL_ADD; union epoll_ref ref_a = { .r.proto = IPPROTO_TCP, .r.s = conn->a, .r.p.tcp.tcp.splice = 1, - .r.p.tcp.tcp.index = conn - tc, - .r.p.tcp.tcp.v6 = CONN_V6(conn) }; + .r.p.tcp.tcp.index = conn - tc, }; union epoll_ref ref_b = { .r.proto = IPPROTO_TCP, .r.s = conn->b, .r.p.tcp.tcp.splice = 1, - .r.p.tcp.tcp.index = conn - tc, - .r.p.tcp.tcp.v6 = CONN_V6(conn) }; + .r.p.tcp.tcp.index = conn - tc, };Commas.struct epoll_event ev_a = { .data.u64 = ref_a.u64 }; struct epoll_event ev_b = { .data.u64 = ref_b.u64 }; uint32_t events_a, events_b; @@ -579,12 +577,15 @@ void tcp_sock_handler_splice(struct ctx *c, union epoll_ref ref, struct tcp_splice_conn *conn; if (ref.r.p.tcp.tcp.listen) { + struct sockaddr sa;Should this be sockaddr_storage in this case and then cast? I never remember.+ socklen_t sl = sizeof(sa); int s; if (c->tcp.splice_conn_count >= TCP_SPLICE_MAX_CONNS) return; - if ((s = accept4(ref.r.s, NULL, NULL, SOCK_NONBLOCK)) < 0) + sa.sa_family = AF_UNSPEC; /* FIXME: stop clang-tidy complaining */ + if ((s = accept4(ref.r.s, &sa, &sl, SOCK_NONBLOCK)) < 0)Same comment about /* FIXME */ as above. This adds a copy_to_user() and microseconds, but it's unavoidable. I tried harder to optimise for latency on the spliced path, that's why it was NULL here and not above.return; if (setsockopt(s, SOL_TCP, TCP_QUICKACK, &((int){ 1 }), @@ -595,7 +596,7 @@ void tcp_sock_handler_splice(struct ctx *c, union epoll_ref ref, conn = CONN(c->tcp.splice_conn_count++); conn->a = s; - conn->flags = ref.r.p.tcp.tcp.v6 ? SOCK_V6 : 0; + conn->flags = (sa.sa_family == AF_INET6) ? SOCK_V6 : 0; if (tcp_splice_new(c, conn, ref.r.p.tcp.tcp.index, ref.r.p.tcp.tcp.outbound))-- Stefano
On Mon, Nov 07, 2022 at 07:07:53PM +0100, Stefano Brivio wrote:On Fri, 4 Nov 2022 19:43:24 +1100 David Gibson <david(a)gibson.dropbear.id.au> wrote:"FIXME" maybe isn't quite the right sentiment. The issue here is I don't think this should be necessary. AFAIK sa.ss_family will be written on any succesful return from accept4(), it's just the clang-tidy doesn't seem to know that.--- tcp.c | 9 ++++----- tcp.h | 1 - tcp_splice.c | 13 +++++++------ 3 files changed, 11 insertions(+), 12 deletions(-) diff --git a/tcp.c b/tcp.c index 713248f..3d48d6e 100644 --- a/tcp.c +++ b/tcp.c @@ -761,8 +761,7 @@ static int tcp_epoll_ctl(const struct ctx *c, struct tcp_conn *conn) { int m = (conn->flags & IN_EPOLL) ? EPOLL_CTL_MOD : EPOLL_CTL_ADD; union epoll_ref ref = { .r.proto = IPPROTO_TCP, .r.s = conn->sock, - .r.p.tcp.tcp.index = conn - tc, - .r.p.tcp.tcp.v6 = CONN_V6(conn) }; + .r.p.tcp.tcp.index = conn - tc, }; struct epoll_event ev = { .data.u64 = ref.u64 }; if (conn->events == CLOSED) { @@ -2857,6 +2856,7 @@ static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref, if (c->tcp.conn_count >= TCP_MAX_CONNS) return; + sa.ss_family = AF_UNSPEC; /* FIXME: stop clang-tidy complaining */No need for /* FIXME */ here in my opinion,valgrind should also hit thisI'm not sure. Depends whether valgrind's knowledge of syscall behaviours has the same bug or not.-- perhaps move to initialiser though.Fixed.sl = sizeof(sa); s = accept4(ref.r.s, (struct sockaddr *)&sa, &sl, SOCK_NONBLOCK); if (s < 0) @@ -2868,7 +2868,7 @@ static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref, conn->ws_to_tap = conn->ws_from_tap = 0; conn_event(c, conn, SOCK_ACCEPTED); - if (ref.r.p.tcp.tcp.v6) { + if (sa.ss_family == AF_INET6) { struct sockaddr_in6 sa6; memcpy(&sa6, &sa, sizeof(sa6)); @@ -3147,8 +3147,7 @@ static void tcp_sock_init6(const struct ctx *c, int ns, const struct in6_addr *addr, const char *ifname, in_port_t port) { - union tcp_epoll_ref tref = { .tcp.listen = 1, .tcp.outbound = ns, - .tcp.v6 = 1 }; + union tcp_epoll_ref tref = { .tcp.listen = 1, .tcp.outbound = ns, };Undocumented style rule: no comma at the end, it's post-modern C anyway.Fixed.bool spliced = false, tap = true; int s; diff --git a/tcp.h b/tcp.h index 3fabb5a..85ac750 100644 --- a/tcp.h +++ b/tcp.h @@ -45,7 +45,6 @@ union tcp_epoll_ref { uint32_t listen:1, splice:1, outbound:1, - v6:1,Don't forget to update the comment above.Fixed.timer:1, index:20; } tcp; diff --git a/tcp_splice.c b/tcp_splice.c index 99c5fa7..1f26ff4 100644 --- a/tcp_splice.c +++ b/tcp_splice.c @@ -211,12 +211,10 @@ static int tcp_splice_epoll_ctl(const struct ctx *c, int m = (conn->flags & IN_EPOLL) ? EPOLL_CTL_MOD : EPOLL_CTL_ADD; union epoll_ref ref_a = { .r.proto = IPPROTO_TCP, .r.s = conn->a, .r.p.tcp.tcp.splice = 1, - .r.p.tcp.tcp.index = conn - tc, - .r.p.tcp.tcp.v6 = CONN_V6(conn) }; + .r.p.tcp.tcp.index = conn - tc, }; union epoll_ref ref_b = { .r.proto = IPPROTO_TCP, .r.s = conn->b, .r.p.tcp.tcp.splice = 1, - .r.p.tcp.tcp.index = conn - tc, - .r.p.tcp.tcp.v6 = CONN_V6(conn) }; + .r.p.tcp.tcp.index = conn - tc, };Commas.So, for this patch, no it doesn't, but it will have to change later, in fact. The trick here is that we only want the sa_family field, we don't need the full socket address. struct sockaddr has that (and maybe a little extra, but not much). accept4() will truncate the socket address when it writes, but that's ok for our purposes. Using the short form may mitigate the concern below because there's less data to copy_to_user(). Of course, when I change this later to add different handling for IPv4-mapped IPv6 source addresses we will need the full address, so this will need to be a sockaddr_in6.struct epoll_event ev_a = { .data.u64 = ref_a.u64 }; struct epoll_event ev_b = { .data.u64 = ref_b.u64 }; uint32_t events_a, events_b; @@ -579,12 +577,15 @@ void tcp_sock_handler_splice(struct ctx *c, union epoll_ref ref, struct tcp_splice_conn *conn; if (ref.r.p.tcp.tcp.listen) { + struct sockaddr sa;Should this be sockaddr_storage in this case and then cast? I never remember.-- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson+ socklen_t sl = sizeof(sa); int s; if (c->tcp.splice_conn_count >= TCP_SPLICE_MAX_CONNS) return; - if ((s = accept4(ref.r.s, NULL, NULL, SOCK_NONBLOCK)) < 0) + sa.sa_family = AF_UNSPEC; /* FIXME: stop clang-tidy complaining */ + if ((s = accept4(ref.r.s, &sa, &sl, SOCK_NONBLOCK)) < 0)Same comment about /* FIXME */ as above. This adds a copy_to_user() and microseconds, but it's unavoidable. I tried harder to optimise for latency on the spliced path, that's why it was NULL here and not above.return; if (setsockopt(s, SOL_TCP, TCP_QUICKACK, &((int){ 1 }), @@ -595,7 +596,7 @@ void tcp_sock_handler_splice(struct ctx *c, union epoll_ref ref, conn = CONN(c->tcp.splice_conn_count++); conn->a = s; - conn->flags = ref.r.p.tcp.tcp.v6 ? SOCK_V6 : 0; + conn->flags = (sa.sa_family == AF_INET6) ? SOCK_V6 : 0; if (tcp_splice_new(c, conn, ref.r.p.tcp.tcp.index, ref.r.p.tcp.tcp.outbound))
In the tcp_conn structure we have space for an IPv6 address, and use IPv4-mapped IPv6 addresses to describe IPv4 connections. We open code the construction of those IPv4-mapped address in two places. Avoid the duplication with a helper function. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- tcp.c | 9 ++------- util.c | 20 ++++++++++++++++++++ util.h | 1 + 3 files changed, 23 insertions(+), 7 deletions(-) diff --git a/tcp.c b/tcp.c index 3d48d6e..26dd268 100644 --- a/tcp.c +++ b/tcp.c @@ -2217,9 +2217,7 @@ static void tcp_conn_from_tap(struct ctx *c, int af, const void *addr, sa = (struct sockaddr *)&addr4; sl = sizeof(addr4); - memset(&conn->a.a4.zero, 0, sizeof(conn->a.a4.zero)); - memset(&conn->a.a4.one, 0xff, sizeof(conn->a.a4.one)); - memcpy(&conn->a.a4.a, addr, sizeof(conn->a.a4.a)); + encode_ip4mapped_ip6(&conn->a.a6, addr); } else { sa = (struct sockaddr *)&addr6; sl = sizeof(addr6); @@ -2902,15 +2900,12 @@ static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref, memcpy(&sa4, &sa, sizeof(sa4)); - memset(&conn->a.a4.zero, 0, sizeof(conn->a.a4.zero)); - memset(&conn->a.a4.one, 0xff, sizeof(conn->a.a4.one)); - if (IN4_IS_ADDR_LOOPBACK(&sa4.sin_addr) || IN4_IS_ADDR_UNSPECIFIED(&sa4.sin_addr) || IN4_ARE_ADDR_EQUAL(&sa4.sin_addr, &c->ip4.addr_seen)) sa4.sin_addr = c->ip4.gw; - conn->a.a4.a = sa4.sin_addr; + encode_ip4mapped_ip6(&conn->a.a6, &sa4.sin_addr); conn->sock_port = ntohs(sa4.sin_port); conn->tap_port = ref.r.p.tcp.tcp.index; diff --git a/util.c b/util.c index 514bd44..257d0b6 100644 --- a/util.c +++ b/util.c @@ -482,3 +482,23 @@ int write_file(const char *path, const char *buf) close(fd); return len == 0 ? 0 : -1; } + +struct ip4mapped_ip6 { + uint8_t zero[10]; + uint8_t one[2]; + struct in_addr a4; +}; + +/** + * encode_ip4mapped_ip6() - Convert an IPv4 address to an IPv4-mapped IPv6 address + * @ip6: Buffer to store the IPv4-mapped IPv6 address + * @ip4: IPv4 address, network order + */ +void encode_ip4mapped_ip6(struct in6_addr *ip6, const void *ip4) +{ + struct ip4mapped_ip6 *a = (struct ip4mapped_ip6 *)ip6; + + memset(&a->zero, 0, sizeof(a->zero)); + memset(&a->one, 0xff, sizeof(a->one)); + memcpy(&a->a4, ip4, sizeof(a->a4)); +} diff --git a/util.h b/util.h index 2d4e1ff..f7d6a6f 100644 --- a/util.h +++ b/util.h @@ -209,5 +209,6 @@ void write_pidfile(int fd, pid_t pid); int __daemon(int pidfile_fd, int devnull_fd); int fls(unsigned long x); int write_file(const char *path, const char *buf); +void encode_ip4mapped_ip6(struct in6_addr *ip6, const void *ip4); #endif /* UTIL_H */ -- 2.38.1
On Fri, 4 Nov 2022 19:43:25 +1100 David Gibson <david(a)gibson.dropbear.id.au> wrote:In the tcp_conn structure we have space for an IPv6 address, and use IPv4-mapped IPv6 addresses to describe IPv4 connections. We open code the construction of those IPv4-mapped address in two places. Avoid the duplication with a helper function.Much nicer, indeed.Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- tcp.c | 9 ++------- util.c | 20 ++++++++++++++++++++ util.h | 1 + 3 files changed, 23 insertions(+), 7 deletions(-) diff --git a/tcp.c b/tcp.c index 3d48d6e..26dd268 100644 --- a/tcp.c +++ b/tcp.c @@ -2217,9 +2217,7 @@ static void tcp_conn_from_tap(struct ctx *c, int af, const void *addr, sa = (struct sockaddr *)&addr4; sl = sizeof(addr4); - memset(&conn->a.a4.zero, 0, sizeof(conn->a.a4.zero)); - memset(&conn->a.a4.one, 0xff, sizeof(conn->a.a4.one)); - memcpy(&conn->a.a4.a, addr, sizeof(conn->a.a4.a)); + encode_ip4mapped_ip6(&conn->a.a6, addr); } else { sa = (struct sockaddr *)&addr6; sl = sizeof(addr6); @@ -2902,15 +2900,12 @@ static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref, memcpy(&sa4, &sa, sizeof(sa4)); - memset(&conn->a.a4.zero, 0, sizeof(conn->a.a4.zero)); - memset(&conn->a.a4.one, 0xff, sizeof(conn->a.a4.one)); - if (IN4_IS_ADDR_LOOPBACK(&sa4.sin_addr) || IN4_IS_ADDR_UNSPECIFIED(&sa4.sin_addr) || IN4_ARE_ADDR_EQUAL(&sa4.sin_addr, &c->ip4.addr_seen)) sa4.sin_addr = c->ip4.gw; - conn->a.a4.a = sa4.sin_addr; + encode_ip4mapped_ip6(&conn->a.a6, &sa4.sin_addr); conn->sock_port = ntohs(sa4.sin_port); conn->tap_port = ref.r.p.tcp.tcp.index; diff --git a/util.c b/util.c index 514bd44..257d0b6 100644 --- a/util.c +++ b/util.c @@ -482,3 +482,23 @@ int write_file(const char *path, const char *buf) close(fd); return len == 0 ? 0 : -1; } + +struct ip4mapped_ip6 { + uint8_t zero[10]; + uint8_t one[2]; + struct in_addr a4; +};Document fields even if they're obvious. Can we reuse this part in struct conn instead of using struct in6_addr there as you do later in 7/10? I don't have a strong preference though.+ +/** + * encode_ip4mapped_ip6() - Convert an IPv4 address to an IPv4-mapped IPv6 address + * @ip6: Buffer to store the IPv4-mapped IPv6 address + * @ip4: IPv4 address, network order + */ +void encode_ip4mapped_ip6(struct in6_addr *ip6, const void *ip4) +{ + struct ip4mapped_ip6 *a = (struct ip4mapped_ip6 *)ip6; + + memset(&a->zero, 0, sizeof(a->zero)); + memset(&a->one, 0xff, sizeof(a->one)); + memcpy(&a->a4, ip4, sizeof(a->a4)); +} diff --git a/util.h b/util.h index 2d4e1ff..f7d6a6f 100644 --- a/util.h +++ b/util.h @@ -209,5 +209,6 @@ void write_pidfile(int fd, pid_t pid); int __daemon(int pidfile_fd, int devnull_fd); int fls(unsigned long x); int write_file(const char *path, const char *buf); +void encode_ip4mapped_ip6(struct in6_addr *ip6, const void *ip4); #endif /* UTIL_H */-- Stefano
On Mon, Nov 07, 2022 at 07:08:19PM +0100, Stefano Brivio wrote:On Fri, 4 Nov 2022 19:43:25 +1100 David Gibson <david(a)gibson.dropbear.id.au> wrote:Yeah, we could. Originally I was trying to make this even more hidden, with the decode function returning a struct in_addr *, that would be NULL if the IPv6 address wasn't v4-mapped. Unfortunately I seemed to hit more weird compiler aliasing issues that caused hard to predict failures. Hrm... your comment gives me some thoughts on maybe a better way to do this, I'll look into it.In the tcp_conn structure we have space for an IPv6 address, and use IPv4-mapped IPv6 addresses to describe IPv4 connections. We open code the construction of those IPv4-mapped address in two places. Avoid the duplication with a helper function.Much nicer, indeed.Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- tcp.c | 9 ++------- util.c | 20 ++++++++++++++++++++ util.h | 1 + 3 files changed, 23 insertions(+), 7 deletions(-) diff --git a/tcp.c b/tcp.c index 3d48d6e..26dd268 100644 --- a/tcp.c +++ b/tcp.c @@ -2217,9 +2217,7 @@ static void tcp_conn_from_tap(struct ctx *c, int af, const void *addr, sa = (struct sockaddr *)&addr4; sl = sizeof(addr4); - memset(&conn->a.a4.zero, 0, sizeof(conn->a.a4.zero)); - memset(&conn->a.a4.one, 0xff, sizeof(conn->a.a4.one)); - memcpy(&conn->a.a4.a, addr, sizeof(conn->a.a4.a)); + encode_ip4mapped_ip6(&conn->a.a6, addr); } else { sa = (struct sockaddr *)&addr6; sl = sizeof(addr6); @@ -2902,15 +2900,12 @@ static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref, memcpy(&sa4, &sa, sizeof(sa4)); - memset(&conn->a.a4.zero, 0, sizeof(conn->a.a4.zero)); - memset(&conn->a.a4.one, 0xff, sizeof(conn->a.a4.one)); - if (IN4_IS_ADDR_LOOPBACK(&sa4.sin_addr) || IN4_IS_ADDR_UNSPECIFIED(&sa4.sin_addr) || IN4_ARE_ADDR_EQUAL(&sa4.sin_addr, &c->ip4.addr_seen)) sa4.sin_addr = c->ip4.gw; - conn->a.a4.a = sa4.sin_addr; + encode_ip4mapped_ip6(&conn->a.a6, &sa4.sin_addr); conn->sock_port = ntohs(sa4.sin_port); conn->tap_port = ref.r.p.tcp.tcp.index; diff --git a/util.c b/util.c index 514bd44..257d0b6 100644 --- a/util.c +++ b/util.c @@ -482,3 +482,23 @@ int write_file(const char *path, const char *buf) close(fd); return len == 0 ? 0 : -1; } + +struct ip4mapped_ip6 { + uint8_t zero[10]; + uint8_t one[2]; + struct in_addr a4; +};Document fields even if they're obvious. Can we reuse this part in struct conn instead of using struct in6_addr there as you do later in 7/10? I don't have a strong preference though.-- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson+ +/** + * encode_ip4mapped_ip6() - Convert an IPv4 address to an IPv4-mapped IPv6 address + * @ip6: Buffer to store the IPv4-mapped IPv6 address + * @ip4: IPv4 address, network order + */ +void encode_ip4mapped_ip6(struct in6_addr *ip6, const void *ip4) +{ + struct ip4mapped_ip6 *a = (struct ip4mapped_ip6 *)ip6; + + memset(&a->zero, 0, sizeof(a->zero)); + memset(&a->one, 0xff, sizeof(a->one)); + memcpy(&a->a4, ip4, sizeof(a->a4)); +} diff --git a/util.h b/util.h index 2d4e1ff..f7d6a6f 100644 --- a/util.h +++ b/util.h @@ -209,5 +209,6 @@ void write_pidfile(int fd, pid_t pid); int __daemon(int pidfile_fd, int devnull_fd); int fls(unsigned long x); int write_file(const char *path, const char *buf); +void encode_ip4mapped_ip6(struct in6_addr *ip6, const void *ip4); #endif /* UTIL_H */
When given an IPv4 address tcp_hash_match() checks if the connection stores an IPv4-mapped IPv6 address, and if so compares the mapped part of that address to the given address. This is equivalent to converting a given IPv4 address to an IPv4-mapped IPv6 address then comparing it to the connection address using the existing IPv6 logic. Convert to this slightly simpler form, which will also allow some further simplifications in future. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- tcp.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/tcp.c b/tcp.c index 26dd268..508d6e9 100644 --- a/tcp.c +++ b/tcp.c @@ -1285,13 +1285,14 @@ static int tcp_opt_get(const char *opts, size_t len, uint8_t type_find, static int tcp_hash_match(const struct tcp_conn *conn, int af, const void *addr, in_port_t tap_port, in_port_t sock_port) { - if (af == AF_INET && CONN_V4(conn) && - !memcmp(&conn->a.a4.a, addr, sizeof(conn->a.a4.a)) && - conn->tap_port == tap_port && conn->sock_port == sock_port) - return 1; + struct in6_addr v4mapped; + + if (af == AF_INET) { + encode_ip4mapped_ip6(&v4mapped, addr); + addr = &v4mapped; + } - if (af == AF_INET6 && - IN6_ARE_ADDR_EQUAL(&conn->a.a6, addr) && + if (IN6_ARE_ADDR_EQUAL(&conn->a.a6, addr) && conn->tap_port == tap_port && conn->sock_port == sock_port) return 1; -- 2.38.1
On Fri, 4 Nov 2022 19:43:26 +1100 David Gibson <david(a)gibson.dropbear.id.au> wrote:When given an IPv4 address tcp_hash_match() checks if the connection stores an IPv4-mapped IPv6 address, and if so compares the mapped part of that address to the given address. This is equivalent to converting a given IPv4 address to an IPv4-mapped IPv6 address then comparing it to the connection address using the existing IPv6 logic. Convert to this slightly simpler form, which will also allow some further simplifications in future. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- tcp.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/tcp.c b/tcp.c index 26dd268..508d6e9 100644 --- a/tcp.c +++ b/tcp.c @@ -1285,13 +1285,14 @@ static int tcp_opt_get(const char *opts, size_t len, uint8_t type_find, static int tcp_hash_match(const struct tcp_conn *conn, int af, const void *addr, in_port_t tap_port, in_port_t sock_port) { - if (af == AF_INET && CONN_V4(conn) && - !memcmp(&conn->a.a4.a, addr, sizeof(conn->a.a4.a)) && - conn->tap_port == tap_port && conn->sock_port == sock_port) - return 1; + struct in6_addr v4mapped; + + if (af == AF_INET) { + encode_ip4mapped_ip6(&v4mapped, addr); + addr = &v4mapped; + }It's probably worth the simplification, just mind that this replaces a 32-bit comparison with 128 bits of writes.- if (af == AF_INET6 && - IN6_ARE_ADDR_EQUAL(&conn->a.a6, addr) && + if (IN6_ARE_ADDR_EQUAL(&conn->a.a6, addr) && conn->tap_port == tap_port && conn->sock_port == sock_port) return 1;-- Stefano
On Mon, Nov 07, 2022 at 07:08:27PM +0100, Stefano Brivio wrote:On Fri, 4 Nov 2022 19:43:26 +1100 David Gibson <david(a)gibson.dropbear.id.au> wrote:I'm aware, however this is mitigated later in the series. Once I make tcp_hash_match() take a (possibly v4mapped) v6 address only it's just a 128-bit comparison, which we already effectively do split between CONN_V4() and the v4 specific path. Obviously that means constructing the v4mapped address in the caller, but we were already doing that in at least one of the cases.When given an IPv4 address tcp_hash_match() checks if the connection stores an IPv4-mapped IPv6 address, and if so compares the mapped part of that address to the given address. This is equivalent to converting a given IPv4 address to an IPv4-mapped IPv6 address then comparing it to the connection address using the existing IPv6 logic. Convert to this slightly simpler form, which will also allow some further simplifications in future. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- tcp.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/tcp.c b/tcp.c index 26dd268..508d6e9 100644 --- a/tcp.c +++ b/tcp.c @@ -1285,13 +1285,14 @@ static int tcp_opt_get(const char *opts, size_t len, uint8_t type_find, static int tcp_hash_match(const struct tcp_conn *conn, int af, const void *addr, in_port_t tap_port, in_port_t sock_port) { - if (af == AF_INET && CONN_V4(conn) && - !memcmp(&conn->a.a4.a, addr, sizeof(conn->a.a4.a)) && - conn->tap_port == tap_port && conn->sock_port == sock_port) - return 1; + struct in6_addr v4mapped; + + if (af == AF_INET) { + encode_ip4mapped_ip6(&v4mapped, addr); + addr = &v4mapped; + }It's probably worth the simplification, just mind that this replaces a 32-bit comparison with 128 bits of writes.-- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson- if (af == AF_INET6 && - IN6_ARE_ADDR_EQUAL(&conn->a.a6, addr) && + if (IN6_ARE_ADDR_EQUAL(&conn->a.a6, addr) && conn->tap_port == tap_port && conn->sock_port == sock_port) return 1;
In the tcp_conn structure, we represent IPv4 connections with IPv4-mapped IPv6 addresses in the single address field. However, we have different paths which will calculate different hashes for IPv4 and equivalent IPv4-mapped IPv6 addresses. This will cause problems for some future changes. Make the hash function work the same for these two cases, by always mapping IPv4 addresses into IPv4-mapped addresses before hashing. This involves some ugly temporaries, but later changes should improve this again. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- siphash.c | 1 + tcp.c | 33 +++++++++++++++------------------ 2 files changed, 16 insertions(+), 18 deletions(-) diff --git a/siphash.c b/siphash.c index 37a6d73..516a508 100644 --- a/siphash.c +++ b/siphash.c @@ -104,6 +104,7 @@ * * Return: the 64-bit hash output */ +/* cppcheck-suppress unusedFunction */ uint64_t siphash_8b(const uint8_t *in, const uint64_t *k) { PREAMBLE(8); diff --git a/tcp.c b/tcp.c index 508d6e9..4645004 100644 --- a/tcp.c +++ b/tcp.c @@ -1315,30 +1315,27 @@ __attribute__((__noinline__)) /* See comment in Makefile */ static unsigned int tcp_hash(const struct ctx *c, int af, const void *addr, in_port_t tap_port, in_port_t sock_port) { + struct { + struct in6_addr addr; + in_port_t tap_port; + in_port_t sock_port; + } __attribute__((__packed__)) in = { + .tap_port = tap_port, + .sock_port = sock_port, + }; uint64_t b = 0; if (af == AF_INET) { - struct { - struct in_addr addr; - in_port_t tap_port; - in_port_t sock_port; - } __attribute__((__packed__)) in = { - *(struct in_addr *)addr, tap_port, sock_port, - }; - - b = siphash_8b((uint8_t *)&in, c->tcp.hash_secret); - } else if (af == AF_INET6) { - struct { - struct in6_addr addr; - in_port_t tap_port; - in_port_t sock_port; - } __attribute__((__packed__)) in = { - *(struct in6_addr *)addr, tap_port, sock_port, - }; + struct in6_addr v4mapped; - b = siphash_20b((uint8_t *)&in, c->tcp.hash_secret); + encode_ip4mapped_ip6(&v4mapped, addr); + in.addr = v4mapped; + } else { + in.addr = *(struct in6_addr *)addr; } + b = siphash_20b((uint8_t *)&in, c->tcp.hash_secret); + return (unsigned int)(b % TCP_HASH_TABLE_SIZE); } -- 2.38.1
tcp_hash_insert() takes an address to control which hash bucket the connection will go into. For IPv6 connections that address is alreadys stored in struct tcp_conn. For IPv4 connections, an equivalent IPv4-mapped IPv6 address is stored in struct tcp_conn. Now that we've made the hashing of IPv4 and IPv4-mapped IPv6 addresses equivalent, we can simplify tcp_hash_insert() to use the address in struct tcp_conn, rather than taking it as an extra parameter. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- tcp.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/tcp.c b/tcp.c index 4645004..dfa73a3 100644 --- a/tcp.c +++ b/tcp.c @@ -1343,15 +1343,12 @@ static unsigned int tcp_hash(const struct ctx *c, int af, const void *addr, * tcp_hash_insert() - Insert connection into hash table, chain link * @c: Execution context * @conn: Connection pointer - * @af: Address family, AF_INET or AF_INET6 - * @addr: Remote address, pointer to in_addr or in6_addr */ -static void tcp_hash_insert(const struct ctx *c, struct tcp_conn *conn, - int af, const void *addr) +static void tcp_hash_insert(const struct ctx *c, struct tcp_conn *conn) { int b; - b = tcp_hash(c, af, addr, conn->tap_port, conn->sock_port); + b = tcp_hash(c, AF_INET6, &conn->a.a6, conn->tap_port, conn->sock_port); conn->next_index = tc_hash[b] ? tc_hash[b] - tc : -1; tc_hash[b] = conn; conn->hash_bucket = b; @@ -2233,7 +2230,7 @@ static void tcp_conn_from_tap(struct ctx *c, int af, const void *addr, conn->seq_to_tap = tcp_seq_init(c, af, addr, th->dest, th->source, now); conn->seq_ack_from_tap = conn->seq_to_tap + 1; - tcp_hash_insert(c, conn, af, addr); + tcp_hash_insert(c, conn); if (!bind(s, sa, sl)) { tcp_rst(c, conn); /* Nobody is listening then */ @@ -2891,8 +2888,6 @@ static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref, conn->sock_port, conn->tap_port, now); - - tcp_hash_insert(c, conn, AF_INET6, &sa6.sin6_addr); } else { struct sockaddr_in sa4; @@ -2912,10 +2907,10 @@ static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref, conn->sock_port, conn->tap_port, now); - - tcp_hash_insert(c, conn, AF_INET, &sa4.sin_addr); } + tcp_hash_insert(c, conn); + conn->seq_ack_from_tap = conn->seq_to_tap + 1; conn->wnd_from_tap = WINDOW_DEFAULT; -- 2.38.1
IPv4 are now hashed to match equivalent IPv4-mapped IPv6 addresses. This means we can avoid having IPv4 specific paths in the lower level hash and match functions, instead just dealing with the equivalent IPv4-mapped IPv6 addresses. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- tcp.c | 44 +++++++++++++++++--------------------------- 1 file changed, 17 insertions(+), 27 deletions(-) diff --git a/tcp.c b/tcp.c index dfa73a3..3816a1c 100644 --- a/tcp.c +++ b/tcp.c @@ -1275,23 +1275,16 @@ static int tcp_opt_get(const char *opts, size_t len, uint8_t type_find, /** * tcp_hash_match() - Check if a connection entry matches address and ports * @conn: Connection entry to match against - * @af: Address family, AF_INET or AF_INET6 - * @addr: Remote address, pointer to in_addr or in6_addr + * @addr: Remote address, may be IPv4-mapped * @tap_port: tap-facing port * @sock_port: Socket-facing port * * Return: 1 on match, 0 otherwise */ -static int tcp_hash_match(const struct tcp_conn *conn, int af, const void *addr, +static int tcp_hash_match(const struct tcp_conn *conn, + const struct in6_addr *addr, in_port_t tap_port, in_port_t sock_port) { - struct in6_addr v4mapped; - - if (af == AF_INET) { - encode_ip4mapped_ip6(&v4mapped, addr); - addr = &v4mapped; - } - if (IN6_ARE_ADDR_EQUAL(&conn->a.a6, addr) && conn->tap_port == tap_port && conn->sock_port == sock_port) return 1; @@ -1302,8 +1295,7 @@ static int tcp_hash_match(const struct tcp_conn *conn, int af, const void *addr, /** * tcp_hash() - Calculate hash value for connection given address and ports * @c: Execution context - * @af: Address family, AF_INET or AF_INET6 - * @addr: Remote address, pointer to in_addr or in6_addr + * @addr: Remote address, may be IPv4-mapped * @tap_port: tap-facing port * @sock_port: Socket-facing port * @@ -1312,7 +1304,7 @@ static int tcp_hash_match(const struct tcp_conn *conn, int af, const void *addr, #if TCP_HASH_NOINLINE __attribute__((__noinline__)) /* See comment in Makefile */ #endif -static unsigned int tcp_hash(const struct ctx *c, int af, const void *addr, +static unsigned int tcp_hash(const struct ctx *c, const struct in6_addr *addr, in_port_t tap_port, in_port_t sock_port) { struct { @@ -1320,20 +1312,10 @@ static unsigned int tcp_hash(const struct ctx *c, int af, const void *addr, in_port_t tap_port; in_port_t sock_port; } __attribute__((__packed__)) in = { - .tap_port = tap_port, - .sock_port = sock_port, + *addr, tap_port, sock_port, }; uint64_t b = 0; - if (af == AF_INET) { - struct in6_addr v4mapped; - - encode_ip4mapped_ip6(&v4mapped, addr); - in.addr = v4mapped; - } else { - in.addr = *(struct in6_addr *)addr; - } - b = siphash_20b((uint8_t *)&in, c->tcp.hash_secret); return (unsigned int)(b % TCP_HASH_TABLE_SIZE); @@ -1348,7 +1330,7 @@ static void tcp_hash_insert(const struct ctx *c, struct tcp_conn *conn) { int b; - b = tcp_hash(c, AF_INET6, &conn->a.a6, conn->tap_port, conn->sock_port); + b = tcp_hash(c, &conn->a.a6, conn->tap_port, conn->sock_port); conn->next_index = tc_hash[b] ? tc_hash[b] - tc : -1; tc_hash[b] = conn; conn->hash_bucket = b; @@ -1422,11 +1404,19 @@ static struct tcp_conn *tcp_hash_lookup(const struct ctx *c, int af, const void *addr, in_port_t tap_port, in_port_t sock_port) { - int b = tcp_hash(c, af, addr, tap_port, sock_port); + struct in6_addr v4mapped; struct tcp_conn *conn; + int b; + + if (af == AF_INET) { + encode_ip4mapped_ip6(&v4mapped, addr); + addr = &v4mapped; + } + + b = tcp_hash(c, addr, tap_port, sock_port); for (conn = tc_hash[b]; conn; conn = CONN_OR_NULL(conn->next_index)) { - if (tcp_hash_match(conn, af, addr, tap_port, sock_port)) + if (tcp_hash_match(conn, addr, tap_port, sock_port)) return conn; } -- 2.38.1
We now only extract the v4 part of the tcp_conn::a union in one place. Make this neater by using a helper to extract the IPv4 address from the mapped address in that place. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- tcp.c | 40 +++++++++++++++------------------------- util.c | 13 +++++++++++++ util.h | 1 + 3 files changed, 29 insertions(+), 25 deletions(-) diff --git a/tcp.c b/tcp.c index 3816a1c..6634abb 100644 --- a/tcp.c +++ b/tcp.c @@ -416,10 +416,7 @@ struct tcp6_l2_head { /* For MSS6 macro: keep in sync with tcp6_l2_buf_t */ * @ws_to_tap: Window scaling factor advertised to tap/guest * @sndbuf: Sending buffer in kernel, rounded to 2 ^ SNDBUF_BITS * @seq_dup_ack_approx: Last duplicate ACK number sent to tap - * @a.a6: IPv6 remote address, can be IPv4-mapped - * @a.a4.zero: Zero prefix for IPv4-mapped, see RFC 6890, Table 20 - * @a.a4.one: Ones prefix for IPv4-mapped - * @a.a4.a: IPv4 address + * @addr: IPv6 remote address, can be IPv4-mapped * @tap_port: Guest-facing tap port * @sock_port: Remote, socket-facing port * @wnd_from_tap: Last window size from tap, unscaled (as received) @@ -489,15 +486,8 @@ struct tcp_conn { uint8_t seq_dup_ack_approx; - union { - struct in6_addr a6; - struct { - uint8_t zero[10]; - uint8_t one[2]; - struct in_addr a; - } a4; - } a; -#define CONN_V4(conn) IN6_IS_ADDR_V4MAPPED(&conn->a.a6) + struct in6_addr addr; +#define CONN_V4(conn) IN6_IS_ADDR_V4MAPPED(&conn->addr) #define CONN_V6(conn) (!CONN_V4(conn)) in_port_t tap_port; @@ -960,7 +950,7 @@ static int tcp_rtt_dst_low(const struct tcp_conn *conn) int i; for (i = 0; i < LOW_RTT_TABLE_SIZE; i++) - if (IN6_ARE_ADDR_EQUAL(&conn->a.a6, low_rtt_dst + i)) + if (IN6_ARE_ADDR_EQUAL(&conn->addr, low_rtt_dst + i)) return 1; return 0; @@ -982,7 +972,7 @@ static void tcp_rtt_dst_check(const struct tcp_conn *conn, return; for (i = 0; i < LOW_RTT_TABLE_SIZE; i++) { - if (IN6_ARE_ADDR_EQUAL(&conn->a.a6, low_rtt_dst + i)) + if (IN6_ARE_ADDR_EQUAL(&conn->addr, low_rtt_dst + i)) return; if (hole == -1 && IN6_IS_ADDR_UNSPECIFIED(low_rtt_dst + i)) hole = i; @@ -994,10 +984,10 @@ static void tcp_rtt_dst_check(const struct tcp_conn *conn, if (hole == -1) return; - memcpy(low_rtt_dst + hole++, &conn->a.a6, sizeof(conn->a.a6)); + memcpy(low_rtt_dst + hole++, &conn->addr, sizeof(conn->addr)); if (hole == LOW_RTT_TABLE_SIZE) hole = 0; - memcpy(low_rtt_dst + hole, &in6addr_any, sizeof(conn->a.a6)); + memcpy(low_rtt_dst + hole, &in6addr_any, sizeof(conn->addr)); #else (void)conn; (void)tinfo; @@ -1285,7 +1275,7 @@ static int tcp_hash_match(const struct tcp_conn *conn, const struct in6_addr *addr, in_port_t tap_port, in_port_t sock_port) { - if (IN6_ARE_ADDR_EQUAL(&conn->a.a6, addr) && + if (IN6_ARE_ADDR_EQUAL(&conn->addr, addr) && conn->tap_port == tap_port && conn->sock_port == sock_port) return 1; @@ -1330,7 +1320,7 @@ static void tcp_hash_insert(const struct ctx *c, struct tcp_conn *conn) { int b; - b = tcp_hash(c, &conn->a.a6, conn->tap_port, conn->sock_port); + b = tcp_hash(c, &conn->addr, conn->tap_port, conn->sock_port); conn->next_index = tc_hash[b] ? tc_hash[b] - tc : -1; tc_hash[b] = conn; conn->hash_bucket = b; @@ -1660,7 +1650,7 @@ do { \ ip_len = plen + sizeof(struct ipv6hdr) + sizeof(struct tcphdr); b->ip6h.payload_len = htons(plen + sizeof(struct tcphdr)); - b->ip6h.saddr = conn->a.a6; + b->ip6h.saddr = conn->addr; if (IN6_IS_ADDR_LINKLOCAL(&b->ip6h.saddr)) b->ip6h.daddr = c->ip6.addr_ll_seen; else @@ -1684,7 +1674,7 @@ do { \ ip_len = plen + sizeof(struct iphdr) + sizeof(struct tcphdr); b->iph.tot_len = htons(ip_len); - b->iph.saddr = conn->a.a4.a.s_addr; + b->iph.saddr = decode_ip4mapped_ip6(&conn->addr).s_addr; b->iph.daddr = c->ip4.addr_seen.s_addr; if (check) @@ -2202,12 +2192,12 @@ static void tcp_conn_from_tap(struct ctx *c, int af, const void *addr, sa = (struct sockaddr *)&addr4; sl = sizeof(addr4); - encode_ip4mapped_ip6(&conn->a.a6, addr); + encode_ip4mapped_ip6(&conn->addr, addr); } else { sa = (struct sockaddr *)&addr6; sl = sizeof(addr6); - memcpy(&conn->a.a6, addr, sizeof(conn->a.a6)); + memcpy(&conn->addr, addr, sizeof(conn->addr)); } conn->sock_port = ntohs(th->dest); @@ -2869,7 +2859,7 @@ static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref, memcpy(&sa6.sin6_addr, src, sizeof(*src)); } - memcpy(&conn->a.a6, &sa6.sin6_addr, sizeof(conn->a.a6)); + memcpy(&conn->addr, &sa6.sin6_addr, sizeof(conn->addr)); conn->sock_port = ntohs(sa6.sin6_port); conn->tap_port = ref.r.p.tcp.tcp.index; @@ -2888,7 +2878,7 @@ static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref, IN4_ARE_ADDR_EQUAL(&sa4.sin_addr, &c->ip4.addr_seen)) sa4.sin_addr = c->ip4.gw; - encode_ip4mapped_ip6(&conn->a.a6, &sa4.sin_addr); + encode_ip4mapped_ip6(&conn->addr, &sa4.sin_addr); conn->sock_port = ntohs(sa4.sin_port); conn->tap_port = ref.r.p.tcp.tcp.index; diff --git a/util.c b/util.c index 257d0b6..ec7f57f 100644 --- a/util.c +++ b/util.c @@ -502,3 +502,16 @@ void encode_ip4mapped_ip6(struct in6_addr *ip6, const void *ip4) memset(&a->one, 0xff, sizeof(a->one)); memcpy(&a->a4, ip4, sizeof(a->a4)); } + +/** + * decode_ip4mapped_ip6() - Decode an IPv4-mapped IPv6 address + * @ip6: IPv6 address + * + * Returns: IPv4 address + */ +struct in_addr decode_ip4mapped_ip6(const struct in6_addr *ip6) +{ + const struct ip4mapped_ip6 *a = (const struct ip4mapped_ip6 *)ip6; + + return a->a4; +} diff --git a/util.h b/util.h index f7d6a6f..103211d 100644 --- a/util.h +++ b/util.h @@ -210,5 +210,6 @@ int __daemon(int pidfile_fd, int devnull_fd); int fls(unsigned long x); int write_file(const char *path, const char *buf); void encode_ip4mapped_ip6(struct in6_addr *ip6, const void *ip4); +struct in_addr decode_ip4mapped_ip6(const struct in6_addr *ip6); #endif /* UTIL_H */ -- 2.38.1
On Fri, 4 Nov 2022 19:43:30 +1100 David Gibson <david(a)gibson.dropbear.id.au> wrote:We now only extract the v4 part of the tcp_conn::a union in one place. Make this neater by using a helper to extract the IPv4 address from the mapped address in that place. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- tcp.c | 40 +++++++++++++++------------------------- util.c | 13 +++++++++++++ util.h | 1 + 3 files changed, 29 insertions(+), 25 deletions(-) diff --git a/tcp.c b/tcp.c index 3816a1c..6634abb 100644 --- a/tcp.c +++ b/tcp.c @@ -416,10 +416,7 @@ struct tcp6_l2_head { /* For MSS6 macro: keep in sync with tcp6_l2_buf_t */ * @ws_to_tap: Window scaling factor advertised to tap/guest * @sndbuf: Sending buffer in kernel, rounded to 2 ^ SNDBUF_BITS * @seq_dup_ack_approx: Last duplicate ACK number sent to tap - * @a.a6: IPv6 remote address, can be IPv4-mapped - * @a.a4.zero: Zero prefix for IPv4-mapped, see RFC 6890, Table 20 - * @a.a4.one: Ones prefix for IPv4-mapped - * @a.a4.a: IPv4 address + * @addr: IPv6 remote address, can be IPv4-mapped * @tap_port: Guest-facing tap port * @sock_port: Remote, socket-facing port * @wnd_from_tap: Last window size from tap, unscaled (as received) @@ -489,15 +486,8 @@ struct tcp_conn { uint8_t seq_dup_ack_approx; - union { - struct in6_addr a6; - struct { - uint8_t zero[10]; - uint8_t one[2]; - struct in_addr a; - } a4; - } a; -#define CONN_V4(conn) IN6_IS_ADDR_V4MAPPED(&conn->a.a6) + struct in6_addr addr; +#define CONN_V4(conn) IN6_IS_ADDR_V4MAPPED(&conn->addr) #define CONN_V6(conn) (!CONN_V4(conn)) in_port_t tap_port; @@ -960,7 +950,7 @@ static int tcp_rtt_dst_low(const struct tcp_conn *conn) int i; for (i = 0; i < LOW_RTT_TABLE_SIZE; i++) - if (IN6_ARE_ADDR_EQUAL(&conn->a.a6, low_rtt_dst + i)) + if (IN6_ARE_ADDR_EQUAL(&conn->addr, low_rtt_dst + i)) return 1; return 0; @@ -982,7 +972,7 @@ static void tcp_rtt_dst_check(const struct tcp_conn *conn, return; for (i = 0; i < LOW_RTT_TABLE_SIZE; i++) { - if (IN6_ARE_ADDR_EQUAL(&conn->a.a6, low_rtt_dst + i)) + if (IN6_ARE_ADDR_EQUAL(&conn->addr, low_rtt_dst + i)) return; if (hole == -1 && IN6_IS_ADDR_UNSPECIFIED(low_rtt_dst + i)) hole = i; @@ -994,10 +984,10 @@ static void tcp_rtt_dst_check(const struct tcp_conn *conn, if (hole == -1) return; - memcpy(low_rtt_dst + hole++, &conn->a.a6, sizeof(conn->a.a6)); + memcpy(low_rtt_dst + hole++, &conn->addr, sizeof(conn->addr)); if (hole == LOW_RTT_TABLE_SIZE) hole = 0; - memcpy(low_rtt_dst + hole, &in6addr_any, sizeof(conn->a.a6)); + memcpy(low_rtt_dst + hole, &in6addr_any, sizeof(conn->addr)); #else (void)conn; (void)tinfo; @@ -1285,7 +1275,7 @@ static int tcp_hash_match(const struct tcp_conn *conn, const struct in6_addr *addr, in_port_t tap_port, in_port_t sock_port) { - if (IN6_ARE_ADDR_EQUAL(&conn->a.a6, addr) && + if (IN6_ARE_ADDR_EQUAL(&conn->addr, addr) && conn->tap_port == tap_port && conn->sock_port == sock_port) return 1; @@ -1330,7 +1320,7 @@ static void tcp_hash_insert(const struct ctx *c, struct tcp_conn *conn) { int b; - b = tcp_hash(c, &conn->a.a6, conn->tap_port, conn->sock_port); + b = tcp_hash(c, &conn->addr, conn->tap_port, conn->sock_port); conn->next_index = tc_hash[b] ? tc_hash[b] - tc : -1; tc_hash[b] = conn; conn->hash_bucket = b; @@ -1660,7 +1650,7 @@ do { \ ip_len = plen + sizeof(struct ipv6hdr) + sizeof(struct tcphdr); b->ip6h.payload_len = htons(plen + sizeof(struct tcphdr)); - b->ip6h.saddr = conn->a.a6; + b->ip6h.saddr = conn->addr; if (IN6_IS_ADDR_LINKLOCAL(&b->ip6h.saddr)) b->ip6h.daddr = c->ip6.addr_ll_seen; else @@ -1684,7 +1674,7 @@ do { \ ip_len = plen + sizeof(struct iphdr) + sizeof(struct tcphdr); b->iph.tot_len = htons(ip_len); - b->iph.saddr = conn->a.a4.a.s_addr; + b->iph.saddr = decode_ip4mapped_ip6(&conn->addr).s_addr; b->iph.daddr = c->ip4.addr_seen.s_addr; if (check) @@ -2202,12 +2192,12 @@ static void tcp_conn_from_tap(struct ctx *c, int af, const void *addr, sa = (struct sockaddr *)&addr4; sl = sizeof(addr4); - encode_ip4mapped_ip6(&conn->a.a6, addr); + encode_ip4mapped_ip6(&conn->addr, addr); } else { sa = (struct sockaddr *)&addr6; sl = sizeof(addr6); - memcpy(&conn->a.a6, addr, sizeof(conn->a.a6)); + memcpy(&conn->addr, addr, sizeof(conn->addr)); } conn->sock_port = ntohs(th->dest); @@ -2869,7 +2859,7 @@ static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref, memcpy(&sa6.sin6_addr, src, sizeof(*src)); } - memcpy(&conn->a.a6, &sa6.sin6_addr, sizeof(conn->a.a6)); + memcpy(&conn->addr, &sa6.sin6_addr, sizeof(conn->addr)); conn->sock_port = ntohs(sa6.sin6_port); conn->tap_port = ref.r.p.tcp.tcp.index; @@ -2888,7 +2878,7 @@ static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref, IN4_ARE_ADDR_EQUAL(&sa4.sin_addr, &c->ip4.addr_seen)) sa4.sin_addr = c->ip4.gw; - encode_ip4mapped_ip6(&conn->a.a6, &sa4.sin_addr); + encode_ip4mapped_ip6(&conn->addr, &sa4.sin_addr); conn->sock_port = ntohs(sa4.sin_port); conn->tap_port = ref.r.p.tcp.tcp.index; diff --git a/util.c b/util.c index 257d0b6..ec7f57f 100644 --- a/util.c +++ b/util.c @@ -502,3 +502,16 @@ void encode_ip4mapped_ip6(struct in6_addr *ip6, const void *ip4) memset(&a->one, 0xff, sizeof(a->one)); memcpy(&a->a4, ip4, sizeof(a->a4)); } + +/** + * decode_ip4mapped_ip6() - Decode an IPv4-mapped IPv6 address + * @ip6: IPv6 address + * + * Returns: IPv4 addressReturn: IPv4 address+ */ +struct in_addr decode_ip4mapped_ip6(const struct in6_addr *ip6) +{ + const struct ip4mapped_ip6 *a = (const struct ip4mapped_ip6 *)ip6;...if you use struct ip4_mapped_ip6 directly in struct conn, you could drop this cast, and perhaps this helper too, but I'm not sure if it has other implications.+ + return a->a4; +} diff --git a/util.h b/util.h index f7d6a6f..103211d 100644 --- a/util.h +++ b/util.h @@ -210,5 +210,6 @@ int __daemon(int pidfile_fd, int devnull_fd); int fls(unsigned long x); int write_file(const char *path, const char *buf); void encode_ip4mapped_ip6(struct in6_addr *ip6, const void *ip4); +struct in_addr decode_ip4mapped_ip6(const struct in6_addr *ip6); #endif /* UTIL_H */-- Stefano
On Mon, Nov 07, 2022 at 07:08:38PM +0100, Stefano Brivio wrote:On Fri, 4 Nov 2022 19:43:30 +1100 David Gibson <david(a)gibson.dropbear.id.au> wrote:Fixed. I note there are a few other instances of this mistake in the code, most by fault by the looks of it. I'll try to fix those others at some point.We now only extract the v4 part of the tcp_conn::a union in one place. Make this neater by using a helper to extract the IPv4 address from the mapped address in that place. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- tcp.c | 40 +++++++++++++++------------------------- util.c | 13 +++++++++++++ util.h | 1 + 3 files changed, 29 insertions(+), 25 deletions(-) diff --git a/tcp.c b/tcp.c index 3816a1c..6634abb 100644 --- a/tcp.c +++ b/tcp.c @@ -416,10 +416,7 @@ struct tcp6_l2_head { /* For MSS6 macro: keep in sync with tcp6_l2_buf_t */ * @ws_to_tap: Window scaling factor advertised to tap/guest * @sndbuf: Sending buffer in kernel, rounded to 2 ^ SNDBUF_BITS * @seq_dup_ack_approx: Last duplicate ACK number sent to tap - * @a.a6: IPv6 remote address, can be IPv4-mapped - * @a.a4.zero: Zero prefix for IPv4-mapped, see RFC 6890, Table 20 - * @a.a4.one: Ones prefix for IPv4-mapped - * @a.a4.a: IPv4 address + * @addr: IPv6 remote address, can be IPv4-mapped * @tap_port: Guest-facing tap port * @sock_port: Remote, socket-facing port * @wnd_from_tap: Last window size from tap, unscaled (as received) @@ -489,15 +486,8 @@ struct tcp_conn { uint8_t seq_dup_ack_approx; - union { - struct in6_addr a6; - struct { - uint8_t zero[10]; - uint8_t one[2]; - struct in_addr a; - } a4; - } a; -#define CONN_V4(conn) IN6_IS_ADDR_V4MAPPED(&conn->a.a6) + struct in6_addr addr; +#define CONN_V4(conn) IN6_IS_ADDR_V4MAPPED(&conn->addr) #define CONN_V6(conn) (!CONN_V4(conn)) in_port_t tap_port; @@ -960,7 +950,7 @@ static int tcp_rtt_dst_low(const struct tcp_conn *conn) int i; for (i = 0; i < LOW_RTT_TABLE_SIZE; i++) - if (IN6_ARE_ADDR_EQUAL(&conn->a.a6, low_rtt_dst + i)) + if (IN6_ARE_ADDR_EQUAL(&conn->addr, low_rtt_dst + i)) return 1; return 0; @@ -982,7 +972,7 @@ static void tcp_rtt_dst_check(const struct tcp_conn *conn, return; for (i = 0; i < LOW_RTT_TABLE_SIZE; i++) { - if (IN6_ARE_ADDR_EQUAL(&conn->a.a6, low_rtt_dst + i)) + if (IN6_ARE_ADDR_EQUAL(&conn->addr, low_rtt_dst + i)) return; if (hole == -1 && IN6_IS_ADDR_UNSPECIFIED(low_rtt_dst + i)) hole = i; @@ -994,10 +984,10 @@ static void tcp_rtt_dst_check(const struct tcp_conn *conn, if (hole == -1) return; - memcpy(low_rtt_dst + hole++, &conn->a.a6, sizeof(conn->a.a6)); + memcpy(low_rtt_dst + hole++, &conn->addr, sizeof(conn->addr)); if (hole == LOW_RTT_TABLE_SIZE) hole = 0; - memcpy(low_rtt_dst + hole, &in6addr_any, sizeof(conn->a.a6)); + memcpy(low_rtt_dst + hole, &in6addr_any, sizeof(conn->addr)); #else (void)conn; (void)tinfo; @@ -1285,7 +1275,7 @@ static int tcp_hash_match(const struct tcp_conn *conn, const struct in6_addr *addr, in_port_t tap_port, in_port_t sock_port) { - if (IN6_ARE_ADDR_EQUAL(&conn->a.a6, addr) && + if (IN6_ARE_ADDR_EQUAL(&conn->addr, addr) && conn->tap_port == tap_port && conn->sock_port == sock_port) return 1; @@ -1330,7 +1320,7 @@ static void tcp_hash_insert(const struct ctx *c, struct tcp_conn *conn) { int b; - b = tcp_hash(c, &conn->a.a6, conn->tap_port, conn->sock_port); + b = tcp_hash(c, &conn->addr, conn->tap_port, conn->sock_port); conn->next_index = tc_hash[b] ? tc_hash[b] - tc : -1; tc_hash[b] = conn; conn->hash_bucket = b; @@ -1660,7 +1650,7 @@ do { \ ip_len = plen + sizeof(struct ipv6hdr) + sizeof(struct tcphdr); b->ip6h.payload_len = htons(plen + sizeof(struct tcphdr)); - b->ip6h.saddr = conn->a.a6; + b->ip6h.saddr = conn->addr; if (IN6_IS_ADDR_LINKLOCAL(&b->ip6h.saddr)) b->ip6h.daddr = c->ip6.addr_ll_seen; else @@ -1684,7 +1674,7 @@ do { \ ip_len = plen + sizeof(struct iphdr) + sizeof(struct tcphdr); b->iph.tot_len = htons(ip_len); - b->iph.saddr = conn->a.a4.a.s_addr; + b->iph.saddr = decode_ip4mapped_ip6(&conn->addr).s_addr; b->iph.daddr = c->ip4.addr_seen.s_addr; if (check) @@ -2202,12 +2192,12 @@ static void tcp_conn_from_tap(struct ctx *c, int af, const void *addr, sa = (struct sockaddr *)&addr4; sl = sizeof(addr4); - encode_ip4mapped_ip6(&conn->a.a6, addr); + encode_ip4mapped_ip6(&conn->addr, addr); } else { sa = (struct sockaddr *)&addr6; sl = sizeof(addr6); - memcpy(&conn->a.a6, addr, sizeof(conn->a.a6)); + memcpy(&conn->addr, addr, sizeof(conn->addr)); } conn->sock_port = ntohs(th->dest); @@ -2869,7 +2859,7 @@ static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref, memcpy(&sa6.sin6_addr, src, sizeof(*src)); } - memcpy(&conn->a.a6, &sa6.sin6_addr, sizeof(conn->a.a6)); + memcpy(&conn->addr, &sa6.sin6_addr, sizeof(conn->addr)); conn->sock_port = ntohs(sa6.sin6_port); conn->tap_port = ref.r.p.tcp.tcp.index; @@ -2888,7 +2878,7 @@ static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref, IN4_ARE_ADDR_EQUAL(&sa4.sin_addr, &c->ip4.addr_seen)) sa4.sin_addr = c->ip4.gw; - encode_ip4mapped_ip6(&conn->a.a6, &sa4.sin_addr); + encode_ip4mapped_ip6(&conn->addr, &sa4.sin_addr); conn->sock_port = ntohs(sa4.sin_port); conn->tap_port = ref.r.p.tcp.tcp.index; diff --git a/util.c b/util.c index 257d0b6..ec7f57f 100644 --- a/util.c +++ b/util.c @@ -502,3 +502,16 @@ void encode_ip4mapped_ip6(struct in6_addr *ip6, const void *ip4) memset(&a->one, 0xff, sizeof(a->one)); memcpy(&a->a4, ip4, sizeof(a->a4)); } + +/** + * decode_ip4mapped_ip6() - Decode an IPv4-mapped IPv6 address + * @ip6: IPv6 address + * + * Returns: IPv4 addressReturn: IPv4 addressTrue. As I said, I'm considering a slightly different approach now.+ */ +struct in_addr decode_ip4mapped_ip6(const struct in6_addr *ip6) +{ + const struct ip4mapped_ip6 *a = (const struct ip4mapped_ip6 *)ip6;...if you use struct ip4_mapped_ip6 directly in struct conn, you could drop this cast, and perhaps this helper too, but I'm not sure if it has other implications.-- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson+ + return a->a4; +} diff --git a/util.h b/util.h index f7d6a6f..103211d 100644 --- a/util.h +++ b/util.h @@ -210,5 +210,6 @@ int __daemon(int pidfile_fd, int devnull_fd); int fls(unsigned long x); int write_file(const char *path, const char *buf); void encode_ip4mapped_ip6(struct in6_addr *ip6, const void *ip4); +struct in_addr decode_ip4mapped_ip6(const struct in6_addr *ip6); #endif /* UTIL_H */
tcp_seq_init() has separate paths for IPv4 and IPv6 addresses. Convert it to convert IPv4 addresses to IPv4-mapped IPv6 addresses then compute the siphash as for IPv6. This is slightly simpler, and means that "true" IPv4 connections and "IPv6" connections using mapped addresses will have compatible sequence numbers. This will allow additional improvements in future. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- siphash.c | 1 + tcp.c | 46 +++++++++++++++++++--------------------------- 2 files changed, 20 insertions(+), 27 deletions(-) diff --git a/siphash.c b/siphash.c index 516a508..811918b 100644 --- a/siphash.c +++ b/siphash.c @@ -123,6 +123,7 @@ uint64_t siphash_8b(const uint8_t *in, const uint64_t *k) * * Return: 32 bits obtained by XORing the two halves of the 64-bit hash output */ +/* cppcheck-suppress unusedFunction */ uint32_t siphash_12b(const uint8_t *in, const uint64_t *k) { uint32_t *in32 = (uint32_t *)in; diff --git a/tcp.c b/tcp.c index 6634abb..b9d0510 100644 --- a/tcp.c +++ b/tcp.c @@ -2011,38 +2011,30 @@ static uint32_t tcp_seq_init(const struct ctx *c, int af, const void *addr, in_port_t dstport, in_port_t srcport, const struct timespec *now) { + struct { + struct in6_addr src; + in_port_t srcport; + struct in6_addr dst; + in_port_t dstport; + } __attribute__((__packed__)) in = { + .srcport = srcport, + .dstport = dstport, + }; uint32_t ns, seq = 0; if (af == AF_INET) { - struct { - struct in_addr src; - in_port_t srcport; - struct in_addr dst; - in_port_t dstport; - } __attribute__((__packed__)) in = { - .src = *(struct in_addr *)addr, - .srcport = srcport, - .dst = c->ip4.addr, - .dstport = dstport, - }; - - seq = siphash_12b((uint8_t *)&in, c->tcp.hash_secret); - } else if (af == AF_INET6) { - struct { - struct in6_addr src; - in_port_t srcport; - struct in6_addr dst; - in_port_t dstport; - } __attribute__((__packed__)) in = { - .src = *(struct in6_addr *)addr, - .srcport = srcport, - .dst = c->ip6.addr, - .dstport = dstport, - }; - - seq = siphash_36b((uint8_t *)&in, c->tcp.hash_secret); + struct in6_addr tmp; + encode_ip4mapped_ip6(&tmp, addr); + in.src = tmp; + encode_ip4mapped_ip6(&tmp, &c->ip4.addr); + in.dst = tmp; + } else { + in.src = *(struct in6_addr *)addr; + in.dst = c->ip6.addr; } + seq = siphash_36b((uint8_t *)&in, c->tcp.hash_secret); + ns = now->tv_sec * 1E9; ns += now->tv_nsec >> 5; /* 32ns ticks, overflows 32 bits every 137s */ -- 2.38.1
tcp_seq_init() takes a number of parameters for the connection, but at every call site, these are already populated in the tcp_conn structure. Likewise we always store the result into the @seq_to_tap field. Use this to simplify tcp_seq_init(). Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- tcp.c | 43 +++++++++++++------------------------------ 1 file changed, 13 insertions(+), 30 deletions(-) diff --git a/tcp.c b/tcp.c index b9d0510..59e03ff 100644 --- a/tcp.c +++ b/tcp.c @@ -1999,17 +1999,11 @@ static void tcp_clamp_window(const struct ctx *c, struct tcp_conn *conn, /** * tcp_seq_init() - Calculate initial sequence number according to RFC 6528 * @c: Execution context - * @af: Address family, AF_INET or AF_INET6 - * @addr: Remote address, pointer to in_addr or in6_addr - * @dstport: Destination port, connection-wise, network order - * @srcport: Source port, connection-wise, network order + * @conn: TCP connection, with addr, sock_port and tap_port populated * @now: Current timestamp - * - * Return: initial TCP sequence */ -static uint32_t tcp_seq_init(const struct ctx *c, int af, const void *addr, - in_port_t dstport, in_port_t srcport, - const struct timespec *now) +static void tcp_seq_init(const struct ctx *c, struct tcp_conn *conn, + const struct timespec *now) { struct { struct in6_addr src; @@ -2017,19 +2011,17 @@ static uint32_t tcp_seq_init(const struct ctx *c, int af, const void *addr, struct in6_addr dst; in_port_t dstport; } __attribute__((__packed__)) in = { - .srcport = srcport, - .dstport = dstport, + .src = conn->addr, + .srcport = conn->tap_port, + .dstport = conn->sock_port, }; uint32_t ns, seq = 0; - if (af == AF_INET) { - struct in6_addr tmp; - encode_ip4mapped_ip6(&tmp, addr); - in.src = tmp; - encode_ip4mapped_ip6(&tmp, &c->ip4.addr); - in.dst = tmp; + if (CONN_V4(conn)) { + struct in6_addr v4mapped; + encode_ip4mapped_ip6(&v4mapped, &c->ip4.addr); + in.dst = v4mapped; } else { - in.src = *(struct in6_addr *)addr; in.dst = c->ip6.addr; } @@ -2038,7 +2030,7 @@ static uint32_t tcp_seq_init(const struct ctx *c, int af, const void *addr, ns = now->tv_sec * 1E9; ns += now->tv_nsec >> 5; /* 32ns ticks, overflows 32 bits every 137s */ - return seq + ns; + conn->seq_to_tap = seq + ns; } /** @@ -2199,7 +2191,7 @@ static void tcp_conn_from_tap(struct ctx *c, int af, const void *addr, conn->seq_from_tap = conn->seq_init_from_tap + 1; conn->seq_ack_to_tap = conn->seq_from_tap; - conn->seq_to_tap = tcp_seq_init(c, af, addr, th->dest, th->source, now); + tcp_seq_init(c, conn, now); conn->seq_ack_from_tap = conn->seq_to_tap + 1; tcp_hash_insert(c, conn); @@ -2855,11 +2847,6 @@ static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref, conn->sock_port = ntohs(sa6.sin6_port); conn->tap_port = ref.r.p.tcp.tcp.index; - - conn->seq_to_tap = tcp_seq_init(c, AF_INET6, &sa6.sin6_addr, - conn->sock_port, - conn->tap_port, - now); } else { struct sockaddr_in sa4; @@ -2874,13 +2861,9 @@ static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref, conn->sock_port = ntohs(sa4.sin_port); conn->tap_port = ref.r.p.tcp.tcp.index; - - conn->seq_to_tap = tcp_seq_init(c, AF_INET, &sa4.sin_addr, - conn->sock_port, - conn->tap_port, - now); } + tcp_seq_init(c, conn, now); tcp_hash_insert(c, conn); conn->seq_ack_from_tap = conn->seq_to_tap + 1; -- 2.38.1
It looks like tcp_seq_init() is supposed to advance the sequence number by one every 32ns. However we only right shift the ns part of the timespec not the seconds part, meaning that we'll advance by an extra 32 steps on each second. I don't know if that's exploitable in any way, but it doesn't appear to be the intent, nor what RFC 6528 suggests. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- tcp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tcp.c b/tcp.c index 59e03ff..941fafb 100644 --- a/tcp.c +++ b/tcp.c @@ -2027,8 +2027,8 @@ static void tcp_seq_init(const struct ctx *c, struct tcp_conn *conn, seq = siphash_36b((uint8_t *)&in, c->tcp.hash_secret); - ns = now->tv_sec * 1E9; - ns += now->tv_nsec >> 5; /* 32ns ticks, overflows 32 bits every 137s */ + /* 32ns ticks, overflows 32 bits every 137s */ + ns = (now->tv_sec * 1E9 + now->tv_nsec) >> 5; conn->seq_to_tap = seq + ns; } -- 2.38.1
On Fri, 4 Nov 2022 19:43:33 +1100 David Gibson <david(a)gibson.dropbear.id.au> wrote:It looks like tcp_seq_init() is supposed to advance the sequence number by one every 32ns. However we only right shift the ns part of the timespec not the seconds part, meaning that we'll advance by an extra 32 steps on each second. I don't know if that's exploitable in any way, but it doesn't appear to be the intent, nor what RFC 6528 suggests.Oh, oops, nice catch. Well, as long as the step, modulo 32 bits, is not 0, it's still arguably the 250 KHz / 4 µs period clock from RFC 793, so there's no practical difference (other than the overflow period). See also the note in RFC 1948: More precisely, RFC 793 specifies that the 32-bit counter be incremented by 1 in the low-order position about every 4 microseconds. Instead, Berkeley-derived kernels increment it by a constant every second [...] I kind of wonder if adding a time non-linearity like the unintended one I added actually increases entropy. But indeed ~4 seconds overflow is also not intended, and we should just stick to RFC 6528.Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- tcp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tcp.c b/tcp.c index 59e03ff..941fafb 100644 --- a/tcp.c +++ b/tcp.c @@ -2027,8 +2027,8 @@ static void tcp_seq_init(const struct ctx *c, struct tcp_conn *conn, seq = siphash_36b((uint8_t *)&in, c->tcp.hash_secret); - ns = now->tv_sec * 1E9; - ns += now->tv_nsec >> 5; /* 32ns ticks, overflows 32 bits every 137s */ + /* 32ns ticks, overflows 32 bits every 137s */ + ns = (now->tv_sec * 1E9 + now->tv_nsec) >> 5; conn->seq_to_tap = seq + ns; }-- Stefano
On Mon, Nov 07, 2022 at 07:08:46PM +0100, Stefano Brivio wrote:On Fri, 4 Nov 2022 19:43:33 +1100 David Gibson <david(a)gibson.dropbear.id.au> wrote:Well, except for the fact it's a 31.24 MHz / 32 ns clock. I assumed there was a good reason for that.It looks like tcp_seq_init() is supposed to advance the sequence number by one every 32ns. However we only right shift the ns part of the timespec not the seconds part, meaning that we'll advance by an extra 32 steps on each second. I don't know if that's exploitable in any way, but it doesn't appear to be the intent, nor what RFC 6528 suggests.Oh, oops, nice catch. Well, as long as the step, modulo 32 bits, is not 0, it's still arguably the 250 KHz / 4 µs period clock from RFC 793, so there's nopractical difference (other than the overflow period). See also the note in RFC 1948: More precisely, RFC 793 specifies that the 32-bit counter be incremented by 1 in the low-order position about every 4 microseconds. Instead, Berkeley-derived kernels increment it by a constant every second [...] I kind of wonder if adding a time non-linearity like the unintended one I added actually increases entropy.Right, I don't know.But indeed ~4 seconds overflow is also not intended, and we should just stick to RFC 6528.Well... 4s overflow, yes, but not I think a 4s period before repeating. Again, I don't know enough about this stuff to analyze whether that's important or not.-- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibsonSigned-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- tcp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tcp.c b/tcp.c index 59e03ff..941fafb 100644 --- a/tcp.c +++ b/tcp.c @@ -2027,8 +2027,8 @@ static void tcp_seq_init(const struct ctx *c, struct tcp_conn *conn, seq = siphash_36b((uint8_t *)&in, c->tcp.hash_secret); - ns = now->tv_sec * 1E9; - ns += now->tv_nsec >> 5; /* 32ns ticks, overflows 32 bits every 137s */ + /* 32ns ticks, overflows 32 bits every 137s */ + ns = (now->tv_sec * 1E9 + now->tv_nsec) >> 5; conn->seq_to_tap = seq + ns; }