As with TCP, it turns out that there are a bunch of clean ups and reworks to the ICMP code which will make integration with the flow table easier, even before introducing a non-trivial version of the flow table itself. Based on the flow based dispatch/allocation, and bind/addressing cleanup series. Changes since v2: * Rebased on current main and revised flow dispatch series * Standardise on the name 'id_sock' instead of 'id_map' for pointers to specific entries in the id_map * Avoid confusing usage of the word "sequence" to mean a flow of ping packets as opposed to the ping sequence number. * Use packet_get() rather than explicit comparisons to validate packet lengths Changes since v1: * Rebased on newer version of flow dispatch & allocation series * Added 12/12 splitting out close and new sequence functions David Gibson (11): icmp: Don't set "port" on destination sockaddr for ping sockets icmp: Remove redundant initialisation of sendto() address icmp: Don't attempt to handle "wrong direction" ping socket traffic icmp: Don't attempt to match host IDs to guest IDs icmp: Use -1 to represent "missing" sockets icmp: Simplify socket expiry scanning icmp: Share more between IPv4 and IPv6 paths in icmp_tap_handler() icmp: Consolidate icmp_sock_handler() with icmpv6_sock_handler() icmp: Warn on receive errors from ping sockets icmp: Validate packets received on ping sockets icmp: Dedicated functions for starting and closing ping sequences icmp.c | 329 ++++++++++++++++++++++++++++---------------------------- icmp.h | 5 +- passt.c | 4 +- 3 files changed, 166 insertions(+), 172 deletions(-) -- 2.43.0
We set the port to the ICMP id on the sendto() address when using ICMP ping sockets. However, this has no effect: the ICMP id the kernel uses is determined only by the "port" on the socket's *bound* address (which is constructed inside sock_l4(), using the id we also pass to it). For unclear reasons this change triggers cppcheck 2.13.0 to give new "variable could be const pointer" warnings, so make *ih const as well to fix that. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- icmp.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/icmp.c b/icmp.c index 325dfb0..ca039f0 100644 --- a/icmp.c +++ b/icmp.c @@ -172,7 +172,7 @@ int icmp_tap_handler(const struct ctx *c, uint8_t pif, int af, .sin_addr = IN4ADDR_ANY_INIT, }; union icmp_epoll_ref iref; - struct icmphdr *ih; + const struct icmphdr *ih; int id, s; ih = packet_get(p, 0, 0, sizeof(*ih), &plen); @@ -182,8 +182,6 @@ int icmp_tap_handler(const struct ctx *c, uint8_t pif, int af, if (ih->type != ICMP_ECHO && ih->type != ICMP_ECHOREPLY) return 1; - sa.sin_port = ih->un.echo.id; - iref.id = id = ntohs(ih->un.echo.id); if ((s = icmp_id_map[V4][id].sock) <= 0) { @@ -219,7 +217,7 @@ int icmp_tap_handler(const struct ctx *c, uint8_t pif, int af, .sin6_scope_id = c->ifi6, }; union icmp_epoll_ref iref; - struct icmp6hdr *ih; + const struct icmp6hdr *ih; int id, s; ih = packet_get(p, 0, 0, sizeof(struct icmp6hdr), &plen); @@ -229,8 +227,6 @@ int icmp_tap_handler(const struct ctx *c, uint8_t pif, int af, if (ih->icmp6_type != 128 && ih->icmp6_type != 129) return 1; - sa.sin6_port = ih->icmp6_identifier; - iref.id = id = ntohs(ih->icmp6_identifier); if ((s = icmp_id_map[V6][id].sock) <= 0) { s = sock_l4(c, AF_INET6, IPPROTO_ICMPV6, -- 2.43.0
We initialise the address portion of the sockaddr for sendto() to the unspecified address, but then always overwrite it with the actual destination address before we call the sendto(). Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- icmp.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/icmp.c b/icmp.c index ca039f0..ed1a3d9 100644 --- a/icmp.c +++ b/icmp.c @@ -169,7 +169,6 @@ int icmp_tap_handler(const struct ctx *c, uint8_t pif, int af, if (af == AF_INET) { struct sockaddr_in sa = { .sin_family = AF_INET, - .sin_addr = IN4ADDR_ANY_INIT, }; union icmp_epoll_ref iref; const struct icmphdr *ih; @@ -213,7 +212,6 @@ int icmp_tap_handler(const struct ctx *c, uint8_t pif, int af, } else if (af == AF_INET6) { struct sockaddr_in6 sa = { .sin6_family = AF_INET6, - .sin6_addr = IN6ADDR_ANY_INIT, .sin6_scope_id = c->ifi6, }; union icmp_epoll_ref iref; -- 2.43.0
Linux ICMP "ping" sockets are very specific in what they do. They let userspace send ping requests (ICMP_ECHO or ICMP6_ECHO_REQUEST), and receive matching replies (ICMP_ECHOREPLY or ICMP6_ECHO_REPLY). They don't let you intercept or handle incoming ping requests. In the case of passt/pasta that means we can process echo requests from tap and forward them to a ping socket, then take the replies from the ping socket and forward them to tap. We can't do the reverse: take echo requests from the host and somehow forward them to the guest. There's really no way for something outside to initiate a ping to a passt/pasta connected guest and if there was we'd need an entirely different mechanism to handle it. However, we have some logic to deal with packets going in that reverse direction. Remove it, since it can't ever be used that way. While we're there use defines for the ICMPv6 types, instead of open coded type values. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- icmp.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/icmp.c b/icmp.c index ed1a3d9..697a336 100644 --- a/icmp.c +++ b/icmp.c @@ -93,8 +93,7 @@ void icmp_sock_handler(const struct ctx *c, union epoll_ref ref) icmp_id_map[V4][id].seq = seq; } - debug("ICMP: echo %s to tap, ID: %i, seq: %i", - (ih->type == ICMP_ECHO) ? "request" : "reply", id, seq); + debug("ICMP: echo reply to tap, ID: %i, seq: %i", id, seq); tap_icmp4_send(c, sr.sin_addr, tap_ip4_daddr(c), buf, n); } @@ -138,8 +137,7 @@ void icmpv6_sock_handler(const struct ctx *c, union epoll_ref ref) icmp_id_map[V6][id].seq = seq; } - debug("ICMPv6: echo %s to tap, ID: %i, seq: %i", - (ih->icmp6_type == 128) ? "request" : "reply", id, seq); + debug("ICMPv6: echo reply to tap, ID: %i, seq: %i", id, seq); tap_icmp6_send(c, &sr.sin6_addr, tap_ip6_daddr(c, &sr.sin6_addr), buf, n); @@ -178,7 +176,7 @@ int icmp_tap_handler(const struct ctx *c, uint8_t pif, int af, if (!ih) return 1; - if (ih->type != ICMP_ECHO && ih->type != ICMP_ECHOREPLY) + if (ih->type != ICMP_ECHO) return 1; iref.id = id = ntohs(ih->un.echo.id); @@ -205,8 +203,7 @@ int icmp_tap_handler(const struct ctx *c, uint8_t pif, int af, (struct sockaddr *)&sa, sizeof(sa)) < 0) { debug("ICMP: failed to relay request to socket"); } else { - debug("ICMP: echo %s to socket, ID: %i, seq: %i", - (ih->type == ICMP_ECHO) ? "request" : "reply", + debug("ICMP: echo request to socket, ID: %i, seq: %i", id, ntohs(ih->un.echo.sequence)); } } else if (af == AF_INET6) { @@ -222,7 +219,7 @@ int icmp_tap_handler(const struct ctx *c, uint8_t pif, int af, if (!ih) return 1; - if (ih->icmp6_type != 128 && ih->icmp6_type != 129) + if (ih->icmp6_type != ICMPV6_ECHO_REQUEST) return 1; iref.id = id = ntohs(ih->icmp6_identifier); @@ -249,8 +246,7 @@ int icmp_tap_handler(const struct ctx *c, uint8_t pif, int af, (struct sockaddr *)&sa, sizeof(sa)) < 1) { debug("ICMPv6: failed to relay request to socket"); } else { - debug("ICMPv6: echo %s to socket, ID: %i, seq: %i", - (ih->icmp6_type == 128) ? "request" : "reply", + debug("ICMPv6: echo request to socket, ID: %i, seq: %i", id, ntohs(ih->icmp6_sequence)); } } -- 2.43.0
When forwarding pings from tap, currently we create a ping socket with a socket address whose port is set to the ID of the ping received from the guest. This causes the socket to send pings with the same ID on the host. Although this seems look a good idea for maximum transparency, it's probably unwise. First, it's fallible - the bind() could fail, and we already have fallback logic which will overwrite the packets with the expected guest id if the id we get on replies doesn't already match. We might as well do that unconditionally. But more importantly, we don't know what else on the host might be using ping sockets, so we could end up with an ID that's the same as an existing socket. You'd expect that to fail the bind() with EADDRINUSE, which would be fine: we'd fall back to rewriting the reply ids. However it appears the kernel (v6.6.3 at least), does *not* fail the bind() and instead it's "last socket wins" in terms of who gets the replies. So we could accidentally intercept ping replies for something else on the host. So, instead of using bind() to set the id, just let the kernel pick one and expect to translate the replies back. Although theoretically this makes the passt/pasta link a bit less "transparent", essentially nothing cares about specific ping IDs, much like TCP source ports, which we also don't preserve. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- icmp.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/icmp.c b/icmp.c index 697a336..2407daf 100644 --- a/icmp.c +++ b/icmp.c @@ -80,11 +80,12 @@ void icmp_sock_handler(const struct ctx *c, union epoll_ref ref) if (n < 0) return; - id = ntohs(ih->un.echo.id); seq = ntohs(ih->un.echo.sequence); - if (id != ref.icmp.id) - ih->un.echo.id = htons(ref.icmp.id); + /* Adjust the packet to have the ID the guest was using, rather than the + * host chosen value */ + id = ref.icmp.id; + ih->un.echo.id = htons(id); if (c->mode == MODE_PASTA) { if (icmp_id_map[V4][id].seq == seq) @@ -119,15 +120,12 @@ void icmpv6_sock_handler(const struct ctx *c, union epoll_ref ref) if (n < 0) return; - id = ntohs(ih->icmp6_identifier); seq = ntohs(ih->icmp6_sequence); - /* If bind() fails e.g. because of a broken SELinux policy, - * this might happen. Fix up the identifier to match the sent - * one. - */ - if (id != ref.icmp.id) - ih->icmp6_identifier = htons(ref.icmp.id); + /* Adjust the packet to have the ID the guest was using, rather than the + * host chosen value */ + id = ref.icmp.id; + ih->icmp6_identifier = htons(id); /* In PASTA mode, we'll get any reply we send, discard them. */ if (c->mode == MODE_PASTA) { @@ -183,7 +181,7 @@ int icmp_tap_handler(const struct ctx *c, uint8_t pif, int af, if ((s = icmp_id_map[V4][id].sock) <= 0) { s = sock_l4(c, AF_INET, IPPROTO_ICMP, &c->ip4.addr_out, - c->ip4.ifname_out, id, iref.u32); + c->ip4.ifname_out, 0, iref.u32); if (s < 0) goto fail_sock; if (s > FD_REF_MAX) { @@ -226,7 +224,7 @@ int icmp_tap_handler(const struct ctx *c, uint8_t pif, int af, if ((s = icmp_id_map[V6][id].sock) <= 0) { s = sock_l4(c, AF_INET6, IPPROTO_ICMPV6, &c->ip6.addr_out, - c->ip6.ifname_out, id, iref.u32); + c->ip6.ifname_out, 0, iref.u32); if (s < 0) goto fail_sock; if (s > FD_REF_MAX) { -- 2.43.0
icmp_id_map[] contains, amongst other things, fds for "ping" sockets associated with various ICMP echo ids. However, we only lazily open() those sockets, so many will be missing. We currently represent that with a 0, which isn't great, since that's technically a valid fd. Use -1 instead. This does require initializing the fields in icmp_id_map[] but we already have an obvious place to do that. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- icmp.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/icmp.c b/icmp.c index 2407daf..01ef34a 100644 --- a/icmp.c +++ b/icmp.c @@ -179,7 +179,7 @@ int icmp_tap_handler(const struct ctx *c, uint8_t pif, int af, iref.id = id = ntohs(ih->un.echo.id); - if ((s = icmp_id_map[V4][id].sock) <= 0) { + if ((s = icmp_id_map[V4][id].sock) < 0) { s = sock_l4(c, AF_INET, IPPROTO_ICMP, &c->ip4.addr_out, c->ip4.ifname_out, 0, iref.u32); if (s < 0) @@ -221,7 +221,7 @@ int icmp_tap_handler(const struct ctx *c, uint8_t pif, int af, return 1; iref.id = id = ntohs(ih->icmp6_identifier); - if ((s = icmp_id_map[V6][id].sock) <= 0) { + if ((s = icmp_id_map[V6][id].sock) < 0) { s = sock_l4(c, AF_INET6, IPPROTO_ICMPV6, &c->ip6.addr_out, c->ip6.ifname_out, 0, iref.u32); @@ -277,7 +277,7 @@ static void icmp_timer_one(const struct ctx *c, int v6, uint16_t id, epoll_ctl(c->epollfd, EPOLL_CTL_DEL, id_map->sock, NULL); close(id_map->sock); - id_map->sock = 0; + id_map->sock = -1; id_map->seq = -1; } @@ -315,6 +315,8 @@ void icmp_init(void) { unsigned i; - for (i = 0; i < ICMP_NUM_IDS; i++) + for (i = 0; i < ICMP_NUM_IDS; i++) { icmp_id_map[V4][i].seq = icmp_id_map[V6][i].seq = -1; + icmp_id_map[V4][i].sock = icmp_id_map[V6][i].sock = -1; + } } -- 2.43.0
Currently we use icmp_act[] to scan for ICMP ids which might have an open socket which could time out. However icmp_act[] contains no information that's not already in icmp_id_map[] - it's just an "index" which allows scanning for relevant entries with less cache footprint. We only scan for ICMP socket expiry every 1s, though, so it's not clear that cache footprint really matters. Furthermore, there's no strong reason we need to scan even that often - the timeout is fairly arbitrary and approximate. So, eliminate icmp_act[] in favour of directly scanning icmp_id_map[] and compensate for the cache impact by reducing the scan frequency to once every 10s. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- icmp.c | 42 ++++++++++-------------------------------- icmp.h | 2 +- 2 files changed, 11 insertions(+), 33 deletions(-) diff --git a/icmp.c b/icmp.c index 01ef34a..757ca61 100644 --- a/icmp.c +++ b/icmp.c @@ -56,9 +56,6 @@ struct icmp_id_sock { /* Indexed by ICMP echo identifier */ static struct icmp_id_sock icmp_id_map[IP_VERSIONS][ICMP_NUM_IDS]; -/* Bitmaps, activity monitoring needed for identifier */ -static uint8_t icmp_act[IP_VERSIONS][DIV_ROUND_UP(ICMP_NUM_IDS, 8)]; - /** * icmp_sock_handler() - Handle new data from IPv4 ICMP socket * @c: Execution context @@ -194,7 +191,6 @@ int icmp_tap_handler(const struct ctx *c, uint8_t pif, int af, debug("ICMP: new socket %i for echo ID %i", s, id); } icmp_id_map[V4][id].ts = now->tv_sec; - bitmap_set(icmp_act[V4], id); sa.sin_addr = *(struct in_addr *)daddr; if (sendto(s, ih, sizeof(*ih) + plen, MSG_NOSIGNAL, @@ -237,7 +233,6 @@ int icmp_tap_handler(const struct ctx *c, uint8_t pif, int af, debug("ICMPv6: new socket %i for echo ID %i", s, id); } icmp_id_map[V6][id].ts = now->tv_sec; - bitmap_set(icmp_act[V6], id); sa.sin6_addr = *(struct in6_addr *)daddr; if (sendto(s, ih, sizeof(*ih) + plen, MSG_NOSIGNAL, @@ -261,24 +256,19 @@ fail_sock: /** * icmp_timer_one() - Handler for timed events related to a given identifier * @c: Execution context - * @v6: Set for IPv6 echo identifier bindings - * @id: Echo identifier, host order + * @id_sock: Socket fd and activity timestamp * @now: Current timestamp */ -static void icmp_timer_one(const struct ctx *c, int v6, uint16_t id, +static void icmp_timer_one(const struct ctx *c, struct icmp_id_sock *id_sock, const struct timespec *now) { - struct icmp_id_sock *id_map = &icmp_id_map[v6 ? V6 : V4][id]; - - if (now->tv_sec - id_map->ts <= ICMP_ECHO_TIMEOUT) + if (id_sock->sock < 0 || now->tv_sec - id_sock->ts <= ICMP_ECHO_TIMEOUT) return; - bitmap_clear(icmp_act[v6 ? V6 : V4], id); - - epoll_ctl(c->epollfd, EPOLL_CTL_DEL, id_map->sock, NULL); - close(id_map->sock); - id_map->sock = -1; - id_map->seq = -1; + epoll_ctl(c->epollfd, EPOLL_CTL_DEL, id_sock->sock, NULL); + close(id_sock->sock); + id_sock->sock = -1; + id_sock->seq = -1; } /** @@ -288,23 +278,11 @@ static void icmp_timer_one(const struct ctx *c, int v6, uint16_t id, */ void icmp_timer(const struct ctx *c, const struct timespec *now) { - long *word, tmp; unsigned int i; - int n, v6 = 0; - -v6: - word = (long *)icmp_act[v6 ? V6 : V4]; - for (i = 0; i < ARRAY_SIZE(icmp_act); i += sizeof(long), word++) { - tmp = *word; - while ((n = ffsl(tmp))) { - tmp &= ~(1UL << (n - 1)); - icmp_timer_one(c, v6, i * 8 + n - 1, now); - } - } - if (!v6) { - v6 = 1; - goto v6; + for (i = 0; i < ICMP_NUM_IDS; i++) { + icmp_timer_one(c, &icmp_id_map[V4][i], now); + icmp_timer_one(c, &icmp_id_map[V6][i], now); } } diff --git a/icmp.h b/icmp.h index 1a08594..4096c65 100644 --- a/icmp.h +++ b/icmp.h @@ -6,7 +6,7 @@ #ifndef ICMP_H #define ICMP_H -#define ICMP_TIMER_INTERVAL 1000 /* ms */ +#define ICMP_TIMER_INTERVAL 10000 /* ms */ struct ctx; -- 2.43.0
Currently icmp_tap_handler() consists of two almost disjoint paths for the IPv4 and IPv6 cases. The only thing they share is an error message. We can use some intermediate variables to refactor this to share some more code between those paths. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- icmp.c | 136 ++++++++++++++++++++++++++++----------------------------- 1 file changed, 68 insertions(+), 68 deletions(-) diff --git a/icmp.c b/icmp.c index 757ca61..40edd55 100644 --- a/icmp.c +++ b/icmp.c @@ -154,102 +154,102 @@ int icmp_tap_handler(const struct ctx *c, uint8_t pif, int af, const void *saddr, const void *daddr, const struct pool *p, const struct timespec *now) { + uint8_t proto = af == AF_INET ? IPPROTO_ICMP : IPPROTO_ICMPV6; + const char *const pname = af == AF_INET ? "ICMP" : "ICMPv6"; + union { + struct sockaddr sa; + struct sockaddr_in sa4; + struct sockaddr_in6 sa6; + } sa = { .sa.sa_family = af }; + const socklen_t sl = af == AF_INET ? sizeof(sa.sa4) : sizeof(sa.sa6); + struct icmp_id_sock *id_sock; + uint16_t id, seq; size_t plen; + void *pkt; + int s; (void)saddr; (void)pif; if (af == AF_INET) { - struct sockaddr_in sa = { - .sin_family = AF_INET, - }; - union icmp_epoll_ref iref; const struct icmphdr *ih; - int id, s; - ih = packet_get(p, 0, 0, sizeof(*ih), &plen); - if (!ih) + if (!(pkt = packet_get(p, 0, 0, sizeof(*ih), &plen))) return 1; + ih = (struct icmphdr *)pkt; + plen += sizeof(*ih); + if (ih->type != ICMP_ECHO) return 1; - iref.id = id = ntohs(ih->un.echo.id); + id = ntohs(ih->un.echo.id); + id_sock = &icmp_id_map[V4][id]; + seq = ntohs(ih->un.echo.sequence); + sa.sa4.sin_addr = *(struct in_addr *)daddr; + } else if (af == AF_INET6) { + const struct icmp6hdr *ih; - if ((s = icmp_id_map[V4][id].sock) < 0) { - s = sock_l4(c, AF_INET, IPPROTO_ICMP, &c->ip4.addr_out, - c->ip4.ifname_out, 0, iref.u32); - if (s < 0) - goto fail_sock; - if (s > FD_REF_MAX) { - close(s); - return 1; - } + if (!(pkt = packet_get(p, 0, 0, sizeof(*ih), &plen))) + return 1; - icmp_id_map[V4][id].sock = s; + ih = (struct icmp6hdr *)pkt; + plen += sizeof(*ih); - debug("ICMP: new socket %i for echo ID %i", s, id); - } - icmp_id_map[V4][id].ts = now->tv_sec; + if (ih->icmp6_type != ICMPV6_ECHO_REQUEST) + return 1; + + id = ntohs(ih->icmp6_identifier); + id_sock = &icmp_id_map[V6][id]; + seq = ntohs(ih->icmp6_sequence); + sa.sa6.sin6_addr = *(struct in6_addr *)daddr; + sa.sa6.sin6_scope_id = c->ifi6; + } else { + ASSERT(0); + } - sa.sin_addr = *(struct in_addr *)daddr; - if (sendto(s, ih, sizeof(*ih) + plen, MSG_NOSIGNAL, - (struct sockaddr *)&sa, sizeof(sa)) < 0) { - debug("ICMP: failed to relay request to socket"); + if ((s = id_sock->sock) < 0) { + union icmp_epoll_ref iref = { .id = id }; + const void *bind_addr; + const char *bind_if; + + if (af == AF_INET) { + bind_addr = &c->ip4.addr_out; + bind_if = c->ip4.ifname_out; } else { - debug("ICMP: echo request to socket, ID: %i, seq: %i", - id, ntohs(ih->un.echo.sequence)); + bind_addr = &c->ip6.addr_out; + bind_if = c->ip6.ifname_out; } - } else if (af == AF_INET6) { - struct sockaddr_in6 sa = { - .sin6_family = AF_INET6, - .sin6_scope_id = c->ifi6, - }; - union icmp_epoll_ref iref; - const struct icmp6hdr *ih; - int id, s; - ih = packet_get(p, 0, 0, sizeof(struct icmp6hdr), &plen); - if (!ih) - return 1; + s = sock_l4(c, af, proto, bind_addr, bind_if, 0, iref.u32); - if (ih->icmp6_type != ICMPV6_ECHO_REQUEST) + if (s < 0) { + warn("Cannot open \"ping\" socket. You might need to:"); + warn(" sysctl -w net.ipv4.ping_group_range=\"0 2147483647\""); + warn("...echo requests/replies will fail."); return 1; - - iref.id = id = ntohs(ih->icmp6_identifier); - if ((s = icmp_id_map[V6][id].sock) < 0) { - s = sock_l4(c, AF_INET6, IPPROTO_ICMPV6, - &c->ip6.addr_out, - c->ip6.ifname_out, 0, iref.u32); - if (s < 0) - goto fail_sock; - if (s > FD_REF_MAX) { - close(s); - return 1; - } - - icmp_id_map[V6][id].sock = s; - - debug("ICMPv6: new socket %i for echo ID %i", s, id); } - icmp_id_map[V6][id].ts = now->tv_sec; - sa.sin6_addr = *(struct in6_addr *)daddr; - if (sendto(s, ih, sizeof(*ih) + plen, MSG_NOSIGNAL, - (struct sockaddr *)&sa, sizeof(sa)) < 1) { - debug("ICMPv6: failed to relay request to socket"); - } else { - debug("ICMPv6: echo request to socket, ID: %i, seq: %i", - id, ntohs(ih->icmp6_sequence)); + if (s > FD_REF_MAX) { + close(s); + return 1; } + + id_sock->sock = s; + + debug("%s: new socket %i for echo ID %"PRIu16, pname, s, id); } - return 1; + id_sock->ts = now->tv_sec; + + if (sendto(s, pkt, plen, MSG_NOSIGNAL, &sa.sa, sl) < 0) { + debug("%s: failed to relay request to socket: %s", + pname, strerror(errno)); + } else { + debug("%s: echo request to socket, ID: %"PRIu16", seq: %"PRIu16, + pname, id, seq); + } -fail_sock: - warn("Cannot open \"ping\" socket. You might need to:"); - warn(" sysctl -w net.ipv4.ping_group_range=\"0 2147483647\""); - warn("...echo requests/replies will fail."); return 1; } -- 2.43.0
Currently we have separate handlers for ICMP and ICMPv6 ping replies. Although there are a number of points of difference, with some creative refactoring we can combine these together sensibly. Although it doesn't save a vast amount of code, it does make it clearer that we're performing basically the same steps for each case. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- icmp.c | 89 ++++++++++++++++++++++----------------------------------- icmp.h | 3 +- passt.c | 4 +-- 3 files changed, 37 insertions(+), 59 deletions(-) diff --git a/icmp.c b/icmp.c index 40edd55..88390fb 100644 --- a/icmp.c +++ b/icmp.c @@ -57,85 +57,64 @@ struct icmp_id_sock { static struct icmp_id_sock icmp_id_map[IP_VERSIONS][ICMP_NUM_IDS]; /** - * icmp_sock_handler() - Handle new data from IPv4 ICMP socket + * icmp_sock_handler() - Handle new data from ICMP or ICMPv6 socket * @c: Execution context + * @af: Address family (AF_INET or AF_INET6) * @ref: epoll reference */ -void icmp_sock_handler(const struct ctx *c, union epoll_ref ref) +void icmp_sock_handler(const struct ctx *c, int af, union epoll_ref ref) { + struct icmp_id_sock *const id_sock = af == AF_INET + ? &icmp_id_map[V4][ref.icmp.id] : &icmp_id_map[V6][ref.icmp.id]; + const char *const pname = af == AF_INET ? "ICMP" : "ICMPv6"; char buf[USHRT_MAX]; - struct icmphdr *ih = (struct icmphdr *)buf; - struct sockaddr_in sr; + union { + struct sockaddr sa; + struct sockaddr_in sa4; + struct sockaddr_in6 sa6; + } sr; socklen_t sl = sizeof(sr); - uint16_t seq, id; + uint16_t seq; ssize_t n; if (c->no_icmp) return; - n = recvfrom(ref.fd, buf, sizeof(buf), 0, (struct sockaddr *)&sr, &sl); + n = recvfrom(ref.fd, buf, sizeof(buf), 0, &sr.sa, &sl); if (n < 0) return; - seq = ntohs(ih->un.echo.sequence); - - /* Adjust the packet to have the ID the guest was using, rather than the - * host chosen value */ - id = ref.icmp.id; - ih->un.echo.id = htons(id); + if (af == AF_INET) { + struct icmphdr *ih4 = (struct icmphdr *)buf; - if (c->mode == MODE_PASTA) { - if (icmp_id_map[V4][id].seq == seq) - return; + /* Adjust packet back to guest-side ID */ + ih4->un.echo.id = htons(ref.icmp.id); + seq = ntohs(ih4->un.echo.sequence); + } else if (af == AF_INET6) { + struct icmp6hdr *ih6 = (struct icmp6hdr *)buf; - icmp_id_map[V4][id].seq = seq; + /* Adjust packet back to guest-side ID */ + ih6->icmp6_identifier = htons(ref.icmp.id); + seq = ntohs(ih6->icmp6_sequence); + } else { + ASSERT(0); } - debug("ICMP: echo reply to tap, ID: %i, seq: %i", id, seq); - - tap_icmp4_send(c, sr.sin_addr, tap_ip4_daddr(c), buf, n); -} - -/** - * icmpv6_sock_handler() - Handle new data from ICMPv6 socket - * @c: Execution context - * @ref: epoll reference - */ -void icmpv6_sock_handler(const struct ctx *c, union epoll_ref ref) -{ - char buf[USHRT_MAX]; - struct icmp6hdr *ih = (struct icmp6hdr *)buf; - struct sockaddr_in6 sr; - socklen_t sl = sizeof(sr); - uint16_t seq, id; - ssize_t n; - - if (c->no_icmp) - return; - - n = recvfrom(ref.fd, buf, sizeof(buf), 0, (struct sockaddr *)&sr, &sl); - if (n < 0) - return; - - seq = ntohs(ih->icmp6_sequence); - - /* Adjust the packet to have the ID the guest was using, rather than the - * host chosen value */ - id = ref.icmp.id; - ih->icmp6_identifier = htons(id); - /* In PASTA mode, we'll get any reply we send, discard them. */ if (c->mode == MODE_PASTA) { - if (icmp_id_map[V6][id].seq == seq) + if (id_sock->seq == seq) return; - icmp_id_map[V6][id].seq = seq; + id_sock->seq = seq; } - debug("ICMPv6: echo reply to tap, ID: %i, seq: %i", id, seq); - - tap_icmp6_send(c, &sr.sin6_addr, - tap_ip6_daddr(c, &sr.sin6_addr), buf, n); + debug("%s: echo reply to tap, ID: %"PRIu16", seq: %"PRIu16, pname, + ref.icmp.id, seq); + if (af == AF_INET) + tap_icmp4_send(c, sr.sa4.sin_addr, tap_ip4_daddr(c), buf, n); + else if (af == AF_INET6) + tap_icmp6_send(c, &sr.sa6.sin6_addr, + tap_ip6_daddr(c, &sr.sa6.sin6_addr), buf, n); } /** diff --git a/icmp.h b/icmp.h index 4096c65..0083597 100644 --- a/icmp.h +++ b/icmp.h @@ -10,8 +10,7 @@ struct ctx; -void icmp_sock_handler(const struct ctx *c, union epoll_ref ref); -void icmpv6_sock_handler(const struct ctx *c, union epoll_ref ref); +void icmp_sock_handler(const struct ctx *c, int af, union epoll_ref ref); int icmp_tap_handler(const struct ctx *c, uint8_t pif, int af, const void *saddr, const void *daddr, const struct pool *p, const struct timespec *now); diff --git a/passt.c b/passt.c index d315438..44d3a0b 100644 --- a/passt.c +++ b/passt.c @@ -392,10 +392,10 @@ loop: udp_sock_handler(&c, ref, eventmask, &now); break; case EPOLL_TYPE_ICMP: - icmp_sock_handler(&c, ref); + icmp_sock_handler(&c, AF_INET, ref); break; case EPOLL_TYPE_ICMPV6: - icmpv6_sock_handler(&c, ref); + icmp_sock_handler(&c, AF_INET6, ref); break; default: /* Can't happen */ -- 2.43.0
Currently we silently ignore an errors receiving a packet from a ping socket. We don't expect that to happen, so it's probably worth reporting if it does. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- icmp.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/icmp.c b/icmp.c index 88390fb..79f6c8c 100644 --- a/icmp.c +++ b/icmp.c @@ -81,8 +81,11 @@ void icmp_sock_handler(const struct ctx *c, int af, union epoll_ref ref) return; n = recvfrom(ref.fd, buf, sizeof(buf), 0, &sr.sa, &sl); - if (n < 0) + if (n < 0) { + warn("%s: recvfrom() error on ping socket: %s", + pname, strerror(errno)); return; + } if (af == AF_INET) { struct icmphdr *ih4 = (struct icmphdr *)buf; -- 2.43.0
We access fields of packets received from ping sockets assuming they're echo replies, without actually checking that. Of course, we don't expect anything else from the kernel, but it's probably best to verify. While we're at it, also check for short packets, or a receive address of the wrong family. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- icmp.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/icmp.c b/icmp.c index 79f6c8c..a9dc436 100644 --- a/icmp.c +++ b/icmp.c @@ -86,16 +86,25 @@ void icmp_sock_handler(const struct ctx *c, int af, union epoll_ref ref) pname, strerror(errno)); return; } + if (sr.sa.sa_family != af) + goto unexpected; if (af == AF_INET) { struct icmphdr *ih4 = (struct icmphdr *)buf; + if ((size_t)n < sizeof(*ih4) || ih4->type != ICMP_ECHOREPLY) + goto unexpected; + /* Adjust packet back to guest-side ID */ ih4->un.echo.id = htons(ref.icmp.id); seq = ntohs(ih4->un.echo.sequence); } else if (af == AF_INET6) { struct icmp6hdr *ih6 = (struct icmp6hdr *)buf; + if ((size_t)n < sizeof(*ih6) || + ih6->icmp6_type != ICMPV6_ECHO_REPLY) + goto unexpected; + /* Adjust packet back to guest-side ID */ ih6->icmp6_identifier = htons(ref.icmp.id); seq = ntohs(ih6->icmp6_sequence); @@ -118,6 +127,10 @@ void icmp_sock_handler(const struct ctx *c, int af, union epoll_ref ref) else if (af == AF_INET6) tap_icmp6_send(c, &sr.sa6.sin6_addr, tap_ip6_daddr(c, &sr.sa6.sin6_addr), buf, n); + return; + +unexpected: + warn("%s: Unexpected packet on ping socket", pname); } /** -- 2.43.0
ICMP sockets are cleaned up on a timeout implemented in icmp_timer_one(), and the logic to do that cleanup is open coded in that function. Similarly new sockets are opened when we discover we don't have an existing one in icmp_tap_handler(), and again the logic is open-coded. That's not the worst thing, but it's a bit cleaner to have dedicated functions for the creation and destruction of ping sockets. This will also make things a bit easier for future changes we have in mind. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- icmp.c | 102 +++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 67 insertions(+), 35 deletions(-) diff --git a/icmp.c b/icmp.c index a9dc436..9434fc5 100644 --- a/icmp.c +++ b/icmp.c @@ -133,6 +133,70 @@ unexpected: warn("%s: Unexpected packet on ping socket", pname); } +/** + * icmp_ping_close() - Close and clean up a ping socket + * @c: Execution context + * @id_sock: Socket number and other info + */ +static void icmp_ping_close(const struct ctx *c, struct icmp_id_sock *id_sock) +{ + epoll_ctl(c->epollfd, EPOLL_CTL_DEL, id_sock->sock, NULL); + close(id_sock->sock); + id_sock->sock = -1; + id_sock->seq = -1; +} + +/** + * icmp_ping_new() - Prepare a new ping socket for a new id + * @c: Execution context + * @id_sock: Socket fd and other information + * @af: Address family, AF_INET or AF_INET6 + * @id: ICMP id for the new socket + * + * Return: Newly opened ping socket fd, or -1 on failure + */ +static int icmp_ping_new(const struct ctx *c, struct icmp_id_sock *id_sock, + int af, uint16_t id) +{ + uint8_t proto = af == AF_INET ? IPPROTO_ICMP : IPPROTO_ICMPV6; + const char *const pname = af == AF_INET ? "ICMP" : "ICMPv6"; + union icmp_epoll_ref iref = { .id = id }; + const void *bind_addr; + const char *bind_if; + int s; + + if (af == AF_INET) { + bind_addr = &c->ip4.addr_out; + bind_if = c->ip4.ifname_out; + } else { + bind_addr = &c->ip6.addr_out; + bind_if = c->ip6.ifname_out; + } + + s = sock_l4(c, af, proto, bind_addr, bind_if, 0, iref.u32); + + if (s < 0) { + warn("Cannot open \"ping\" socket. You might need to:"); + warn(" sysctl -w net.ipv4.ping_group_range=\"0 2147483647\""); + warn("...echo requests/replies will fail."); + goto cancel; + } + + if (s > FD_REF_MAX) + goto cancel; + + id_sock->sock = s; + + debug("%s: new socket %i for echo ID %"PRIu16, pname, s, id); + + return s; + +cancel: + if (s >= 0) + close(s); + return -1; +} + /** * icmp_tap_handler() - Handle packets from tap * @c: Execution context @@ -149,7 +213,6 @@ int icmp_tap_handler(const struct ctx *c, uint8_t pif, int af, const void *saddr, const void *daddr, const struct pool *p, const struct timespec *now) { - uint8_t proto = af == AF_INET ? IPPROTO_ICMP : IPPROTO_ICMPV6; const char *const pname = af == AF_INET ? "ICMP" : "ICMPv6"; union { struct sockaddr sa; @@ -203,37 +266,9 @@ int icmp_tap_handler(const struct ctx *c, uint8_t pif, int af, ASSERT(0); } - if ((s = id_sock->sock) < 0) { - union icmp_epoll_ref iref = { .id = id }; - const void *bind_addr; - const char *bind_if; - - if (af == AF_INET) { - bind_addr = &c->ip4.addr_out; - bind_if = c->ip4.ifname_out; - } else { - bind_addr = &c->ip6.addr_out; - bind_if = c->ip6.ifname_out; - } - - s = sock_l4(c, af, proto, bind_addr, bind_if, 0, iref.u32); - - if (s < 0) { - warn("Cannot open \"ping\" socket. You might need to:"); - warn(" sysctl -w net.ipv4.ping_group_range=\"0 2147483647\""); - warn("...echo requests/replies will fail."); - return 1; - } - - if (s > FD_REF_MAX) { - close(s); + if ((s = id_sock->sock) < 0) + if ((s = icmp_ping_new(c, id_sock, af, id)) < 0) return 1; - } - - id_sock->sock = s; - - debug("%s: new socket %i for echo ID %"PRIu16, pname, s, id); - } id_sock->ts = now->tv_sec; @@ -260,10 +295,7 @@ static void icmp_timer_one(const struct ctx *c, struct icmp_id_sock *id_sock, if (id_sock->sock < 0 || now->tv_sec - id_sock->ts <= ICMP_ECHO_TIMEOUT) return; - epoll_ctl(c->epollfd, EPOLL_CTL_DEL, id_sock->sock, NULL); - close(id_sock->sock); - id_sock->sock = -1; - id_sock->seq = -1; + icmp_ping_close(c, id_sock); } /** -- 2.43.0
On Tue, 16 Jan 2024 16:16:07 +1100 David Gibson <david(a)gibson.dropbear.id.au> wrote:As with TCP, it turns out that there are a bunch of clean ups and reworks to the ICMP code which will make integration with the flow table easier, even before introducing a non-trivial version of the flow table itself. Based on the flow based dispatch/allocation, and bind/addressing cleanup series.Applied. -- Stefano