+/** + * udp_send_conn_fail_icmp4() - Construct and send ICMPv4 to local peer + * @c: Execution context + * @ee: Extended error descriptor + * @ref: epoll reference + * @in: First bytes (max 8) of original UDP message body + * @dlen: Length of the read part of original UDP message body + */ +static void udp_send_conn_fail_icmp4(const struct ctx *c, + const struct sock_extended_err *ee, + const struct flowside *toside, + void *in, size_t dlen) +{ + struct in_addr oaddr = toside->oaddr.v4mapped.a4; + struct in_addr eaddr = toside->eaddr.v4mapped.a4; + in_port_t eport = toside->eport; + in_port_t oport = toside->oport; + struct { + struct icmphdr icmp4h; + struct iphdr ip4h; + struct udphdr uh; + char data[ICMP4_MAX_DLEN]; + } __attribute__((packed, aligned(__alignof__(max_align_t)))) msg; + size_t msglen = sizeof(msg) - sizeof(msg.data) + dlen; + + ASSERT(dlen <= ICMP4_MAX_DLEN); + memset(&msg, 0, sizeof(msg)); + msg.icmp4h.type = ee->ee_type; + msg.icmp4h.code = ee->ee_code; + if (ee->ee_type == ICMP_DEST_UNREACH && ee->ee_code == ICMP_FRAG_NEEDED) + msg.icmp4h.un.frag.mtu = htons((uint16_t) ee->ee_info); + + /* Reconstruct the original headers as returned in the ICMP message */ + tap_push_ip4h(&msg.ip4h, eaddr, oaddr, dlen, IPPROTO_UDP); + tap_push_uh4(&msg.uh, eaddr, eport, oaddr, oport, in, dlen); + memcpy(&msg.data, in, dlen); + + tap_icmp4_send(c, oaddr, eaddr, &msg, msglen); +}The destination IP of the origin packet might not be the source IP of an ICMP error message, if a router sent this ICMP error message. Increase local MTU and try this program: ``` #packet-too-big.py #ip link set eth0 mtu 1520 from socket import * import time IP_RECVERR=0xb IP_MTU_DISCOVER=0xa IP_PMTUDISC_PROBE=0x3 with socket(AF_INET,SOCK_DGRAM,IPPROTO_UDP) as sock: sock.setsockopt(IPPROTO_IP,IP_RECVERR,1) sock.setsockopt(IPPROTO_IP,IP_MTU_DISCOVER,IP_PMTUDISC_PROBE) bigPacket=bytes(1480) sock.sendto(bigPacket,("151.101.1.6",443)) time.sleep(0.1) print(sock.recvmsg(1480,1024,MSG_ERRQUEUE)) ```if (ref.type == EPOLL_TYPE_UDP_REPLY) { flow_sidx_t sidx = flow_sidx_opposite(ref.flowside); const struct flowside *toside = flowside_at_sidx(sidx); - - udp_send_conn_fail_icmp4(c, ee, toside, data, rc); + size_t dlen = rc; + + if (hdr->cmsg_level == IPPROTO_IP) { + dlen = MIN(dlen, ICMP4_MAX_DLEN); + udp_send_conn_fail_icmp4(c, ee, toside, data, dlen); + } else if (hdr->cmsg_level == IPPROTO_IPV6) { + udp_send_conn_fail_icmp6(c, ee, toside, data, + dlen, sidx.flowi); + } } else { trace("Ignoring received IP_RECVERR cmsg on listener socket"); }If the socket is dual-stack, cmsg_level may not match cmsg_data. ``` #dual-stack-test.py from socket import * import time IP_RECVERR=0xb with socket(AF_INET6,SOCK_DGRAM,IPPROTO_UDP) as sock: sock.setsockopt(IPPROTO_IP,IP_RECVERR,1) sock.setsockopt(IPPROTO_IP,IP_TTL,1) packet=bytes(8) sock.sendto(packet,("::ffff:151.101.1.6",443)) time.sleep(0.1) print(sock.recvmsg(1472,1024,MSG_ERRQUEUE)) ```
On 2025-03-03 07:07, 7ppKb5bW wrote:You are right. The original scope of this series was only to handle ICMP_PORT_UNREACH/ICMP6_DST_UNREACH_NOPORT messages, but now that we inlcude more ICMP types it becomes different, of course. This is easy to fix, though, so I will post a new version where I do that.+/** + * udp_send_conn_fail_icmp4() - Construct and send ICMPv4 to local peer + * @c: Execution context + * @ee: Extended error descriptor + * @ref: epoll reference + * @in: First bytes (max 8) of original UDP message body + * @dlen: Length of the read part of original UDP message body + */ +static void udp_send_conn_fail_icmp4(const struct ctx *c, + const struct sock_extended_err *ee, + const struct flowside *toside, + void *in, size_t dlen) +{ + struct in_addr oaddr = toside->oaddr.v4mapped.a4; + struct in_addr eaddr = toside->eaddr.v4mapped.a4; + in_port_t eport = toside->eport; + in_port_t oport = toside->oport; + struct { + struct icmphdr icmp4h; + struct iphdr ip4h; + struct udphdr uh; + char data[ICMP4_MAX_DLEN]; + } __attribute__((packed, aligned(__alignof__(max_align_t)))) msg; + size_t msglen = sizeof(msg) - sizeof(msg.data) + dlen; + + ASSERT(dlen <= ICMP4_MAX_DLEN); + memset(&msg, 0, sizeof(msg)); + msg.icmp4h.type = ee->ee_type; + msg.icmp4h.code = ee->ee_code; + if (ee->ee_type == ICMP_DEST_UNREACH && ee->ee_code == ICMP_FRAG_NEEDED) + msg.icmp4h.un.frag.mtu = htons((uint16_t) ee->ee_info); + + /* Reconstruct the original headers as returned in the ICMP message */ + tap_push_ip4h(&msg.ip4h, eaddr, oaddr, dlen, IPPROTO_UDP); + tap_push_uh4(&msg.uh, eaddr, eport, oaddr, oport, in, dlen); + memcpy(&msg.data, in, dlen); + + tap_icmp4_send(c, oaddr, eaddr, &msg, msglen); +}The destination IP of the origin packet might not be the source IP of an ICMP error message, if a router sent this ICMP error message. Increase local MTU and try this program: ``` #packet-too-big.py #ip link set eth0 mtu 1520 from socket import * import time IP_RECVERR=0xb IP_MTU_DISCOVER=0xa IP_PMTUDISC_PROBE=0x3 with socket(AF_INET,SOCK_DGRAM,IPPROTO_UDP) as sock: sock.setsockopt(IPPROTO_IP,IP_RECVERR,1) sock.setsockopt(IPPROTO_IP,IP_MTU_DISCOVER,IP_PMTUDISC_PROBE) bigPacket=bytes(1480) sock.sendto(bigPacket,("151.101.1.6",443)) time.sleep(0.1) print(sock.recvmsg(1480,1024,MSG_ERRQUEUE)) ```Yes, this was mentioned at some point during our discussions, and we should eventually handle it, but it is really outside the scope of the current series. ///jonif (ref.type == EPOLL_TYPE_UDP_REPLY) { flow_sidx_t sidx = flow_sidx_opposite(ref.flowside); const struct flowside *toside = flowside_at_sidx(sidx); - - udp_send_conn_fail_icmp4(c, ee, toside, data, rc); + size_t dlen = rc; + + if (hdr->cmsg_level == IPPROTO_IP) { + dlen = MIN(dlen, ICMP4_MAX_DLEN); + udp_send_conn_fail_icmp4(c, ee, toside, data, dlen); + } else if (hdr->cmsg_level == IPPROTO_IPV6) { + udp_send_conn_fail_icmp6(c, ee, toside, data, + dlen, sidx.flowi); + } } else { trace("Ignoring received IP_RECVERR cmsg on listener socket"); }If the socket is dual-stack, cmsg_level may not match cmsg_data. ``` #dual-stack-test.py from socket import * import time IP_RECVERR=0xb with socket(AF_INET6,SOCK_DGRAM,IPPROTO_UDP) as sock: sock.setsockopt(IPPROTO_IP,IP_RECVERR,1) sock.setsockopt(IPPROTO_IP,IP_TTL,1) packet=bytes(8) sock.sendto(packet,("::ffff:151.101.1.6",443)) time.sleep(0.1) print(sock.recvmsg(1472,1024,MSG_ERRQUEUE)) ```
On Mon, Mar 03, 2025 at 11:41:48AM -0500, Jon Maloy wrote:On 2025-03-03 07:07, 7ppKb5bW wrote:[snip]In fact, I think the point our mystery person is making here is subtly different from the v4 vs v6 thing I raised earlier in the discussion. I was talking about possible future cases where we might be using a different IP version on the host than we're using for the guest (e.g. CLAT). For that we'd need to pick our generated ICMP version based on the addresses in the flow - and rather more complicated we'd need to be prepared to do translation between ICMPv4 and ICMPv6 errors. The point that 7ppKb5bW is raising here, is that a dual-stack socket (mostly) counts as an IPv6 socket and will show the cmsg_level accordingly, even though it can also receive ICMPv4 errors. I don't think it's urgent to fix this before merging the series, because we're only handling errors on reply sockets for now, and we don't use dual-stack sockets for those. We should fix it as a follow up, though, which I believe we can do fairly easily by looking at ee_origin, instead of cmsg_level. -- David Gibson (he or they) | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you, not the other way | around. http://www.ozlabs.org/~dgibsonYes, this was mentioned at some point during our discussions, and we should eventually handle it, but it is really outside the scope of the current series.- udp_send_conn_fail_icmp4(c, ee, toside, data, rc); + size_t dlen = rc; + + if (hdr->cmsg_level == IPPROTO_IP) { + dlen = MIN(dlen, ICMP4_MAX_DLEN); + udp_send_conn_fail_icmp4(c, ee, toside, data, dlen); + } else if (hdr->cmsg_level == IPPROTO_IPV6) { + udp_send_conn_fail_icmp6(c, ee, toside, data, + dlen, sidx.flowi); + } } else { trace("Ignoring received IP_RECVERR cmsg on listener socket"); }If the socket is dual-stack, cmsg_level may not match cmsg_data. ``` #dual-stack-test.py from socket import * import time IP_RECVERR=0xb with socket(AF_INET6,SOCK_DGRAM,IPPROTO_UDP) as sock: sock.setsockopt(IPPROTO_IP,IP_RECVERR,1) sock.setsockopt(IPPROTO_IP,IP_TTL,1) packet=bytes(8) sock.sendto(packet,("::ffff:151.101.1.6",443)) time.sleep(0.1) print(sock.recvmsg(1472,1024,MSG_ERRQUEUE)) ```