[PATCH 0/9] Improve handling of MTU limits

newer
[PATCH v2 0/9] Improve handling of...

David Gibson

11 Mar 2025 11 Mar '25

7:03 a.m.

After considerable lead up, this corrects the handling of the --mtu option so that it will respect limits imposed by the backend tap layers, correctly accounting for L2 headers. This incorporates the earlier mode setting patches, on which the rest depend. There's been no change in those patches, there just included here for self-containedness. David Gibson (9): conf: Use the same optstring for passt and pasta modes conf: Move mode detection into helper function conf: Detect vhost-user mode earlier packet: Give explicit name to maximum packet size packet: Remove redundant TAP_BUF_BYTES define tap: Use explicit defines for maximum length of L2 frame Simplify sizing of pkt_buf pcap: Correctly set snaplen based on tap backend type conf: Limit maximum MTU based on backend frame size conf.c | 98 +++++++++++++++++++++++++++++++++++++++++--------------- conf.h | 1 + packet.c | 4 +-- packet.h | 3 ++ passt.c | 16 ++------- passt.h | 7 ++-- pcap.c | 46 +++++++++++++------------- tap.c | 39 +++++++++++++++++++--- tap.h | 26 +++++++++++++++ util.h | 3 -- 10 files changed, 168 insertions(+), 75 deletions(-) -- 2.48.1

Show replies by date

David Gibson

11 Mar 11 Mar

7:03 a.m.

New subject: [PATCH 1/9] conf: Use the same optstring for passt and pasta modes

Currently we rely on detecting our mode first and use different sets of (single character) options for each. This means that if you use an option valid in only one mode in another you'll get the generic usage() message. We can give more helpful errors with little extra effort by combining all the options into a single value of the option string and giving bespoke messages if an option for the wrong mode is used; in fact we already did this for some single mode options like '-1'. Signed-off-by: David Gibson --- conf.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/conf.c b/conf.c index 065e7201..7f20bc84 100644 --- a/conf.c +++ b/conf.c @@ -1388,6 +1388,7 @@ void conf(struct ctx *c, int argc, char **argv) {"repair-path", required_argument, NULL, 28 }, { 0 }, }; + const char *optstring = "+dqfel:hs:F:I:p:P:m:a:n:M:g:i:o:D:S:H:461t:u:T:U:"; const char *logname = (c->mode == MODE_PASTA) ? "pasta" : "passt"; char userns[PATH_MAX] = { 0 }, netns[PATH_MAX] = { 0 }; bool copy_addrs_opt = false, copy_routes_opt = false; @@ -1397,7 +1398,6 @@ void conf(struct ctx *c, int argc, char **argv) struct fqdn *dnss = c->dns_search; unsigned int ifi4 = 0, ifi6 = 0; const char *logfile = NULL; - const char *optstring; size_t logsize = 0; char *runas = NULL; long fd_tap_opt; @@ -1408,9 +1408,6 @@ void conf(struct ctx *c, int argc, char **argv) if (c->mode == MODE_PASTA) { c->no_dhcp_dns = c->no_dhcp_dns_search = 1; fwd_default = FWD_AUTO; - optstring = "+dqfel:hF:I:p:P:m:a:n:M:g:i:o:D:S:H:46t:u:T:U:"; - } else { - optstring = "+dqfel:hs:F:p:P:m:a:n:M:g:i:o:D:S:H:461t:u:"; } c->mtu = ROUND_DOWN(ETH_MAX_MTU - ETH_HLEN, sizeof(uint32_t)); @@ -1614,6 +1611,9 @@ void conf(struct ctx *c, int argc, char **argv) c->foreground = 1; break; case 's': + if (c->mode == MODE_PASTA) + die("-s is for passt / vhost-user mode only"); + ret = snprintf(c->sock_path, sizeof(c->sock_path), "%s", optarg); if (ret <= 0 || ret >= (int)sizeof(c->sock_path)) @@ -1634,6 +1634,9 @@ void conf(struct ctx *c, int argc, char **argv) *c->sock_path = 0; break; case 'I': + if (c->mode != MODE_PASTA) + die("-I is for pasta mode only"); + ret = snprintf(c->pasta_ifn, IFNAMSIZ, "%s", optarg); if (ret <= 0 || ret >= IFNAMSIZ) @@ -1790,11 +1793,16 @@ void conf(struct ctx *c, int argc, char **argv) break; case 't': case 'u': - case 'T': - case 'U': case 'D': /* Handle these later, once addresses are configured */ break; + case 'T': + case 'U': + if (c->mode != MODE_PASTA) + die("-%c is for pasta mode only", name); + + /* Handle properly later, once addresses are configured */ + break; case 'h': usage(argv[0], stdout, EXIT_SUCCESS); break; -- 2.48.1

David Gibson

7:03 a.m.

New subject: [PATCH 2/9] conf: Move mode detection into helper function

One of the first things we need to do is determine if we're in passt mode or pasta mode. Currently this is open-coded in main(), by examining argv[0]. We want to complexify this a bit in future to cover vhost-user mode as well. Prepare for this, by moving the mode detection into a new conf_mode() function. Signed-off-by: David Gibson --- conf.c | 26 ++++++++++++++++++++++++++ conf.h | 1 + passt.c | 14 ++------------ 3 files changed, 29 insertions(+), 12 deletions(-) diff --git a/conf.c b/conf.c index 7f20bc84..2022ea1d 100644 --- a/conf.c +++ b/conf.c @@ -991,6 +991,32 @@ pasta_opts: _exit(status); } +/** + * conf_mode() - Determine passt/pasta's operating mode from command line + * @argc: Argument count + * @argv: Command line arguments + * + * Return: mode to operate in, PASTA or PASST + */ +/* cppcheck-suppress constParameter */ +enum passt_modes conf_mode(int argc, char *argv[]) +{ + char argv0[PATH_MAX], *basearg0; + + if (argc < 1) + die("Cannot determine argv[0]"); + + strncpy(argv0, argv[0], PATH_MAX - 1); + basearg0 = basename(argv0); + if (strstr(basearg0, "pasta")) + return MODE_PASTA; + + if (strstr(basearg0, "passt")) + return MODE_PASST; + + die("Cannot determine mode, invoke as \"passt\" or \"pasta\""); +} + /** * conf_print() - Print fundamental configuration parameters * @c: Execution context diff --git a/conf.h b/conf.h index 9d2143db..b45ad746 100644 --- a/conf.h +++ b/conf.h @@ -6,6 +6,7 @@ #ifndef CONF_H #define CONF_H +enum passt_modes conf_mode(int argc, char *argv[]); void conf(struct ctx *c, int argc, char **argv); #endif /* CONF_H */ diff --git a/passt.c b/passt.c index 868842b0..0bd2a292 100644 --- a/passt.c +++ b/passt.c @@ -191,7 +191,6 @@ int main(int argc, char **argv) { struct epoll_event events[EPOLL_EVENTS]; int nfds, i, devnull_fd = -1; - char argv0[PATH_MAX], *name; struct ctx c = { 0 }; struct rlimit limit; struct timespec now; @@ -213,21 +212,12 @@ int main(int argc, char **argv) sigaction(SIGTERM, &sa, NULL); sigaction(SIGQUIT, &sa, NULL); - if (argc < 1) - _exit(EXIT_FAILURE); + c.mode = conf_mode(argc, argv); - strncpy(argv0, argv[0], PATH_MAX - 1); - name = basename(argv0); - if (strstr(name, "pasta")) { + if (c.mode == MODE_PASTA) { sa.sa_handler = pasta_child_handler; if (sigaction(SIGCHLD, &sa, NULL)) die_perror("Couldn't install signal handlers"); - - c.mode = MODE_PASTA; - } else if (strstr(name, "passt")) { - c.mode = MODE_PASST; - } else { - _exit(EXIT_FAILURE); } if (signal(SIGPIPE, SIG_IGN) == SIG_ERR) -- 2.48.1

David Gibson

7:03 a.m.

New subject: [PATCH 3/9] conf: Detect vhost-user mode earlier

We detect our operating mode in conf_mode(), unless we're using vhost-user mode, in which case we change it later when we parse the --vhost-user option. That means we need to delay parsing the --repair-path option (for vhost-user only) until still later. However, there are many other places in the main option parsing loop which also rely on mode. We get away with those, because they happen to be able to treat passt and vhost-user modes identically. This is potentially confusing, though. So, move setting of MODE_VU into conf_mode() so c->mode always has its final value from that point onwards. To match, we move the parsing of --repair-path back into the main option parsing loop. Signed-off-by: David Gibson --- conf.c | 43 ++++++++++++++++++++++++++----------------- 1 file changed, 26 insertions(+), 17 deletions(-) diff --git a/conf.c b/conf.c index 2022ea1d..b58e2a6e 100644 --- a/conf.c +++ b/conf.c @@ -998,10 +998,23 @@ pasta_opts: * * Return: mode to operate in, PASTA or PASST */ -/* cppcheck-suppress constParameter */ enum passt_modes conf_mode(int argc, char *argv[]) { + int vhost_user = 0; + const struct option optvu[] = { + {"vhost-user", no_argument, &vhost_user, 1 }, + { 0 }, + }; char argv0[PATH_MAX], *basearg0; + int name; + + optind = 0; + do { + name = getopt_long(argc, argv, "-:", optvu, NULL); + } while (name != -1); + + if (vhost_user) + return MODE_VU; if (argc < 1) die("Cannot determine argv[0]"); @@ -1604,9 +1617,8 @@ void conf(struct ctx *c, int argc, char **argv) die("Invalid host nameserver address: %s", optarg); case 25: - if (c->mode == MODE_PASTA) - die("--vhost-user is for passt mode only"); - c->mode = MODE_VU; + /* Already handled in conf_mode() */ + ASSERT(c->mode == MODE_VU); break; case 26: vu_print_capabilities(); @@ -1617,7 +1629,14 @@ void conf(struct ctx *c, int argc, char **argv) die("Invalid FQDN: %s", optarg); break; case 28: - /* Handle this once we checked --vhost-user */ + if (c->mode != MODE_VU && strcmp(optarg, "none")) + die("--repair-path is for vhost-user mode only"); + + if (snprintf_check(c->repair_path, + sizeof(c->repair_path), "%s", + optarg)) + die("Invalid passt-repair path: %s", optarg); + break; case 'd': c->debug = 1; @@ -1917,8 +1936,8 @@ void conf(struct ctx *c, int argc, char **argv) if (c->ifi4 && IN4_IS_ADDR_UNSPECIFIED(&c->ip4.guest_gw)) c->no_dhcp = 1; - /* Inbound port options, DNS, and --repair-path can be parsed now, after - * IPv4/IPv6 settings and --vhost-user. + /* Inbound port options and DNS can be parsed now, after IPv4/IPv6 + * settings */ fwd_probe_ephemeral(); udp_portmap_clear(); @@ -1964,16 +1983,6 @@ void conf(struct ctx *c, int argc, char **argv) } die("Cannot use DNS address %s", optarg); - } else if (name == 28) { - if (c->mode != MODE_VU && strcmp(optarg, "none")) - die("--repair-path is for vhost-user mode only"); - - if (snprintf_check(c->repair_path, - sizeof(c->repair_path), "%s", - optarg)) - die("Invalid passt-repair path: %s", optarg); - - break; } } while (name != -1); -- 2.48.1

Stefano Brivio

11:45 p.m.

New subject: [PATCH 3/9] conf: Detect vhost-user mode earlier

On Tue, 11 Mar 2025 17:03:12 +1100 David Gibson wrote:

...

We detect our operating mode in conf_mode(), unless we're using vhost-user mode, in which case we change it later when we parse the --vhost-user option. That means we need to delay parsing the --repair-path option (for vhost-user only) until still later.

However, there are many other places in the main option parsing loop which also rely on mode. We get away with those, because they happen to be able to treat passt and vhost-user modes identically. This is potentially confusing, though. So, move setting of MODE_VU into conf_mode() so c->mode always has its final value from that point onwards.

To match, we move the parsing of --repair-path back into the main option parsing loop.

Signed-off-by: David Gibson --- conf.c | 43 ++++++++++++++++++++++++++----------------- 1 file changed, 26 insertions(+), 17 deletions(-)

diff --git a/conf.c b/conf.c index 2022ea1d..b58e2a6e 100644 --- a/conf.c +++ b/conf.c @@ -998,10 +998,23 @@ pasta_opts: * * Return: mode to operate in, PASTA or PASST */ -/* cppcheck-suppress constParameter */ enum passt_modes conf_mode(int argc, char *argv[]) { + int vhost_user = 0; + const struct option optvu[] = { + {"vhost-user", no_argument, &vhost_user, 1 }, + { 0 }, + }; char argv0[PATH_MAX], *basearg0; + int name; + + optind = 0; + do { + name = getopt_long(argc, argv, "-:", optvu, NULL); + } while (name != -1); + + if (vhost_user) + return MODE_VU;

if (argc < 1) die("Cannot determine argv[0]"); @@ -1604,9 +1617,8 @@ void conf(struct ctx *c, int argc, char **argv)

die("Invalid host nameserver address: %s", optarg); case 25: - if (c->mode == MODE_PASTA) - die("--vhost-user is for passt mode only");

This check should now be moved to conf_mode() instead of being dropped, otherwise you can do: $ ./pasta -f --vhost-user and at this point, the mode is MODE_VU, so it's all fine, but I don't think it's intended (...or is it?).

...

- c->mode = MODE_VU; + /* Already handled in conf_mode() */ + ASSERT(c->mode == MODE_VU); break; case 26: vu_print_capabilities();

Pre-existing, but now we can fix this: case 26 (--print-capabilities) should only be accepted if (c->mode == MODE_VU). It can also be done in another patch I would say, if you don't want to re-spin this. -- Stefano

David Gibson

12 Mar 12 Mar

1:48 a.m.

New subject: [PATCH 3/9] conf: Detect vhost-user mode earlier

On Tue, Mar 11, 2025 at 11:45:03PM +0100, Stefano Brivio wrote:

...

On Tue, 11 Mar 2025 17:03:12 +1100 David Gibson wrote:

...
We detect our operating mode in conf_mode(), unless we're using vhost-user mode, in which case we change it later when we parse the --vhost-user option. That means we need to delay parsing the --repair-path option (for vhost-user only) until still later.

However, there are many other places in the main option parsing loop which also rely on mode. We get away with those, because they happen to be able to treat passt and vhost-user modes identically. This is potentially confusing, though. So, move setting of MODE_VU into conf_mode() so c->mode always has its final value from that point onwards.

To match, we move the parsing of --repair-path back into the main option parsing loop.

Signed-off-by: David Gibson --- conf.c | 43 ++++++++++++++++++++++++++----------------- 1 file changed, 26 insertions(+), 17 deletions(-)

diff --git a/conf.c b/conf.c index 2022ea1d..b58e2a6e 100644 --- a/conf.c +++ b/conf.c @@ -998,10 +998,23 @@ pasta_opts: * * Return: mode to operate in, PASTA or PASST */ -/* cppcheck-suppress constParameter */ enum passt_modes conf_mode(int argc, char *argv[]) { + int vhost_user = 0; + const struct option optvu[] = { + {"vhost-user", no_argument, &vhost_user, 1 }, + { 0 }, + }; char argv0[PATH_MAX], *basearg0; + int name; + + optind = 0; + do { + name = getopt_long(argc, argv, "-:", optvu, NULL); + } while (name != -1); + + if (vhost_user) + return MODE_VU;

if (argc < 1) die("Cannot determine argv[0]"); @@ -1604,9 +1617,8 @@ void conf(struct ctx *c, int argc, char **argv)

die("Invalid host nameserver address: %s", optarg); case 25: - if (c->mode == MODE_PASTA) - die("--vhost-user is for passt mode only");

This check should now be moved to conf_mode() instead of being dropped, otherwise you can do:

$ ./pasta -f --vhost-user

and at this point, the mode is MODE_VU, so it's all fine, but I don't think it's intended (...or is it?).

It's more or less intended. To me it seemed simpler to treat "vhost-user mode" as co-equal with "/dev/net/tun mode" (pasta) or "qemu -net stream mode" (passt), rather than having vu be sort of a sub-mode of passt. It's true that vu mode has slightly more in common with passt mode than pasta at the moment, but I don't see that as really inherent. I also saw this as a precursor to a "--mode whatever" option which would override the mode regardless of argv[0], in case there are circumstances where manipulating argv[0] is inconvenient. But if you'd really prefer I can reinstate the check.

...

...
- c->mode = MODE_VU; + /* Already handled in conf_mode() */ + ASSERT(c->mode == MODE_VU); break; case 26: vu_print_capabilities();

Pre-existing, but now we can fix this: case 26 (--print-capabilities) should only be accepted if (c->mode == MODE_VU).

I was unsure about this, because I wasn't certain if --vhost-user was passed when we were invoked just to probe capabilities.

...

It can also be done in another patch I would say, if you don't want to re-spin this.

-- David Gibson (he or they) | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you, not the other way | around. http://www.ozlabs.org/~dgibson

David Gibson

11 Mar 11 Mar

7:03 a.m.

New subject: [PATCH 4/9] packet: Give explicit name to maximum packet size

We verify that every packet we store in a pool (and every partial packet we retreive from it) has a length no longer than UINT16_MAX. This originated in the older packet pool implementation which stored packet lengths in a uint16_t. Now, that packets are represented by a struct iovec with its size_t length, this check serves only as a sanity / security check that we don't have some wildly out of range length due to a bug elsewhere. We have may reasons to (slightly) increase this limit in future, so in preparation, give this quantity an explicit name - PACKET_MAX_LEN. Signed-off-by: David Gibson --- packet.c | 4 ++-- packet.h | 3 +++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/packet.c b/packet.c index 0330b548..bcac0375 100644 --- a/packet.c +++ b/packet.c @@ -83,7 +83,7 @@ void packet_add_do(struct pool *p, size_t len, const char *start, if (packet_check_range(p, start, len, func, line)) return; - if (len > UINT16_MAX) { + if (len > PACKET_MAX_LEN) { trace("add packet length %zu, %s:%i", len, func, line); return; } @@ -119,7 +119,7 @@ void *packet_get_do(const struct pool *p, size_t idx, size_t offset, return NULL; } - if (len > UINT16_MAX) { + if (len > PACKET_MAX_LEN) { if (func) { trace("packet data length %zu, %s:%i", len, func, line); diff --git a/packet.h b/packet.h index bdc07fef..d099f026 100644 --- a/packet.h +++ b/packet.h @@ -6,6 +6,9 @@ #ifndef PACKET_H #define PACKET_H +/* Maximum size of a single packet stored in pool, including headers */ +#define PACKET_MAX_LEN UINT16_MAX + /** * struct pool - Generic pool of packets stored in a buffer * @buf: Buffer storing packet descriptors, -- 2.48.1

David Gibson

7:03 a.m.

New subject: [PATCH 5/9] packet: Remove redundant TAP_BUF_BYTES define

Currently we define both TAP_BUF_BYTES and PKT_BUF_BYTES as essentially the same thing. They'll be different only if TAP_BUF_BYTES is negative, which makes no sense. So, remove TAP_BUF_BYTES and just use PKT_BUF_BYTES. In addition, most places we use this to just mean the size of the main packet buffer (pkt_buf) for which we can just directly use sizeof. Signed-off-by: David Gibson --- passt.c | 2 +- passt.h | 5 ++--- tap.c | 4 ++-- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/passt.c b/passt.c index 0bd2a292..cd067728 100644 --- a/passt.c +++ b/passt.c @@ -223,7 +223,7 @@ int main(int argc, char **argv) if (signal(SIGPIPE, SIG_IGN) == SIG_ERR) die_perror("Couldn't set disposition for SIGPIPE"); - madvise(pkt_buf, TAP_BUF_BYTES, MADV_HUGEPAGE); + madvise(pkt_buf, sizeof(pkt_buf), MADV_HUGEPAGE); c.epollfd = epoll_create1(EPOLL_CLOEXEC); if (c.epollfd == -1) diff --git a/passt.h b/passt.h index 28d13892..6b248051 100644 --- a/passt.h +++ b/passt.h @@ -69,12 +69,11 @@ union epoll_ref { static_assert(sizeof(union epoll_ref) <= sizeof(union epoll_data), "epoll_ref must have same size as epoll_data"); -#define TAP_BUF_BYTES \ +#define PKT_BUF_BYTES \ ROUND_DOWN(((ETH_MAX_MTU + sizeof(uint32_t)) * 128), PAGE_SIZE) #define TAP_MSGS \ - DIV_ROUND_UP(TAP_BUF_BYTES, ETH_ZLEN - 2 * ETH_ALEN + sizeof(uint32_t)) + DIV_ROUND_UP(PKT_BUF_BYTES, ETH_ZLEN - 2 * ETH_ALEN + sizeof(uint32_t)) -#define PKT_BUF_BYTES MAX(TAP_BUF_BYTES, 0) extern char pkt_buf [PKT_BUF_BYTES]; extern char *epoll_type_str[]; diff --git a/tap.c b/tap.c index 4541f51d..fb306e75 100644 --- a/tap.c +++ b/tap.c @@ -1080,7 +1080,7 @@ static void tap_passt_input(struct ctx *c, const struct timespec *now) do { n = recv(c->fd_tap, pkt_buf + partial_len, - TAP_BUF_BYTES - partial_len, MSG_DONTWAIT); + sizeof(pkt_buf) - partial_len, MSG_DONTWAIT); } while ((n < 0) && errno == EINTR); if (n < 0) { @@ -1151,7 +1151,7 @@ static void tap_pasta_input(struct ctx *c, const struct timespec *now) tap_flush_pools(); - for (n = 0; n <= (ssize_t)(TAP_BUF_BYTES - ETH_MAX_MTU); n += len) { + for (n = 0; n <= (ssize_t)(sizeof(pkt_buf) - ETH_MAX_MTU); n += len) { len = read(c->fd_tap, pkt_buf + n, ETH_MAX_MTU); if (len == 0) { -- 2.48.1

David Gibson

7:03 a.m.

New subject: [PATCH 6/9] tap: Use explicit defines for maximum length of L2 frame

Currently in tap.c we (mostly) use ETH_MAX_MTU as the maximum length of an L2 frame. This define comes from the kernel, but it's badly named and used confusingly. First, it doesn't really have anything to do with Ethernet, which has no structural limit on frame lengths. It comes more from either a) IP which imposes a 64k datagram limit or b) from internal buffers used in various places in the kernel (and in passt). Worse, MTU generally means the maximum size of the IP (L3) datagram which may be transferred, _not_ counting the L2 headers. In the kernel ETH_MAX_MTU is sometimes used that way, but sometimes seems to be used as a maximum frame length, _including_ L2 headers. In tap.c we're mostly using it in the second way. Finally, each of our tap backends could have different limits on the frame size imposed by the mechanisms they're using. Start clearing up this confusion by replacing it in tap.c with new L2_MAX_LEN_* defines which specifically refer to the maximum L2 frame length for each backend. Signed-off-by: David Gibson --- tap.c | 18 ++++++++++++++---- tap.h | 25 +++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 4 deletions(-) diff --git a/tap.c b/tap.c index fb306e75..4840dcfa 100644 --- a/tap.c +++ b/tap.c @@ -62,6 +62,15 @@ #include "vhost_user.h" #include "vu_common.h" +/* Maximum allowed frame lengths (including L2 header) */ + +static_assert(L2_MAX_LEN_PASTA <= PACKET_MAX_LEN, + "packet pool can't store maximum size pasta frame"); +static_assert(L2_MAX_LEN_PASST <= PACKET_MAX_LEN, + "packet pool can't store maximum size qemu socket frame"); +static_assert(L2_MAX_LEN_VU <= PACKET_MAX_LEN, + "packet pool can't store maximum size vhost-user frame"); + /* IPv4 (plus ARP) and IPv6 message batches from tap/guest to IP handlers */ static PACKET_POOL_NOINIT(pool_tap4, TAP_MSGS, pkt_buf); static PACKET_POOL_NOINIT(pool_tap6, TAP_MSGS, pkt_buf); @@ -1097,7 +1106,8 @@ static void tap_passt_input(struct ctx *c, const struct timespec *now) while (n >= (ssize_t)sizeof(uint32_t)) { uint32_t l2len = ntohl_unaligned(p); - if (l2len < sizeof(struct ethhdr) || l2len > ETH_MAX_MTU) { + if (l2len < sizeof(struct ethhdr) || + l2len > L2_MAX_LEN_PASST) { err("Bad frame size from guest, resetting connection"); tap_sock_reset(c); return; @@ -1151,8 +1161,8 @@ static void tap_pasta_input(struct ctx *c, const struct timespec *now) tap_flush_pools(); - for (n = 0; n <= (ssize_t)(sizeof(pkt_buf) - ETH_MAX_MTU); n += len) { - len = read(c->fd_tap, pkt_buf + n, ETH_MAX_MTU); + for (n = 0; n <= (ssize_t)(sizeof(pkt_buf) - L2_MAX_LEN_PASTA); n += len) { + len = read(c->fd_tap, pkt_buf + n, L2_MAX_LEN_PASTA); if (len == 0) { die("EOF on tap device, exiting"); @@ -1170,7 +1180,7 @@ static void tap_pasta_input(struct ctx *c, const struct timespec *now) /* Ignore frames of bad length */ if (len < (ssize_t)sizeof(struct ethhdr) || - len > (ssize_t)ETH_MAX_MTU) + len > (ssize_t)L2_MAX_LEN_PASTA) continue; tap_add_packet(c, len, pkt_buf + n); diff --git a/tap.h b/tap.h index a2c3b87d..140e3305 100644 --- a/tap.h +++ b/tap.h @@ -6,6 +6,31 @@ #ifndef TAP_H #define TAP_H +/** L2_MAX_LEN_PASTA - Maximum frame length for pasta mode (with L2 header) + * + * The kernel tuntap device imposes a maximum frame size of 65535 including + * 'hard_header_len' (14 bytes for L2 Ethernet in the case of "tap" mode). + */ +#define L2_MAX_LEN_PASTA USHRT_MAX + +/** L2_MAX_LEN_PASST - Maximum frame length for passt mode (with L2 header) + * + * The only structural limit the Qemu socket protocol imposes on frames is + * (2^32-1) bytes, but that would be ludicrously long in practice. For now, + * limit it somewhat arbitrarily to 65535 bytes. FIXME: Work out an appropriate + * limit with more precision. + */ +#define L2_MAX_LEN_PASST USHRT_MAX + +/** L2_MAX_LEN_VU - Maximum frame length for vhost-user mode (with L2 header) + * + * VU allows multiple buffers per frame, each of which can be quite large, so + * the inherent frame size limit is rather large. Much larger than is actually + * useful for IP. For now limit arbitrarily to 65535 bytes. FIXME: Work out an + * appropriate limit with more precision. + */ +#define L2_MAX_LEN_VU USHRT_MAX + struct udphdr; /** -- 2.48.1

Stefano Brivio

11:45 p.m.

New subject: [PATCH 6/9] tap: Use explicit defines for maximum length of L2 frame

Nits only (I can fix it all up on merge): On Tue, 11 Mar 2025 17:03:15 +1100 David Gibson wrote:

...

Currently in tap.c we (mostly) use ETH_MAX_MTU as the maximum length of an L2 frame. This define comes from the kernel, but it's badly named and used confusingly.

First, it doesn't really have anything to do with Ethernet, which has no structural limit on frame lengths. It comes more from either a) IP which imposes a 64k datagram limit or b) from internal buffers used in various places in the kernel (and in passt).

Worse, MTU generally means the maximum size of the IP (L3) datagram which may be transferred, _not_ counting the L2 headers. In the kernel ETH_MAX_MTU is sometimes used that way, but sometimes seems to be used as a maximum frame length, _including_ L2 headers. In tap.c we're mostly using it in the second way.

Finally, each of our tap backends could have different limits on the frame size imposed by the mechanisms they're using.

Start clearing up this confusion by replacing it in tap.c with new L2_MAX_LEN_* defines which specifically refer to the maximum L2 frame length for each backend.

Signed-off-by: David Gibson --- tap.c | 18 ++++++++++++++---- tap.h | 25 +++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 4 deletions(-)

diff --git a/tap.c b/tap.c index fb306e75..4840dcfa 100644 --- a/tap.c +++ b/tap.c @@ -62,6 +62,15 @@ #include "vhost_user.h" #include "vu_common.h"

+/* Maximum allowed frame lengths (including L2 header) */ + +static_assert(L2_MAX_LEN_PASTA <= PACKET_MAX_LEN, + "packet pool can't store maximum size pasta frame"); +static_assert(L2_MAX_LEN_PASST <= PACKET_MAX_LEN, + "packet pool can't store maximum size qemu socket frame"); +static_assert(L2_MAX_LEN_VU <= PACKET_MAX_LEN, + "packet pool can't store maximum size vhost-user frame"); + /* IPv4 (plus ARP) and IPv6 message batches from tap/guest to IP handlers */ static PACKET_POOL_NOINIT(pool_tap4, TAP_MSGS, pkt_buf); static PACKET_POOL_NOINIT(pool_tap6, TAP_MSGS, pkt_buf); @@ -1097,7 +1106,8 @@ static void tap_passt_input(struct ctx *c, const struct timespec *now) while (n >= (ssize_t)sizeof(uint32_t)) { uint32_t l2len = ntohl_unaligned(p);

- if (l2len < sizeof(struct ethhdr) || l2len > ETH_MAX_MTU) { + if (l2len < sizeof(struct ethhdr) || + l2len > L2_MAX_LEN_PASST) {

No need to wrap this.

...

err("Bad frame size from guest, resetting connection"); tap_sock_reset(c); return; @@ -1151,8 +1161,8 @@ static void tap_pasta_input(struct ctx *c, const struct timespec *now)

tap_flush_pools();

- for (n = 0; n <= (ssize_t)(sizeof(pkt_buf) - ETH_MAX_MTU); n += len) { - len = read(c->fd_tap, pkt_buf + n, ETH_MAX_MTU); + for (n = 0; n <= (ssize_t)(sizeof(pkt_buf) - L2_MAX_LEN_PASTA); n += len) {

n += len should go on its own line now.

...

+ len = read(c->fd_tap, pkt_buf + n, L2_MAX_LEN_PASTA);

if (len == 0) { die("EOF on tap device, exiting"); @@ -1170,7 +1180,7 @@ static void tap_pasta_input(struct ctx *c, const struct timespec *now)

/* Ignore frames of bad length */ if (len < (ssize_t)sizeof(struct ethhdr) || - len > (ssize_t)ETH_MAX_MTU) + len > (ssize_t)L2_MAX_LEN_PASTA) continue;

tap_add_packet(c, len, pkt_buf + n); diff --git a/tap.h b/tap.h index a2c3b87d..140e3305 100644 --- a/tap.h +++ b/tap.h @@ -6,6 +6,31 @@ #ifndef TAP_H #define TAP_H

+/** L2_MAX_LEN_PASTA - Maximum frame length for pasta mode (with L2 header) + * + * The kernel tuntap device imposes a maximum frame size of 65535 including + * 'hard_header_len' (14 bytes for L2 Ethernet in the case of "tap" mode).

Extra whitespaces in indentation.

...

+ */ +#define L2_MAX_LEN_PASTA USHRT_MAX + +/** L2_MAX_LEN_PASST - Maximum frame length for passt mode (with L2 header) + * + * The only structural limit the Qemu socket protocol imposes on frames is

QEMU

...

+ * (2^32-1) bytes, but that would be ludicrously long in practice. For now, + * limit it somewhat arbitrarily to 65535 bytes. FIXME: Work out an appropriate + * limit with more precision. + */ +#define L2_MAX_LEN_PASST USHRT_MAX + +/** L2_MAX_LEN_VU - Maximum frame length for vhost-user mode (with L2 header) + * + * VU allows multiple buffers per frame, each of which can be quite large, so

vhost-user

...

+ * the inherent frame size limit is rather large. Much larger than is actually + * useful for IP. For now limit arbitrarily to 65535 bytes. FIXME: Work out an + * appropriate limit with more precision. + */ +#define L2_MAX_LEN_VU USHRT_MAX + struct udphdr;

/**

-- Stefano

David Gibson

12 Mar 12 Mar

1:56 a.m.

New subject: [PATCH 6/9] tap: Use explicit defines for maximum length of L2 frame

On Tue, Mar 11, 2025 at 11:45:09PM +0100, Stefano Brivio wrote:

...

Nits only (I can fix it all up on merge):

Actually, I spotted one other small change I'd like to make here, so I might as well respin.

...

On Tue, 11 Mar 2025 17:03:15 +1100 David Gibson wrote: [snip]

...
+ */ +#define L2_MAX_LEN_PASTA USHRT_MAX + +/** L2_MAX_LEN_PASST - Maximum frame length for passt mode (with L2 header) + * + * The only structural limit the Qemu socket protocol imposes on frames is

QEMU

For some reason, the standard style for capitalizing QEMU never sticks in my head. -- David Gibson (he or they) | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you, not the other way | around. http://www.ozlabs.org/~dgibson

David Gibson

11 Mar 11 Mar

7:03 a.m.

New subject: [PATCH 7/9] Simplify sizing of pkt_buf

We define the size of pkt_buf as large enough to hold 128 maximum size packets. Well, approximately, since we round down to the page size. We don't have any specific reliance on how many packets can fit in the buffer, we just want it to be big enough to allow reasonable batching. The current definition relies on the confusingly named ETH_MAX_MTU and adds in sizeof(uint32_t) rather non-obviously for the pseudo-physical header used by the qemu socket (passt mode) protocol. Instead, just define it to be 8MiB, which is what that complex calculation works out to. Signed-off-by: David Gibson --- passt.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/passt.h b/passt.h index 6b248051..8f450912 100644 --- a/passt.h +++ b/passt.h @@ -69,8 +69,8 @@ union epoll_ref { static_assert(sizeof(union epoll_ref) <= sizeof(union epoll_data), "epoll_ref must have same size as epoll_data"); -#define PKT_BUF_BYTES \ - ROUND_DOWN(((ETH_MAX_MTU + sizeof(uint32_t)) * 128), PAGE_SIZE) +/* Large enough for ~128 maximum size frames */ +#define PKT_BUF_BYTES (8UL << 20) #define TAP_MSGS \ DIV_ROUND_UP(PKT_BUF_BYTES, ETH_ZLEN - 2 * ETH_ALEN + sizeof(uint32_t)) -- 2.48.1

David Gibson

7:03 a.m.

New subject: [PATCH 8/9] pcap: Correctly set snaplen based on tap backend type

The pcap header includes a value indicating how much of each frame is captured. We always capture the entire frame, so we want to set this to the maximum possible frame size. Currently we do that by setting it to ETH_MAX_MTU, but that's a confusingly named constant which might not always be correct depending on the details of our tap backend. Instead add a tap_l2_max_len() function that explicitly returns the maximum frame size for the current mode and use that to set snaplen. While we're there, there's no particular need for the pcap header to be defined in a global; make it local to pcap_init() instead. Signed-off-by: David Gibson --- pcap.c | 46 ++++++++++++++++++++++++---------------------- tap.c | 19 +++++++++++++++++++ tap.h | 1 + 3 files changed, 44 insertions(+), 22 deletions(-) diff --git a/pcap.c b/pcap.c index 3d623cfe..e95aa6fe 100644 --- a/pcap.c +++ b/pcap.c @@ -33,33 +33,12 @@ #include "log.h" #include "pcap.h" #include "iov.h" +#include "tap.h" #define PCAP_VERSION_MINOR 4 static int pcap_fd = -1; -/* See pcap.h from libpcap, or pcap-savefile(5) */ -static const struct { - uint32_t magic; -#define PCAP_MAGIC 0xa1b2c3d4 - - uint16_t major; -#define PCAP_VERSION_MAJOR 2 - - uint16_t minor; -#define PCAP_VERSION_MINOR 4 - - int32_t thiszone; - uint32_t sigfigs; - uint32_t snaplen; - - uint32_t linktype; -#define PCAP_LINKTYPE_ETHERNET 1 -} pcap_hdr = { - PCAP_MAGIC, PCAP_VERSION_MAJOR, PCAP_VERSION_MINOR, 0, 0, ETH_MAX_MTU, - PCAP_LINKTYPE_ETHERNET -}; - struct pcap_pkthdr { uint32_t tv_sec; uint32_t tv_usec; @@ -162,6 +141,29 @@ void pcap_iov(const struct iovec *iov, size_t iovcnt, size_t offset) */ void pcap_init(struct ctx *c) { + /* See pcap.h from libpcap, or pcap-savefile(5) */ +#define PCAP_MAGIC 0xa1b2c3d4 +#define PCAP_VERSION_MAJOR 2 +#define PCAP_VERSION_MINOR 4 +#define PCAP_LINKTYPE_ETHERNET 1 + const struct { + uint32_t magic; + uint16_t major; + uint16_t minor; + + int32_t thiszone; + uint32_t sigfigs; + uint32_t snaplen; + + uint32_t linktype; + } pcap_hdr = { + .magic = PCAP_MAGIC, + .major = PCAP_VERSION_MAJOR, + .minor = PCAP_VERSION_MINOR, + .snaplen = tap_l2_max_len(c), + .linktype = PCAP_LINKTYPE_ETHERNET + }; + if (pcap_fd != -1) return; diff --git a/tap.c b/tap.c index 4840dcfa..7cace71a 100644 --- a/tap.c +++ b/tap.c @@ -78,6 +78,25 @@ static PACKET_POOL_NOINIT(pool_tap6, TAP_MSGS, pkt_buf); #define TAP_SEQS 128 /* Different L4 tuples in one batch */ #define FRAGMENT_MSG_RATE 10 /* # seconds between fragment warnings */ +/** + * tap_l2_max_len() - Maximum frame size (including L2 header) for current mode + * @c: Execution context + */ +unsigned long tap_l2_max_len(const struct ctx *c) +{ + /* NOLINTBEGIN(bugprone-branch-clone): values can be the same */ + switch (c->mode) { + case MODE_PASST: + return L2_MAX_LEN_PASST; + case MODE_PASTA: + return L2_MAX_LEN_PASTA; + case MODE_VU: + return L2_MAX_LEN_VU; + } + /* NOLINTEND(bugprone-branch-clone) */ + ASSERT(0); +} + /** * tap_send_single() - Send a single frame * @c: Execution context diff --git a/tap.h b/tap.h index 140e3305..2d2221e2 100644 --- a/tap.h +++ b/tap.h @@ -69,6 +69,7 @@ static inline void tap_hdr_update(struct tap_hdr *thdr, size_t l2len) thdr->vnet_len = htonl(l2len); } +unsigned long tap_l2_max_len(const struct ctx *c); void *tap_push_l2h(const struct ctx *c, void *buf, uint16_t proto); void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src, struct in_addr dst, size_t l4len, uint8_t proto); -- 2.48.1

David Gibson

7:03 a.m.

New subject: [PATCH 9/9] conf: Limit maximum MTU based on backend frame size

The -m option controls the MTU, that is the maximum transmissible L3 datagram, not including L2 headers. We currently limit it to ETH_MAX_MTU which sounds like it makes sense. But ETH_MAX_MTU is confusing: it's not consistently used as to whether it means the maximum L3 datagram size or the maximum L2 frame size. Even within conf() we explicitly account for the L2 header size when computing the default --mtu value, but not when we compute the maximum --mtu value. Clean this up by reworking the maximum MTU computation to be the minimum of IP_MAX_MTU (65535) and the maximum sized IP datagram which can fit into our L2 frames when we account for the L2 header. The latter can vary depending on our tap backend, although it doesn't right now. Link: https://bugs.passt.top/show_bug.cgi?id=66 Signed-off-by: David Gibson --- conf.c | 11 +++++++---- util.h | 3 --- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/conf.c b/conf.c index b58e2a6e..c760f791 100644 --- a/conf.c +++ b/conf.c @@ -1434,6 +1434,7 @@ void conf(struct ctx *c, int argc, char **argv) enum fwd_ports_mode fwd_default = FWD_NONE; bool v4_only = false, v6_only = false; unsigned dns4_idx = 0, dns6_idx = 0; + unsigned long max_mtu = IP_MAX_MTU; struct fqdn *dnss = c->dns_search; unsigned int ifi4 = 0, ifi6 = 0; const char *logfile = NULL; @@ -1449,7 +1450,9 @@ void conf(struct ctx *c, int argc, char **argv) fwd_default = FWD_AUTO; } - c->mtu = ROUND_DOWN(ETH_MAX_MTU - ETH_HLEN, sizeof(uint32_t)); + if (tap_l2_max_len(c) - ETH_HLEN < max_mtu) + max_mtu = tap_l2_max_len(c) - ETH_HLEN; + c->mtu = ROUND_DOWN(max_mtu, sizeof(uint32_t)); c->tcp.fwd_in.mode = c->tcp.fwd_out.mode = FWD_UNSET; c->udp.fwd_in.mode = c->udp.fwd_out.mode = FWD_UNSET; memcpy(c->our_tap_mac, MAC_OUR_LAA, ETH_ALEN); @@ -1711,9 +1714,9 @@ void conf(struct ctx *c, int argc, char **argv) if (errno || *e) die("Invalid MTU: %s", optarg); - if (mtu > ETH_MAX_MTU) { - die("MTU %lu too large (max %u)", - mtu, ETH_MAX_MTU); + if (mtu > max_mtu) { + die("MTU %lu too large (max %lu)", + mtu, max_mtu); } c->mtu = mtu; diff --git a/util.h b/util.h index 0f70f4d4..4d512fab 100644 --- a/util.h +++ b/util.h @@ -31,9 +31,6 @@ #ifndef SECCOMP_RET_KILL_PROCESS #define SECCOMP_RET_KILL_PROCESS SECCOMP_RET_KILL #endif -#ifndef ETH_MAX_MTU -#define ETH_MAX_MTU USHRT_MAX -#endif #ifndef IP_MAX_MTU #define IP_MAX_MTU USHRT_MAX #endif -- 2.48.1

147

Age (days ago)

148

Last active (days ago)

List overview

Download

13 comments

2 participants

participants (2)

David Gibson
Stefano Brivio

[PATCH 0/9] Improve handling of MTU limits

tags

participants (2)