This series has a handful of small improvements to the tap send path. See individual commit messages for the details. I expect this will conflict with Laurent's upcoming work. I hope the conflicts won't be too bad, and indeed will set us up for less duplication there in the end. This is based on Laurent's patch fixing pcap_multiple() not to capture frames we failed to send. David Gibson (4): tap: Extend tap_send_frames() to allow multi-buffer frames tap: Simplify some casts in the tap "slow path" functions tap: Implement tap_send() "slow path" in terms of fast path tap: Rename tap_iov_{base,len} arp.c | 4 +- tap.c | 158 +++++++++++++++++++++++++++++++--------------------------- tap.h | 19 +++---- tcp.c | 20 ++++---- udp.c | 10 ++-- 5 files changed, 111 insertions(+), 100 deletions(-) -- 2.44.0
tap_send_frames() takes a vector of buffers and requires exactly one frame per buffer. We have future plans where we want to have multiple buffers per frame in some circumstances, so extend tap_send_frames() to take the number of buffers per frame as a parameter. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- tap.c | 83 +++++++++++++++++++++++++++++++++++++---------------------- tap.h | 3 ++- tcp.c | 8 +++--- udp.c | 2 +- 4 files changed, 59 insertions(+), 37 deletions(-) diff --git a/tap.c b/tap.c index f4051cec..f9e2a8d9 100644 --- a/tap.c +++ b/tap.c @@ -309,21 +309,28 @@ void tap_icmp6_send(const struct ctx *c, /** * tap_send_frames_pasta() - Send multiple frames to the pasta tap - * @c: Execution context - * @iov: Array of buffers, each containing one frame - * @n: Number of buffers/frames in @iov + * @c: Execution context + * @iov: Array of buffers + * @bufs_per_frame: Number of buffers (iovec entries) per frame + * @nframes: Number of frames to send * + * @iov must have total length @bufs_per_frame * @nframes, with each set of + * @bufs_per_frame contiguous buffers representing a single frame. + * * Return: number of frames successfully sent * * #syscalls:pasta write */ static size_t tap_send_frames_pasta(const struct ctx *c, - const struct iovec *iov, size_t n) + const struct iovec *iov, + size_t bufs_per_frame, size_t nframes) { + size_t nbufs = bufs_per_frame * nframes; size_t i; - for (i = 0; i < n; i++) { - ssize_t rc = write(c->fd_tap, iov[i].iov_base, iov[i].iov_len); + for (i = 0; i < nbufs; i += bufs_per_frame) { + ssize_t rc = writev(c->fd_tap, iov + i, bufs_per_frame); + size_t framelen = iov_size(iov + i, bufs_per_frame); if (rc < 0) { debug("tap write: %s", strerror(errno)); @@ -340,32 +347,37 @@ static size_t tap_send_frames_pasta(const struct ctx *c, default: die("Write error on tap device, exiting"); } - } else if ((size_t)rc < iov[i].iov_len) { - debug("short write on tuntap: %zd/%zu", - rc, iov[i].iov_len); + } else if ((size_t)rc < framelen) { + debug("short write on tuntap: %zd/%zu", rc, framelen); break; } } - return i; + return i / bufs_per_frame; } /** * tap_send_frames_passt() - Send multiple frames to the passt tap - * @c: Execution context - * @iov: Array of buffers, each containing one frame - * @n: Number of buffers/frames in @iov + * @c: Execution context + * @iov: Array of buffers, each containing one frame + * @bufs_per_frame: Number of buffers (iovec entries) per frame + * @nframes: Number of frames to send * + * @iov must have total length @bufs_per_frame * @nframes, with each set of + * @bufs_per_frame contiguous buffers representing a single frame. + * * Return: number of frames successfully sent * * #syscalls:passt sendmsg */ static size_t tap_send_frames_passt(const struct ctx *c, - const struct iovec *iov, size_t n) + const struct iovec *iov, + size_t bufs_per_frame, size_t nframes) { + size_t nbufs = bufs_per_frame * nframes; struct msghdr mh = { .msg_iov = (void *)iov, - .msg_iovlen = n, + .msg_iovlen = nbufs, }; size_t buf_offset; unsigned int i; @@ -376,44 +388,53 @@ static size_t tap_send_frames_passt(const struct ctx *c, return 0; /* Check for any partial frames due to short send */ - i = iov_skip_bytes(iov, n, sent, &buf_offset); + i = iov_skip_bytes(iov, nbufs, sent, &buf_offset); + + if (i < nbufs && (buf_offset || (i % bufs_per_frame))) { + /* Number of not-fully-sent buffers in the frame */ + size_t rembufs = bufs_per_frame - (i % bufs_per_frame); - if (i < n && buf_offset) { - /* A partial frame was sent */ - if (write_remainder(c->fd_tap, &iov[i], 1, buf_offset) < 0) { + if (write_remainder(c->fd_tap, &iov[i], rembufs, buf_offset) < 0) { err("tap: partial frame send: %s", strerror(errno)); return i; } - i++; + i += rembufs; } - return i; + return i / bufs_per_frame; } /** * tap_send_frames() - Send out multiple prepared frames - * @c: Execution context - * @iov: Array of buffers, each containing one frame (with L2 headers) - * @n: Number of buffers/frames in @iov + * @c: Execution context + * @iov: Array of buffers, each containing one frame (with L2 headers) + * @bufs_per_frame: Number of buffers (iovec entries) per frame + * @nframes: Number of frames to send + * + * @iov must have total length @bufs_per_frame * @nframes, with each set of + * @bufs_per_frame contiguous buffers representing a single frame. * * Return: number of frames actually sent */ -size_t tap_send_frames(const struct ctx *c, const struct iovec *iov, size_t n) +size_t tap_send_frames(const struct ctx *c, const struct iovec *iov, + size_t bufs_per_frame, size_t nframes) { size_t m; - if (!n) + if (!nframes) return 0; if (c->mode == MODE_PASST) - m = tap_send_frames_passt(c, iov, n); + m = tap_send_frames_passt(c, iov, bufs_per_frame, nframes); else - m = tap_send_frames_pasta(c, iov, n); + m = tap_send_frames_pasta(c, iov, bufs_per_frame, nframes); - if (m < n) - debug("tap: failed to send %zu frames of %zu", n - m, n); + if (m < nframes) + debug("tap: failed to send %zu frames of %zu", + nframes - m, nframes); - pcap_multiple(iov, 1, m, c->mode == MODE_PASST ? sizeof(uint32_t) : 0); + pcap_multiple(iov, bufs_per_frame, m, + c->mode == MODE_PASST ? sizeof(uint32_t) : 0); return m; } diff --git a/tap.h b/tap.h index 437b9aa2..c45aab3e 100644 --- a/tap.h +++ b/tap.h @@ -73,7 +73,8 @@ void tap_icmp6_send(const struct ctx *c, const struct in6_addr *src, const struct in6_addr *dst, const void *in, size_t len); int tap_send(const struct ctx *c, const void *data, size_t len); -size_t tap_send_frames(const struct ctx *c, const struct iovec *iov, size_t n); +size_t tap_send_frames(const struct ctx *c, const struct iovec *iov, + size_t bufs_per_frame, size_t nframes); void eth_update_mac(struct ethhdr *eh, const unsigned char *eth_d, const unsigned char *eth_s); void tap_listen_handler(struct ctx *c, uint32_t events); diff --git a/tcp.c b/tcp.c index d5eedf4d..9d90108b 100644 --- a/tcp.c +++ b/tcp.c @@ -1289,10 +1289,10 @@ static void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn); */ static void tcp_l2_flags_buf_flush(const struct ctx *c) { - tap_send_frames(c, tcp6_l2_flags_iov, tcp6_l2_flags_buf_used); + tap_send_frames(c, tcp6_l2_flags_iov, 1, tcp6_l2_flags_buf_used); tcp6_l2_flags_buf_used = 0; - tap_send_frames(c, tcp4_l2_flags_iov, tcp4_l2_flags_buf_used); + tap_send_frames(c, tcp4_l2_flags_iov, 1, tcp4_l2_flags_buf_used); tcp4_l2_flags_buf_used = 0; } @@ -1305,12 +1305,12 @@ static void tcp_l2_data_buf_flush(const struct ctx *c) unsigned i; size_t m; - m = tap_send_frames(c, tcp6_l2_iov, tcp6_l2_buf_used); + m = tap_send_frames(c, tcp6_l2_iov, 1, tcp6_l2_buf_used); for (i = 0; i < m; i++) *tcp6_l2_buf_seq_update[i].seq += tcp6_l2_buf_seq_update[i].len; tcp6_l2_buf_used = 0; - m = tap_send_frames(c, tcp4_l2_iov, tcp4_l2_buf_used); + m = tap_send_frames(c, tcp4_l2_iov, 1, tcp4_l2_buf_used); for (i = 0; i < m; i++) *tcp4_l2_buf_seq_update[i].seq += tcp4_l2_buf_seq_update[i].len; tcp4_l2_buf_used = 0; diff --git a/udp.c b/udp.c index 45b7cc96..cba595c9 100644 --- a/udp.c +++ b/udp.c @@ -712,7 +712,7 @@ static void udp_tap_send(const struct ctx *c, tap_iov[i].iov_len = buf_len; } - tap_send_frames(c, tap_iov + start, n); + tap_send_frames(c, tap_iov + start, 1, n); } /** -- 2.44.0
On Fri, 8 Mar 2024 17:53:22 +1100 David Gibson <david(a)gibson.dropbear.id.au> wrote:tap_send_frames() takes a vector of buffers and requires exactly one frame per buffer. We have future plans where we want to have multiple buffers per frame in some circumstances, so extend tap_send_frames() to take the number of buffers per frame as a parameter. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- tap.c | 83 +++++++++++++++++++++++++++++++++++++---------------------- tap.h | 3 ++- tcp.c | 8 +++--- udp.c | 2 +- 4 files changed, 59 insertions(+), 37 deletions(-) diff --git a/tap.c b/tap.c index f4051cec..f9e2a8d9 100644 --- a/tap.c +++ b/tap.c @@ -309,21 +309,28 @@ void tap_icmp6_send(const struct ctx *c, /** * tap_send_frames_pasta() - Send multiple frames to the pasta tap - * @c: Execution context - * @iov: Array of buffers, each containing one frame - * @n: Number of buffers/frames in @iov + * @c: Execution context + * @iov: Array of buffers + * @bufs_per_frame: Number of buffers (iovec entries) per frame + * @nframes: Number of frames to send * + * @iov must have total length @bufs_per_frame * @nframes, with each set of + * @bufs_per_frame contiguous buffers representing a single frame.Oh, this does pretty much what I was suggesting as a comment to Laurent's "tcp: Replace TCP buffer structure by an iovec array" -- I should have reviewed this first.+ * * Return: number of frames successfully sent * * #syscalls:pasta write */ static size_t tap_send_frames_pasta(const struct ctx *c, - const struct iovec *iov, size_t n) + const struct iovec *iov, + size_t bufs_per_frame, size_t nframes) { + size_t nbufs = bufs_per_frame * nframes; size_t i; - for (i = 0; i < n; i++) { - ssize_t rc = write(c->fd_tap, iov[i].iov_base, iov[i].iov_len); + for (i = 0; i < nbufs; i += bufs_per_frame) { + ssize_t rc = writev(c->fd_tap, iov + i, bufs_per_frame); + size_t framelen = iov_size(iov + i, bufs_per_frame); if (rc < 0) { debug("tap write: %s", strerror(errno)); @@ -340,32 +347,37 @@ static size_t tap_send_frames_pasta(const struct ctx *c, default: die("Write error on tap device, exiting"); } - } else if ((size_t)rc < iov[i].iov_len) { - debug("short write on tuntap: %zd/%zu", - rc, iov[i].iov_len); + } else if ((size_t)rc < framelen) { + debug("short write on tuntap: %zd/%zu", rc, framelen); break; } } - return i; + return i / bufs_per_frame; } /** * tap_send_frames_passt() - Send multiple frames to the passt tap - * @c: Execution context - * @iov: Array of buffers, each containing one frame - * @n: Number of buffers/frames in @iov + * @c: Execution context + * @iov: Array of buffers, each containing one frame + * @bufs_per_frame: Number of buffers (iovec entries) per frame + * @nframes: Number of frames to send * + * @iov must have total length @bufs_per_frame * @nframes, with each set of + * @bufs_per_frame contiguous buffers representing a single frame. + * * Return: number of frames successfully sent * * #syscalls:passt sendmsg */ static size_t tap_send_frames_passt(const struct ctx *c, - const struct iovec *iov, size_t n) + const struct iovec *iov, + size_t bufs_per_frame, size_t nframes) { + size_t nbufs = bufs_per_frame * nframes; struct msghdr mh = { .msg_iov = (void *)iov, - .msg_iovlen = n, + .msg_iovlen = nbufs, }; size_t buf_offset; unsigned int i; @@ -376,44 +388,53 @@ static size_t tap_send_frames_passt(const struct ctx *c, return 0; /* Check for any partial frames due to short send */ - i = iov_skip_bytes(iov, n, sent, &buf_offset); + i = iov_skip_bytes(iov, nbufs, sent, &buf_offset); + + if (i < nbufs && (buf_offset || (i % bufs_per_frame))) { + /* Number of not-fully-sent buffers in the frame */Strictly speaking, this comment is correct, but "not-fully-sent" seems to imply that rembufs only counts partially sent buffers. It also counts the ones that weren't sent at all. What about: /* Number of partially sent or not sent buffers for the frame */ ? The rest of the series looks good to me, I can change this text on merge if you like the proposal, or even apply it as it is (well, it's correct, after all). -- Stefano
On Thu, Mar 14, 2024 at 08:02:17AM +0100, Stefano Brivio wrote:On Fri, 8 Mar 2024 17:53:22 +1100 David Gibson <david(a)gibson.dropbear.id.au> wrote:Right.tap_send_frames() takes a vector of buffers and requires exactly one frame per buffer. We have future plans where we want to have multiple buffers per frame in some circumstances, so extend tap_send_frames() to take the number of buffers per frame as a parameter. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- tap.c | 83 +++++++++++++++++++++++++++++++++++++---------------------- tap.h | 3 ++- tcp.c | 8 +++--- udp.c | 2 +- 4 files changed, 59 insertions(+), 37 deletions(-) diff --git a/tap.c b/tap.c index f4051cec..f9e2a8d9 100644 --- a/tap.c +++ b/tap.c @@ -309,21 +309,28 @@ void tap_icmp6_send(const struct ctx *c, /** * tap_send_frames_pasta() - Send multiple frames to the pasta tap - * @c: Execution context - * @iov: Array of buffers, each containing one frame - * @n: Number of buffers/frames in @iov + * @c: Execution context + * @iov: Array of buffers + * @bufs_per_frame: Number of buffers (iovec entries) per frame + * @nframes: Number of frames to send * + * @iov must have total length @bufs_per_frame * @nframes, with each set of + * @bufs_per_frame contiguous buffers representing a single frame.Oh, this does pretty much what I was suggesting as a comment to Laurent's "tcp: Replace TCP buffer structure by an iovec array" -- I should have reviewed this first.Yeah, the original is technically correct, but easy to misread. Maybe Number of unsent or partially sent buffers for the frame for slightly more brevity than your suggestion?+ * * Return: number of frames successfully sent * * #syscalls:pasta write */ static size_t tap_send_frames_pasta(const struct ctx *c, - const struct iovec *iov, size_t n) + const struct iovec *iov, + size_t bufs_per_frame, size_t nframes) { + size_t nbufs = bufs_per_frame * nframes; size_t i; - for (i = 0; i < n; i++) { - ssize_t rc = write(c->fd_tap, iov[i].iov_base, iov[i].iov_len); + for (i = 0; i < nbufs; i += bufs_per_frame) { + ssize_t rc = writev(c->fd_tap, iov + i, bufs_per_frame); + size_t framelen = iov_size(iov + i, bufs_per_frame); if (rc < 0) { debug("tap write: %s", strerror(errno)); @@ -340,32 +347,37 @@ static size_t tap_send_frames_pasta(const struct ctx *c, default: die("Write error on tap device, exiting"); } - } else if ((size_t)rc < iov[i].iov_len) { - debug("short write on tuntap: %zd/%zu", - rc, iov[i].iov_len); + } else if ((size_t)rc < framelen) { + debug("short write on tuntap: %zd/%zu", rc, framelen); break; } } - return i; + return i / bufs_per_frame; } /** * tap_send_frames_passt() - Send multiple frames to the passt tap - * @c: Execution context - * @iov: Array of buffers, each containing one frame - * @n: Number of buffers/frames in @iov + * @c: Execution context + * @iov: Array of buffers, each containing one frame + * @bufs_per_frame: Number of buffers (iovec entries) per frame + * @nframes: Number of frames to send * + * @iov must have total length @bufs_per_frame * @nframes, with each set of + * @bufs_per_frame contiguous buffers representing a single frame. + * * Return: number of frames successfully sent * * #syscalls:passt sendmsg */ static size_t tap_send_frames_passt(const struct ctx *c, - const struct iovec *iov, size_t n) + const struct iovec *iov, + size_t bufs_per_frame, size_t nframes) { + size_t nbufs = bufs_per_frame * nframes; struct msghdr mh = { .msg_iov = (void *)iov, - .msg_iovlen = n, + .msg_iovlen = nbufs, }; size_t buf_offset; unsigned int i; @@ -376,44 +388,53 @@ static size_t tap_send_frames_passt(const struct ctx *c, return 0; /* Check for any partial frames due to short send */ - i = iov_skip_bytes(iov, n, sent, &buf_offset); + i = iov_skip_bytes(iov, nbufs, sent, &buf_offset); + + if (i < nbufs && (buf_offset || (i % bufs_per_frame))) { + /* Number of not-fully-sent buffers in the frame */Strictly speaking, this comment is correct, but "not-fully-sent" seems to imply that rembufs only counts partially sent buffers. It also counts the ones that weren't sent at all. What about: /* Number of partially sent or not sent buffers for the frame */ ?The rest of the series looks good to me, I can change this text on merge if you like the proposal, or even apply it as it is (well, it's correct, after all).Yes, please adjust and go ahead. -- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson
We can both remove some variables which differ from others only in type, and slightly improve type safety. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- tap.c | 41 ++++++++++++++++++----------------------- 1 file changed, 18 insertions(+), 23 deletions(-) diff --git a/tap.c b/tap.c index f9e2a8d9..38965842 100644 --- a/tap.c +++ b/tap.c @@ -146,11 +146,9 @@ static void *tap_push_l2h(const struct ctx *c, void *buf, uint16_t proto) * * Return: pointer at which to write the packet's payload */ -static void *tap_push_ip4h(char *buf, struct in_addr src, struct in_addr dst, - size_t len, uint8_t proto) +static void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src, + struct in_addr dst, size_t len, uint8_t proto) { - struct iphdr *ip4h = (struct iphdr *)buf; - ip4h->version = 4; ip4h->ihl = sizeof(struct iphdr) / 4; ip4h->tos = 0; @@ -181,9 +179,8 @@ void tap_udp4_send(const struct ctx *c, struct in_addr src, in_port_t sport, { size_t udplen = len + sizeof(struct udphdr); char buf[USHRT_MAX]; - void *ip4h = tap_push_l2h(c, buf, ETH_P_IP); - void *uhp = tap_push_ip4h(ip4h, src, dst, udplen, IPPROTO_UDP); - struct udphdr *uh = (struct udphdr *)uhp; + struct iphdr *ip4h = tap_push_l2h(c, buf, ETH_P_IP); + struct udphdr *uh = tap_push_ip4h(ip4h, src, dst, udplen, IPPROTO_UDP); char *data = (char *)(uh + 1); uh->source = htons(sport); @@ -208,14 +205,14 @@ void tap_icmp4_send(const struct ctx *c, struct in_addr src, struct in_addr dst, const void *in, size_t len) { char buf[USHRT_MAX]; - void *ip4h = tap_push_l2h(c, buf, ETH_P_IP); - char *data = tap_push_ip4h(ip4h, src, dst, len, IPPROTO_ICMP); - struct icmphdr *icmp4h = (struct icmphdr *)data; + struct iphdr *ip4h = tap_push_l2h(c, buf, ETH_P_IP); + struct icmphdr *icmp4h = tap_push_ip4h(ip4h, src, dst, + len, IPPROTO_ICMP); - memcpy(data, in, len); + memcpy(icmp4h, in, len); csum_icmp4(icmp4h, icmp4h + 1, len - sizeof(*icmp4h)); - if (tap_send(c, buf, len + (data - buf)) < 0) + if (tap_send(c, buf, len + ((char *)icmp4h - buf)) < 0) debug("tap: failed to send %zu bytes (IPv4)", len); } @@ -230,13 +227,11 @@ void tap_icmp4_send(const struct ctx *c, struct in_addr src, struct in_addr dst, * * Return: pointer at which to write the packet's payload */ -static void *tap_push_ip6h(char *buf, +static void *tap_push_ip6h(struct ipv6hdr *ip6h, const struct in6_addr *src, const struct in6_addr *dst, size_t len, uint8_t proto, uint32_t flow) { - struct ipv6hdr *ip6h = (struct ipv6hdr *)buf; - ip6h->payload_len = htons(len); ip6h->priority = 0; ip6h->version = 6; @@ -268,9 +263,9 @@ void tap_udp6_send(const struct ctx *c, { size_t udplen = len + sizeof(struct udphdr); char buf[USHRT_MAX]; - void *ip6h = tap_push_l2h(c, buf, ETH_P_IPV6); - void *uhp = tap_push_ip6h(ip6h, src, dst, udplen, IPPROTO_UDP, flow); - struct udphdr *uh = (struct udphdr *)uhp; + struct ipv6hdr *ip6h = tap_push_l2h(c, buf, ETH_P_IPV6); + struct udphdr *uh = tap_push_ip6h(ip6h, src, dst, + udplen, IPPROTO_UDP, flow); char *data = (char *)(uh + 1); uh->source = htons(sport); @@ -296,14 +291,14 @@ void tap_icmp6_send(const struct ctx *c, const void *in, size_t len) { char buf[USHRT_MAX]; - void *ip6h = tap_push_l2h(c, buf, ETH_P_IPV6); - char *data = tap_push_ip6h(ip6h, src, dst, len, IPPROTO_ICMPV6, 0); - struct icmp6hdr *icmp6h = (struct icmp6hdr *)data; + struct ipv6hdr *ip6h = tap_push_l2h(c, buf, ETH_P_IPV6); + struct icmp6hdr *icmp6h = tap_push_ip6h(ip6h, src, dst, len, + IPPROTO_ICMPV6, 0); - memcpy(data, in, len); + memcpy(icmp6h, in, len); csum_icmp6(icmp6h, src, dst, icmp6h + 1, len - sizeof(*icmp6h)); - if (tap_send(c, buf, len + (data - buf)) < 1) + if (tap_send(c, buf, len + ((char *)icmp6h - buf)) < 1) debug("tap: failed to send %zu bytes (IPv6)", len); } -- 2.44.0
Most times we send frames to the guest it goes via tap_send_frames(). However "slow path" protocols - ARP, ICMP, ICMPv6, DHCP and DHCPv6 - go via tap_send(). As well as being a semantic duplication, tap_send() contains at least one serious problem: it doesn't properly handle short sends, which can be fatal on the qemu socket connection, since frame boundaries will get out of sync. Rewrite tap_send() to call tap_send_frames(). While we're there, rename it tap_send_single() for clarity. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- arp.c | 4 +--- tap.c | 38 +++++++++++++++++--------------------- tap.h | 2 +- 3 files changed, 19 insertions(+), 25 deletions(-) diff --git a/arp.c b/arp.c index a35c1b61..113cda2f 100644 --- a/arp.c +++ b/arp.c @@ -44,7 +44,6 @@ int arp(const struct ctx *c, const struct pool *p) struct arphdr *ah; struct arpmsg *am; size_t len; - int ret; eh = packet_get(p, 0, 0, sizeof(*eh), NULL); ah = packet_get(p, 0, sizeof(*eh), sizeof(*ah), NULL); @@ -83,8 +82,7 @@ int arp(const struct ctx *c, const struct pool *p) memcpy(eh->h_dest, eh->h_source, sizeof(eh->h_dest)); memcpy(eh->h_source, c->mac, sizeof(eh->h_source)); - if ((ret = tap_send(c, eh, len)) < 0) - warn("ARP: send: %s", strerror(ret)); + tap_send_single(c, eh, len); return 1; } diff --git a/tap.c b/tap.c index 38965842..5e3c6b13 100644 --- a/tap.c +++ b/tap.c @@ -67,28 +67,28 @@ static PACKET_POOL_NOINIT(pool_tap6, TAP_MSGS, pkt_buf); #define FRAGMENT_MSG_RATE 10 /* # seconds between fragment warnings */ /** - * tap_send() - Send frame, with qemu socket header if needed + * tap_send_single() - Send a single frame * @c: Execution context * @data: Packet buffer * @len: Total L2 packet length - * - * Return: return code from send() or write() */ -int tap_send(const struct ctx *c, const void *data, size_t len) +void tap_send_single(const struct ctx *c, const void *data, size_t len) { - pcap(data, len); + uint32_t vnet_len = htonl(len); + struct iovec iov[2]; + size_t iovcnt = 0; if (c->mode == MODE_PASST) { - int flags = MSG_NOSIGNAL | MSG_DONTWAIT; - uint32_t vnet_len = htonl(len); - - if (send(c->fd_tap, &vnet_len, 4, flags) < 0) - return -1; - - return send(c->fd_tap, data, len, flags); + iov[iovcnt].iov_base = &vnet_len; + iov[iovcnt].iov_len = sizeof(vnet_len); + iovcnt++; } - return write(c->fd_tap, (char *)data, len); + iov[iovcnt].iov_base = (void *)data; + iov[iovcnt].iov_len = len; + iovcnt++; + + tap_send_frames(c, iov, iovcnt, 1); } /** @@ -189,8 +189,7 @@ void tap_udp4_send(const struct ctx *c, struct in_addr src, in_port_t sport, csum_udp4(uh, src, dst, in, len); memcpy(data, in, len); - if (tap_send(c, buf, len + (data - buf)) < 0) - debug("tap: failed to send %zu bytes (IPv4)", len); + tap_send_single(c, buf, len + (data - buf)); } /** @@ -212,8 +211,7 @@ void tap_icmp4_send(const struct ctx *c, struct in_addr src, struct in_addr dst, memcpy(icmp4h, in, len); csum_icmp4(icmp4h, icmp4h + 1, len - sizeof(*icmp4h)); - if (tap_send(c, buf, len + ((char *)icmp4h - buf)) < 0) - debug("tap: failed to send %zu bytes (IPv4)", len); + tap_send_single(c, buf, len + ((char *)icmp4h - buf)); } /** @@ -274,8 +272,7 @@ void tap_udp6_send(const struct ctx *c, csum_udp6(uh, src, dst, in, len); memcpy(data, in, len); - if (tap_send(c, buf, len + (data - buf)) < 1) - debug("tap: failed to send %zu bytes (IPv6)", len); + tap_send_single(c, buf, len + (data - buf)); } /** @@ -298,8 +295,7 @@ void tap_icmp6_send(const struct ctx *c, memcpy(icmp6h, in, len); csum_icmp6(icmp6h, src, dst, icmp6h + 1, len - sizeof(*icmp6h)); - if (tap_send(c, buf, len + ((char *)icmp6h - buf)) < 1) - debug("tap: failed to send %zu bytes (IPv6)", len); + tap_send_single(c, buf, len + ((char *)icmp6h - buf)); } /** diff --git a/tap.h b/tap.h index c45aab3e..aa3b1af2 100644 --- a/tap.h +++ b/tap.h @@ -72,7 +72,7 @@ void tap_udp6_send(const struct ctx *c, void tap_icmp6_send(const struct ctx *c, const struct in6_addr *src, const struct in6_addr *dst, const void *in, size_t len); -int tap_send(const struct ctx *c, const void *data, size_t len); +void tap_send_single(const struct ctx *c, const void *data, size_t len); size_t tap_send_frames(const struct ctx *c, const struct iovec *iov, size_t bufs_per_frame, size_t nframes); void eth_update_mac(struct ethhdr *eh, -- 2.44.0
These two functions are typically used to calculate values to go into the iov_base and iov_len fields of a struct iovec. They don't have to be used for that, though. Rename them in terms of what they actually do: calculate the base address and total length of the complete frame, including both L2 and tap specific headers. Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> --- tap.h | 14 +++++++------- tcp.c | 12 ++++++------ udp.c | 8 ++++---- 3 files changed, 17 insertions(+), 17 deletions(-) diff --git a/tap.h b/tap.h index aa3b1af2..2adc4e2b 100644 --- a/tap.h +++ b/tap.h @@ -27,30 +27,30 @@ static inline size_t tap_hdr_len_(const struct ctx *c) } /** - * tap_iov_base() - Find start of tap frame + * tap_frame_base() - Find start of tap frame * @c: Execution context - * @taph: Pointer to L2 header buffer + * @taph: Pointer to L2 and tap specific header buffer * * Returns: pointer to the start of tap frame - suitable for an * iov_base to be passed to tap_send_frames()) */ -static inline void *tap_iov_base(const struct ctx *c, struct tap_hdr *taph) +static inline void *tap_frame_base(const struct ctx *c, struct tap_hdr *taph) { return (char *)(taph + 1) - tap_hdr_len_(c); } /** - * tap_iov_len() - Finalize tap frame and return total length + * tap_frame_len() - Finalize tap frame and return total length * @c: Execution context * @taph: Tap header to finalize - * @plen: L2 payload length (excludes L2 and tap specific headers) + * @plen: L3 packet length (excludes L2 and tap specific headers) * * Returns: length of the tap frame including L2 and tap specific * headers - suitable for an iov_len to be passed to * tap_send_frames() */ -static inline size_t tap_iov_len(const struct ctx *c, struct tap_hdr *taph, - size_t plen) +static inline size_t tap_frame_len(const struct ctx *c, struct tap_hdr *taph, + size_t plen) { if (c->mode == MODE_PASST) taph->vnet_len = htonl(plen + sizeof(taph->eh)); diff --git a/tcp.c b/tcp.c index 9d90108b..a1860d10 100644 --- a/tcp.c +++ b/tcp.c @@ -1014,10 +1014,10 @@ static void tcp_sock4_iov_init(const struct ctx *c) } for (i = 0, iov = tcp4_l2_iov; i < TCP_FRAMES_MEM; i++, iov++) - iov->iov_base = tap_iov_base(c, &tcp4_l2_buf[i].taph); + iov->iov_base = tap_frame_base(c, &tcp4_l2_buf[i].taph); for (i = 0, iov = tcp4_l2_flags_iov; i < TCP_FRAMES_MEM; i++, iov++) - iov->iov_base = tap_iov_base(c, &tcp4_l2_flags_buf[i].taph); + iov->iov_base = tap_frame_base(c, &tcp4_l2_flags_buf[i].taph); } /** @@ -1045,10 +1045,10 @@ static void tcp_sock6_iov_init(const struct ctx *c) } for (i = 0, iov = tcp6_l2_iov; i < TCP_FRAMES_MEM; i++, iov++) - iov->iov_base = tap_iov_base(c, &tcp6_l2_buf[i].taph); + iov->iov_base = tap_frame_base(c, &tcp6_l2_buf[i].taph); for (i = 0, iov = tcp6_l2_flags_iov; i < TCP_FRAMES_MEM; i++, iov++) - iov->iov_base = tap_iov_base(c, &tcp6_l2_flags_buf[i].taph); + iov->iov_base = tap_frame_base(c, &tcp6_l2_flags_buf[i].taph); } /** @@ -1454,14 +1454,14 @@ static size_t tcp_l2_buf_fill_headers(const struct ctx *c, ip_len = tcp_fill_headers4(c, conn, &b->iph, &b->th, plen, check, seq); - tlen = tap_iov_len(c, &b->taph, ip_len); + tlen = tap_frame_len(c, &b->taph, ip_len); } else { struct tcp6_l2_buf_t *b = (struct tcp6_l2_buf_t *)p; ip_len = tcp_fill_headers6(c, conn, &b->ip6h, &b->th, plen, seq); - tlen = tap_iov_len(c, &b->taph, ip_len); + tlen = tap_frame_len(c, &b->taph, ip_len); } return tlen; diff --git a/udp.c b/udp.c index cba595c9..0a7f3b78 100644 --- a/udp.c +++ b/udp.c @@ -318,7 +318,7 @@ static void udp_sock4_iov_init_one(const struct ctx *c, size_t i) mh->msg_iov = siov; mh->msg_iovlen = 1; - tiov->iov_base = tap_iov_base(c, &buf->taph); + tiov->iov_base = tap_frame_base(c, &buf->taph); } /** @@ -346,7 +346,7 @@ static void udp_sock6_iov_init_one(const struct ctx *c, size_t i) mh->msg_iov = siov; mh->msg_iovlen = 1; - tiov->iov_base = tap_iov_base(c, &buf->taph); + tiov->iov_base = tap_frame_base(c, &buf->taph); } /** @@ -606,7 +606,7 @@ static size_t udp_update_hdr4(const struct ctx *c, struct udp4_l2_buf_t *b, b->uh.dest = htons(dstport); b->uh.len = htons(datalen + sizeof(b->uh)); - return tap_iov_len(c, &b->taph, ip_len); + return tap_frame_len(c, &b->taph, ip_len); } /** @@ -673,7 +673,7 @@ static size_t udp_update_hdr6(const struct ctx *c, struct udp6_l2_buf_t *b, b->uh.len = b->ip6h.payload_len; csum_udp6(&b->uh, src, dst, b->data, datalen); - return tap_iov_len(c, &b->taph, payload_len + sizeof(b->ip6h)); + return tap_frame_len(c, &b->taph, payload_len + sizeof(b->ip6h)); } /** -- 2.44.0
On 3/8/24 07:53, David Gibson wrote:This series has a handful of small improvements to the tap send path. See individual commit messages for the details. I expect this will conflict with Laurent's upcoming work. I hope the conflicts won't be too bad, and indeed will set us up for less duplication there in the end.I'm working on patch that devides TCP buffers in several buffers pointed out by an IOV arrays and then provided to tap_send_frames(). I'm going to base my patch on this series. The idea is: A frame is made with 4 iovecs: #define TCP_IOV_VNET 0 #define TCP_IOV_ETH 1 #define TCP_IOV_IP 2 #define TCP_IOV_PAYLOAD 3 #define TCP_IOV_NUM 4 typedef struct iovec tap_iovec_t[TCP_IOV_NUM]; The payload can be TCP + data or TCP + flags: struct tcp_l2_flags_t { struct tcphdr th; char opts[OPT_MSS_LEN + OPT_WS_LEN + 1]; }; struct tcp_l2_payload_t { struct tcphdr th; /* 20 bytes */ uint8_t data[MSS]; /* 65516 bytes */ #ifdef __AVX2__ } __attribute__ ((packed, aligned(32))); #else } __attribute__ ((packed, aligned(__alignof__(unsigned int)))); #endif static struct ethhdr tcp4_eth_src; static struct iphdr tcp4_l2_ip[TCP_FRAMES_MEM]; static struct tcp_l2_payload_t tcp4_l2_payload[TCP_FRAMES_MEM]; static struct tcp_buf_seq_update tcp4_l2_buf_seq_update[TCP_FRAMES_MEM]; static unsigned int tcp4_l2_buf_used; static tap_iovec_t tcp4_l2_iov [TCP_FRAMES_MEM]; Initialization looks like: tcp4_eth_src.h_proto = htons_constant(ETH_P_IP); for (i = 0; i < TCP_FRAMES_MEM; i++) { ... /* headers */ tcp4_l2_ip[i] = iph; tcp4_l2_payload[i].th = (struct tcphdr){ .doff = sizeof(struct tcphdr) / 4, .ack = 1 }; ... /* iovecs */ iov = tcp4_l2_iov[i]; iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src; iov[TCP_IOV_ETH].iov_len = sizeof(struct ethhdr); iov[TCP_IOV_IP].iov_base = &tcp4_l2_ip[i]; iov[TCP_IOV_IP].iov_len = sizeof(struct iphdr); iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_l2_payload[i]; ... } Then to fill the payload header (data are received by tcp_data_from_sock()): iov = tcp4_l2_iov[tcp4_l2_buf_used++]; iov[TCP_IOV_PAYLOAD].iov_len = tcp_l2_buf_fill_headers(c, conn, iov, plen, check, seq); And the frame is sent using: m = tap_send_iov(c, tcp4_l2_iov, tcp4_l2_buf_used); For the moment (I'll rebase on your series), tap_send_iov_passt() looks like: static size_t tap_send_iov_passt(const struct ctx *c, tap_iovec_t *tap_iov, size_t n) { unsigned int i; for (i = 0; i < n; i++) { uint32_t vnet_len; int j; vnet_len = 0; for (j = TCP_IOV_ETH; j < TCP_IOV_NUM; j++) vnet_len += tap_iov[i][j].iov_len; tap_iov[i][TCP_IOV_VNET].iov_base = &vnet_len; tap_iov[i][TCP_IOV_VNET].iov_len = sizeof(vnet_len); if (!tap_send_frames_passt(c, tap_iov[i], TCP_IOV_NUM)) break; } return i; } Thanks, LaurentThis is based on Laurent's patch fixing pcap_multiple() not to capture frames we failed to send. David Gibson (4): tap: Extend tap_send_frames() to allow multi-buffer frames tap: Simplify some casts in the tap "slow path" functions tap: Implement tap_send() "slow path" in terms of fast path tap: Rename tap_iov_{base,len} arp.c | 4 +- tap.c | 158 +++++++++++++++++++++++++++++++--------------------------- tap.h | 19 +++---- tcp.c | 20 ++++---- udp.c | 10 ++-- 5 files changed, 111 insertions(+), 100 deletions(-)
On Fri, 8 Mar 2024 09:18:48 +0100 Laurent Vivier <lvivier(a)redhat.com> wrote:On 3/8/24 07:53, David Gibson wrote:Except for the typedef :) (I'm actively trying to discourage them) ...this looks very neat. I would suggest that as soon as you have something barely spitting out pseudo-correct bytes, you give it a try with perf(1). I'm quite concerned that we might end up killing throughput, especially without vhost-user, even though in theory it all sounds nice and byte-saving. -- StefanoThis series has a handful of small improvements to the tap send path. See individual commit messages for the details. I expect this will conflict with Laurent's upcoming work. I hope the conflicts won't be too bad, and indeed will set us up for less duplication there in the end.I'm working on patch that devides TCP buffers in several buffers pointed out by an IOV arrays and then provided to tap_send_frames(). I'm going to base my patch on this series. The idea is: A frame is made with 4 iovecs: #define TCP_IOV_VNET 0 #define TCP_IOV_ETH 1 #define TCP_IOV_IP 2 #define TCP_IOV_PAYLOAD 3 #define TCP_IOV_NUM 4 typedef struct iovec tap_iovec_t[TCP_IOV_NUM];
On 3/8/24 09:34, Stefano Brivio wrote:On Fri, 8 Mar 2024 09:18:48 +0100 Laurent Vivier <lvivier(a)redhat.com> wrote:ok...On 3/8/24 07:53, David Gibson wrote:Except for the typedef :) (I'm actively trying to discourage them)This series has a handful of small improvements to the tap send path. See individual commit messages for the details. I expect this will conflict with Laurent's upcoming work. I hope the conflicts won't be too bad, and indeed will set us up for less duplication there in the end.I'm working on patch that devides TCP buffers in several buffers pointed out by an IOV arrays and then provided to tap_send_frames(). I'm going to base my patch on this series. The idea is: A frame is made with 4 iovecs: #define TCP_IOV_VNET 0 #define TCP_IOV_ETH 1 #define TCP_IOV_IP 2 #define TCP_IOV_PAYLOAD 3 #define TCP_IOV_NUM 4 typedef struct iovec tap_iovec_t[TCP_IOV_NUM];...this looks very neat. I would suggest that as soon as you have something barely spitting out pseudo-correct bytes, you give it a try with perf(1). I'm quite concerned that we might end up killing throughput, especially without vhost-user, even though in theory it all sounds nice and byte-saving.I hope to have something to test today... Thanks, Laurent
On 3/8/24 09:34, Stefano Brivio wrote:On Fri, 8 Mar 2024 09:18:48 +0100 Laurent Vivier <lvivier(a)redhat.com> wrote:Using iovec improves performance: iperf3 -c localhost -p 10001 -t 60 -4 iovec [ 5] 0.00-60.00 sec 32.9 GBytes 4.72 Gbits/sec 0 sender [ 5] 0.00-60.06 sec 32.9 GBytes 4.71 Gbits/sec receiver buf_t [ 5] 0.00-60.00 sec 15.9 GBytes 2.27 Gbits/sec 0 sender [ 5] 0.00-60.06 sec 15.9 GBytes 2.27 Gbits/sec receiver iperf3 -c localhost -p 10001 -t 60 -6 iovec [ 5] 0.00-60.00 sec 26.0 GBytes 3.73 Gbits/sec 0 sender [ 5] 0.00-60.06 sec 26.0 GBytes 3.72 Gbits/sec receiver buf_t [ 5] 0.00-60.00 sec 15.7 GBytes 2.25 Gbits/sec 0 sender [ 5] 0.00-60.07 sec 15.7 GBytes 2.25 Gbits/sec receiver Thanks, LaurentOn 3/8/24 07:53, David Gibson wrote:Except for the typedef :) (I'm actively trying to discourage them) ...this looks very neat. I would suggest that as soon as you have something barely spitting out pseudo-correct bytes, you give it a try with perf(1). I'm quite concerned that we might end up killing throughput, especially without vhost-user, even though in theory it all sounds nice and byte-saving.This series has a handful of small improvements to the tap send path. See individual commit messages for the details. I expect this will conflict with Laurent's upcoming work. I hope the conflicts won't be too bad, and indeed will set us up for less duplication there in the end.I'm working on patch that devides TCP buffers in several buffers pointed out by an IOV arrays and then provided to tap_send_frames(). I'm going to base my patch on this series. The idea is: A frame is made with 4 iovecs: #define TCP_IOV_VNET 0 #define TCP_IOV_ETH 1 #define TCP_IOV_IP 2 #define TCP_IOV_PAYLOAD 3 #define TCP_IOV_NUM 4 typedef struct iovec tap_iovec_t[TCP_IOV_NUM];
On Fri, 8 Mar 2024 16:49:20 +0100 Laurent Vivier <lvivier(a)redhat.com> wrote:On 3/8/24 09:34, Stefano Brivio wrote:! ...it must be some alignment matter. Nice. I guess it doesn't happen with large windows (-w ...) or messages (-l 1M), but it looks promising enough with small ones. -- StefanoOn Fri, 8 Mar 2024 09:18:48 +0100 Laurent Vivier <lvivier(a)redhat.com> wrote:Using iovec improves performance: iperf3 -c localhost -p 10001 -t 60 -4 iovec [ 5] 0.00-60.00 sec 32.9 GBytes 4.72 Gbits/sec 0 sender [ 5] 0.00-60.06 sec 32.9 GBytes 4.71 Gbits/sec receiver buf_t [ 5] 0.00-60.00 sec 15.9 GBytes 2.27 Gbits/sec 0 sender [ 5] 0.00-60.06 sec 15.9 GBytes 2.27 Gbits/sec receiverOn 3/8/24 07:53, David Gibson wrote:Except for the typedef :) (I'm actively trying to discourage them) ...this looks very neat. I would suggest that as soon as you have something barely spitting out pseudo-correct bytes, you give it a try with perf(1). I'm quite concerned that we might end up killing throughput, especially without vhost-user, even though in theory it all sounds nice and byte-saving.This series has a handful of small improvements to the tap send path. See individual commit messages for the details. I expect this will conflict with Laurent's upcoming work. I hope the conflicts won't be too bad, and indeed will set us up for less duplication there in the end.I'm working on patch that devides TCP buffers in several buffers pointed out by an IOV arrays and then provided to tap_send_frames(). I'm going to base my patch on this series. The idea is: A frame is made with 4 iovecs: #define TCP_IOV_VNET 0 #define TCP_IOV_ETH 1 #define TCP_IOV_IP 2 #define TCP_IOV_PAYLOAD 3 #define TCP_IOV_NUM 4 typedef struct iovec tap_iovec_t[TCP_IOV_NUM];
On Fri, Mar 08, 2024 at 09:18:48AM +0100, Laurent Vivier wrote:On 3/8/24 07:53, David Gibson wrote:General concept seems good. Unless you have a specific reason to do so, I'd suggest keeping VNET and ETH - i.e. L2 and everything below it - together. As well as just making one less buffer for each frame, I think that will make life easier if we want to add an L2 interface with non-Ethernet framing (e.g. "tun" instead of "tap").This series has a handful of small improvements to the tap send path. See individual commit messages for the details. I expect this will conflict with Laurent's upcoming work. I hope the conflicts won't be too bad, and indeed will set us up for less duplication there in the end.I'm working on patch that devides TCP buffers in several buffers pointed out by an IOV arrays and then provided to tap_send_frames(). I'm going to base my patch on this series. The idea is: A frame is made with 4 iovecs: #define TCP_IOV_VNET 0 #define TCP_IOV_ETH 1 #define TCP_IOV_IP 2 #define TCP_IOV_PAYLOAD 3 #define TCP_IOV_NUM 4 typedef struct iovec tap_iovec_t[TCP_IOV_NUM];The payload can be TCP + data or TCP + flags: struct tcp_l2_flags_t { struct tcphdr th; char opts[OPT_MSS_LEN + OPT_WS_LEN + 1]; }; struct tcp_l2_payload_t { struct tcphdr th; /* 20 bytes */ uint8_t data[MSS]; /* 65516 bytes */ #ifdef __AVX2__ } __attribute__ ((packed, aligned(32))); #else } __attribute__ ((packed, aligned(__alignof__(unsigned int)))); #endifNot sure if we'd be better off with this approach, or having both IP and L4 headers together, and the L4 payload in another. The latter would mean duplicating the TCP or UDP headers between the IPv4 and IPv6 cases, but it allows the data buffers - which will be used directly on the socket side. -- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson
On 3/8/24 13:42, David Gibson wrote: ...The main idea behind using iovec is to separate tcphdr and iphdr structures, allowing for direct access to the pointers of tcphdr and iphdr without concerns about pointer alignment. Moreover, having tcphdr and TCP payload in the same vector can make sense when computing the TCP checksum, as the checksum includes tcphdr and the payload. Thanks, LaurentThe payload can be TCP + data or TCP + flags: struct tcp_l2_flags_t { struct tcphdr th; char opts[OPT_MSS_LEN + OPT_WS_LEN + 1]; }; struct tcp_l2_payload_t { struct tcphdr th; /* 20 bytes */ uint8_t data[MSS]; /* 65516 bytes */ #ifdef __AVX2__ } __attribute__ ((packed, aligned(32))); #else } __attribute__ ((packed, aligned(__alignof__(unsigned int)))); #endifNot sure if we'd be better off with this approach, or having both IP and L4 headers together, and the L4 payload in another. The latter would mean duplicating the TCP or UDP headers between the IPv4 and IPv6 cases, but it allows the data buffers - which will be used directly on the socket side.
On Fri, Mar 08, 2024 at 05:49:15PM +0100, Laurent Vivier wrote:On 3/8/24 13:42, David Gibson wrote: ...Ah, yes. That makes sense. -- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibsonThe main idea behind using iovec is to separate tcphdr and iphdr structures, allowing for direct access to the pointers of tcphdr and iphdr without concerns about pointer alignment. Moreover, having tcphdr and TCP payload in the same vector can make sense when computing the TCP checksum, as the checksum includes tcphdr and the payload.The payload can be TCP + data or TCP + flags: struct tcp_l2_flags_t { struct tcphdr th; char opts[OPT_MSS_LEN + OPT_WS_LEN + 1]; }; struct tcp_l2_payload_t { struct tcphdr th; /* 20 bytes */ uint8_t data[MSS]; /* 65516 bytes */ #ifdef __AVX2__ } __attribute__ ((packed, aligned(32))); #else } __attribute__ ((packed, aligned(__alignof__(unsigned int)))); #endifNot sure if we'd be better off with this approach, or having both IP and L4 headers together, and the L4 payload in another. The latter would mean duplicating the TCP or UDP headers between the IPv4 and IPv6 cases, but it allows the data buffers - which will be used directly on the socket side.
On 3/8/24 13:42, David Gibson wrote:On Fri, Mar 08, 2024 at 09:18:48AM +0100, Laurent Vivier wrote:In fact keeping vnet header separated from eth header makes easier to remove it from the list to pass the iovec array to pcap functions and to pasta send function (can use iovec[1] and iovcount - 1). Thanks, LaurentOn 3/8/24 07:53, David Gibson wrote:General concept seems good. Unless you have a specific reason to do so, I'd suggest keeping VNET and ETH - i.e. L2 and everything below it - together. As well as just making one less buffer for each frame, I think that will make life easier if we want to add an L2 interface with non-Ethernet framing (e.g. "tun" instead of "tap").This series has a handful of small improvements to the tap send path. See individual commit messages for the details. I expect this will conflict with Laurent's upcoming work. I hope the conflicts won't be too bad, and indeed will set us up for less duplication there in the end.I'm working on patch that devides TCP buffers in several buffers pointed out by an IOV arrays and then provided to tap_send_frames(). I'm going to base my patch on this series. The idea is: A frame is made with 4 iovecs: #define TCP_IOV_VNET 0 #define TCP_IOV_ETH 1 #define TCP_IOV_IP 2 #define TCP_IOV_PAYLOAD 3 #define TCP_IOV_NUM 4 typedef struct iovec tap_iovec_t[TCP_IOV_NUM];
On Mon, Mar 11, 2024 at 12:02:08PM +0100, Laurent Vivier wrote:On 3/8/24 13:42, David Gibson wrote:Ok, fair point. -- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibsonOn Fri, Mar 08, 2024 at 09:18:48AM +0100, Laurent Vivier wrote:In fact keeping vnet header separated from eth header makes easier to remove it from the list to pass the iovec array to pcap functions and to pasta send function (can use iovec[1] and iovcount - 1).On 3/8/24 07:53, David Gibson wrote:General concept seems good. Unless you have a specific reason to do so, I'd suggest keeping VNET and ETH - i.e. L2 and everything below it - together. As well as just making one less buffer for each frame, I think that will make life easier if we want to add an L2 interface with non-Ethernet framing (e.g. "tun" instead of "tap").This series has a handful of small improvements to the tap send path. See individual commit messages for the details. I expect this will conflict with Laurent's upcoming work. I hope the conflicts won't be too bad, and indeed will set us up for less duplication there in the end.I'm working on patch that devides TCP buffers in several buffers pointed out by an IOV arrays and then provided to tap_send_frames(). I'm going to base my patch on this series. The idea is: A frame is made with 4 iovecs: #define TCP_IOV_VNET 0 #define TCP_IOV_ETH 1 #define TCP_IOV_IP 2 #define TCP_IOV_PAYLOAD 3 #define TCP_IOV_NUM 4 typedef struct iovec tap_iovec_t[TCP_IOV_NUM];
On Fri, 8 Mar 2024 17:53:21 +1100 David Gibson <david(a)gibson.dropbear.id.au> wrote:This series has a handful of small improvements to the tap send path. See individual commit messages for the details. I expect this will conflict with Laurent's upcoming work. I hope the conflicts won't be too bad, and indeed will set us up for less duplication there in the end. This is based on Laurent's patch fixing pcap_multiple() not to capture frames we failed to send. David Gibson (4): tap: Extend tap_send_frames() to allow multi-buffer frames tap: Simplify some casts in the tap "slow path" functions tap: Implement tap_send() "slow path" in terms of fast path tap: Rename tap_iov_{base,len}Applied, with your new version of the comment to 'rembufs' in 1/4. -- Stefano