On Tue, Mar 26, 2024 at 11:19:22AM +0100, Laurent Vivier wrote: Hi, I compared perf result using this patch and a patch changing tap_send_frames_passt() to: static size_t tap_send_frames_passt(const struct ctx *c, const struct iovec *iov, size_t bufs_per_frame, size_t nframes) { struct msghdr mh = { .msg_iovlen = bufs_per_frame, }; size_t buf_offset; unsigned int i; ssize_t sent; for (i = 0; i < nframes; i++) { unsigned int j; if (bufs_per_frame > 1) { /* if we have more than 1 iovec, the first one is vnet_len */ uint32_t *p = iov[i * bufs_per_frame].iov_base; uint32_t vnet_len = 0; for (j = 1; j < bufs_per_frame; j++) vnet_len += iov[i * bufs_per_frame + j].iov_len; vnet_len = htonl(vnet_len); *p = vnet_len; } mh.msg_iov = (void *)&iov[i * bufs_per_frame]; sent = sendmsg(c->fd_tap, &mh, MSG_NOSIGNAL | MSG_DONTWAIT); if (sent < 0) return i; /* Check for any partial frames due to short send */ j = iov_skip_bytes(&iov[i * bufs_per_frame], bufs_per_frame, sent, &buf_offset); if (buf_offset && j < bufs_per_frame) { if (write_remainder(c->fd_tap, &iov[i * bufs_per_frame + j], bufs_per_frame - j, buf_offset) < 0) { err("tap: partial frame send: %s", strerror(errno)); return i; } } } return i; } And the result of 'perf record -e cache-misses' gives: slow 83.95% passt.avx2 passt.avx2 [.] csum_avx2 4.39% passt.avx2 passt.avx2 [.] tap4_handler 2.37% passt.avx2 libc.so.6 [.] __printf_buffer 0.84% passt.avx2 passt.avx2 [.] udp_timer fast 22.15% passt.avx2 passt.avx2 [.] csum_avx2 14.91% passt.avx2 passt.avx2 [.] udp_timer 7.60% passt.avx2 libc.so.6 [.] __printf_buffer 5.10% passt.avx2 passt.avx2 [.] ffsl Well... I *guess* that means we're getting more cache misses in the batched version, as we suspected. I'm a bit mystified as to how to interpret those percentages, though. Is that the percentage of total cache misses that occur in that function? The percentage of times that function generates a cache miss (what if it generates more than one)? Something else?
If this does indicate many more cache misses computing the checksum, I'm still a bit baffled as to what's going on. It doesn't quite fit with the theory I had: the csum_avx2() calls are in the "first loop" in both these scenarios - my theory would suggest more cache misses in the "second loop" instead (in the kernel inside sendmsg()). What happens if you fill in the vnet_len field in the first loop, but still use a sendmsg() per frame, instead of one batched one? From d4b3e12132ceaf5484de215e9c84cbedcbbb8188 Mon Sep 17 00:00:00 2001 From: Laurent Vivier <lvivier(a)redhat.com> Date: Tue, 19 Mar 2024 18:20:20 +0100 Subject: [PATCH] tap: compute vnet_len inside tap_send_frames_passt() Signed-off-by: Laurent Vivier <lvivier(a)redhat.com> --- tap.c | 49 +++++++++++++++++++++++++++++++++---------------- tcp.c | 39 ++++++++++----------------------------- 2 files changed, 43 insertions(+), 45 deletions(-) diff --git a/tap.c b/tap.c index 13e4da79d690..1096272b411a 100644 --- a/tap.c +++ b/tap.c @@ -74,7 +74,7 @@ static PACKET_POOL_NOINIT(pool_tap6, TAP_MSGS, pkt_buf); */ void tap_send_single(const struct ctx *c, const void *data, size_t len) { - uint32_t vnet_len = htonl(len); + uint32_t vnet_len; struct iovec iov[2]; size_t iovcnt = 0; @@ -365,34 +365,51 @@ static size_t tap_send_frames_passt(const struct ctx *c, const struct iovec *iov, size_t bufs_per_frame, size_t nframes) { - size_t nbufs = bufs_per_frame * nframes; struct msghdr mh = { - .msg_iov = (void *)iov, - .msg_iovlen = nbufs, + .msg_iovlen = bufs_per_frame, }; size_t buf_offset; unsigned int i; ssize_t sent; - sent = sendmsg(c->fd_tap, &mh, MSG_NOSIGNAL | MSG_DONTWAIT); - if (sent < 0) - return 0; + for (i = 0; i < nframes; i++) { + unsigned int j; + + if (bufs_per_frame > 1) { + /* if we have more than one iovec, the first one is + * vnet_len + */ + uint32_t *p = iov[i * bufs_per_frame].iov_base; + uint32_t vnet_len = 0; - /* Check for any partial frames due to short send */ - i = iov_skip_bytes(iov, nbufs, 
sent, &buf_offset); + for (j = 1; j < bufs_per_frame; j++) + vnet_len += iov[i * bufs_per_frame + j].iov_len; + vnet_len = htonl(vnet_len); + + *p = vnet_len; + } - if (i < nbufs && (buf_offset || (i % bufs_per_frame))) { - /* Number of unsent or partially sent buffers for the frame */ - size_t rembufs = bufs_per_frame - (i % bufs_per_frame); + mh.msg_iov = (void *)&iov[i * bufs_per_frame]; - if (write_remainder(c->fd_tap, &iov[i], rembufs, buf_offset) < 0) { - err("tap: partial frame send: %s", strerror(errno)); + sent = sendmsg(c->fd_tap, &mh, MSG_NOSIGNAL | MSG_DONTWAIT); + if (sent < 0) return i; + + /* Check for any partial frames due to short send */ + j = iov_skip_bytes(&iov[i * bufs_per_frame], bufs_per_frame, sent, &buf_offset); + + if (buf_offset && j < bufs_per_frame) { + if (write_remainder(c->fd_tap, &iov[i * bufs_per_frame + j], + bufs_per_frame - j, + buf_offset) < 0) { + err("tap: partial frame send: %s", + strerror(errno)); + return i; + } } - i += rembufs; } - return i / bufs_per_frame; + return i; } /** diff --git a/tcp.c b/tcp.c index cc705064f059..d147e2c41648 100644 --- a/tcp.c +++ b/tcp.c @@ -443,10 +443,11 @@ struct tcp_flags_t { } __attribute__ ((packed, aligned(__alignof__(unsigned int)))); #endif +static uint32_t vnet_len; + /* Ethernet header for IPv4 frames */ static struct ethhdr tcp4_eth_src; -static uint32_t tcp4_payload_vnet_len[TCP_FRAMES_MEM]; /* IPv4 headers */ static struct iphdr tcp4_payload_ip[TCP_FRAMES_MEM]; /* TCP headers and data for IPv4 frames */ @@ -457,7 +458,6 @@ static_assert(MSS4 <= sizeof(tcp4_payload[0].data), "MSS4 is greater than 65516" static struct tcp_buf_seq_update tcp4_seq_update[TCP_FRAMES_MEM]; static unsigned int tcp4_payload_used; -static uint32_t tcp4_flags_vnet_len[TCP_FRAMES_MEM]; /* IPv4 headers for TCP option flags frames */ static struct iphdr tcp4_flags_ip[TCP_FRAMES_MEM]; /* TCP headers and option flags for IPv4 frames */ @@ -468,7 +468,6 @@ static unsigned int tcp4_flags_used; /* Ethernet 
header for IPv6 frames */ static struct ethhdr tcp6_eth_src; -static uint32_t tcp6_payload_vnet_len[TCP_FRAMES_MEM]; /* IPv6 headers */ static struct ipv6hdr tcp6_payload_ip[TCP_FRAMES_MEM]; /* TCP headers and data for IPv6 frames */ @@ -479,7 +478,6 @@ static_assert(MSS6 <= sizeof(tcp6_payload[0].data), "MSS6 is greater than 65516" static struct tcp_buf_seq_update tcp6_seq_update[TCP_FRAMES_MEM]; static unsigned int tcp6_payload_used; -static uint32_t tcp6_flags_vnet_len[TCP_FRAMES_MEM]; /* IPv6 headers for TCP option flags frames */ static struct ipv6hdr tcp6_flags_ip[TCP_FRAMES_MEM]; /* TCP headers and option flags for IPv6 frames */ @@ -944,9 +942,8 @@ static void tcp_sock4_iov_init(const struct ctx *c) /* iovecs */ iov = tcp4_l2_iov[i]; - iov[TCP_IOV_TAP].iov_base = &tcp4_payload_vnet_len[i]; - iov[TCP_IOV_TAP].iov_len = c->mode == MODE_PASST ? - sizeof(tcp4_payload_vnet_len[i]) : 0; + iov[TCP_IOV_TAP].iov_base = &vnet_len; + iov[TCP_IOV_TAP].iov_len = sizeof(vnet_len); iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src; iov[TCP_IOV_ETH].iov_len = sizeof(tcp4_eth_src); iov[TCP_IOV_IP].iov_base = &tcp4_payload_ip[i]; @@ -954,9 +951,8 @@ static void tcp_sock4_iov_init(const struct ctx *c) iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_payload[i]; iov = tcp4_l2_flags_iov[i]; - iov[TCP_IOV_TAP].iov_base = &tcp4_flags_vnet_len[i]; - iov[TCP_IOV_TAP].iov_len = c->mode == MODE_PASST ? - sizeof(tcp4_flags_vnet_len[i]) : 0; + iov[TCP_IOV_TAP].iov_base = &vnet_len; + iov[TCP_IOV_TAP].iov_len = sizeof(vnet_len); iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src; iov[TCP_IOV_ETH].iov_len = sizeof(tcp4_eth_src); iov[TCP_IOV_IP].iov_base = &tcp4_flags_ip[i]; @@ -989,9 +985,8 @@ static void tcp_sock6_iov_init(const struct ctx *c) /* iovecs */ iov = tcp6_l2_iov[i]; - iov[TCP_IOV_TAP].iov_base = &tcp6_payload_vnet_len[i]; - iov[TCP_IOV_TAP].iov_len = c->mode == MODE_PASST ? 
- sizeof(tcp6_payload_vnet_len[i]) : 0; + iov[TCP_IOV_TAP].iov_base = &vnet_len; + iov[TCP_IOV_TAP].iov_len = sizeof(vnet_len); iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src; iov[TCP_IOV_ETH].iov_len = sizeof(tcp6_eth_src); iov[TCP_IOV_IP].iov_base = &tcp6_payload_ip[i]; @@ -999,9 +994,8 @@ static void tcp_sock6_iov_init(const struct ctx *c) iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_payload[i]; iov = tcp6_l2_flags_iov[i]; - iov[TCP_IOV_TAP].iov_base = &tcp6_flags_vnet_len[i]; - iov[TCP_IOV_TAP].iov_len = c->mode == MODE_PASST ? - sizeof(tcp6_flags_vnet_len[i]) : 0; + iov[TCP_IOV_TAP].iov_base = &vnet_len; + iov[TCP_IOV_TAP].iov_len = sizeof(vnet_len); iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src; iov[TCP_IOV_ETH].iov_len = sizeof(tcp6_eth_src); iov[TCP_IOV_IP].iov_base = &tcp6_flags_ip[i]; @@ -1558,7 +1552,6 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) struct tcp_info tinfo = { 0 }; socklen_t sl = sizeof(tinfo); int s = conn->sock; - uint32_t vnet_len; size_t optlen = 0; struct tcphdr *th; struct iovec *iov; @@ -1587,10 +1580,8 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) if (CONN_V4(conn)) { iov = tcp4_l2_flags_iov[tcp4_flags_used++]; - vnet_len = sizeof(struct ethhdr) + sizeof(struct iphdr); } else { iov = tcp6_l2_flags_iov[tcp6_flags_used++]; - vnet_len = sizeof(struct ethhdr) + sizeof(struct ipv6hdr); } payload = iov[TCP_IOV_PAYLOAD].iov_base; @@ -1649,8 +1640,6 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) conn->seq_to_tap); iov[TCP_IOV_PAYLOAD].iov_len = ip_len; - *(uint32_t *)iov[TCP_IOV_TAP].iov_base = htonl(vnet_len + ip_len); - if (th->ack) { if (SEQ_GE(conn->seq_ack_to_tap, conn->seq_from_tap)) conn_flag(c, conn, ~ACK_TO_TAP_DUE); @@ -2150,10 +2139,6 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn, ip_len = tcp_l2_buf_fill_headers(c, conn, iov, plen, check, seq); iov[TCP_IOV_PAYLOAD].iov_len = ip_len; - *(uint32_t 
*)iov[TCP_IOV_TAP].iov_base = - htonl(sizeof(struct ethhdr) + - sizeof(struct iphdr) + - ip_len); if (tcp4_payload_used > TCP_FRAMES_MEM - 1) tcp_payload_flush(c); } else if (CONN_V6(conn)) { @@ -2163,10 +2148,6 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn, iov = tcp6_l2_iov[tcp6_payload_used++]; ip_len = tcp_l2_buf_fill_headers(c, conn, iov, plen, NULL, seq); iov[TCP_IOV_PAYLOAD].iov_len = ip_len; - *(uint32_t *)iov[TCP_IOV_TAP].iov_base = - htonl(sizeof(struct ethhdr) + - sizeof(struct ipv6hdr) + - ip_len); if (tcp6_payload_used > TCP_FRAMES_MEM - 1) tcp_payload_flush(c); } -- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson