vhost-net is a kernel device that allows reading packets from a tap device using virtio queues instead of regular read(2) and write(2). This enables more efficient packet processing, as the memory can be written directly by the kernel to the userspace and back, instead of wasting bandwidth on copies, and it enables batching many packets in a single notification (through eventfds) both tx and rx. --- RFC: At this moment only receiving packets from the vhost kernel to the pasta binary is supported. It does not even support refilling more rx descriptors so the kernel can keep receiving packets! Just for early review to make sure we're on the same page. Eugenio Pérez (3): tap: specify the packet pool tap: implement vhost_call_cb tap: add die() on vhost error epoll_type.h | 4 + passt.c | 8 ++ passt.h | 13 +- tap.c | 366 +++++++++++++++++++++++++++++++++++++++++++++++++-- tap.h | 6 +- vu_common.c | 7 +- 6 files changed, 387 insertions(+), 17 deletions(-) -- 2.49.0
In vhost-kernel we need to allocate a storage for rx so kernel can write the buffers async. We need to tell tap_add_packet where to find them. Signed-off-by: Eugenio Pérez <eperezma(a)redhat.com> --- tap.c | 34 ++++++++++++++++++++++++---------- tap.h | 4 ++-- vu_common.c | 7 ++++--- 3 files changed, 30 insertions(+), 15 deletions(-) diff --git a/tap.c b/tap.c index 182a115..ce859ba 100644 --- a/tap.c +++ b/tap.c @@ -1031,10 +1031,16 @@ void tap_flush_pools(void) * @c: Execution context * @now: Current timestamp */ -void tap_handler(struct ctx *c, const struct timespec *now) +void tap_handler(struct ctx *c, const struct timespec *now, struct pool *pool4, + struct pool *pool6) { - tap4_handler(c, pool_tap4, now); - tap6_handler(c, pool_tap6, now); + if (!pool4) + pool4 = pool_tap4; + if (!pool6) + pool6 = pool_tap6; + + tap4_handler(c, pool4, now); + tap6_handler(c, pool6, now); } /** @@ -1042,11 +1048,19 @@ void tap_handler(struct ctx *c, const struct timespec *now) * @c: Execution context * @l2len: Total L2 packet length * @p: Packet buffer + * @pool4 Pool for tap ipv4 packets. If NULL, is pool_tap4 + * @pool6 Pool for tap ipv6 packets. 
If NULL, is pool_tap6 */ -void tap_add_packet(struct ctx *c, ssize_t l2len, char *p) +void tap_add_packet(struct ctx *c, ssize_t l2len, char *p, + struct pool *pool4, struct pool *pool6) { const struct ethhdr *eh; + if (!pool4) + pool4 = pool_tap4; + if (!pool6) + pool6 = pool_tap6; + pcap(p, l2len); eh = (struct ethhdr *)p; @@ -1059,10 +1073,10 @@ void tap_add_packet(struct ctx *c, ssize_t l2len, char *p) switch (ntohs(eh->h_proto)) { case ETH_P_ARP: case ETH_P_IP: - packet_add(pool_tap4, l2len, p); + packet_add(pool4, l2len, p); break; case ETH_P_IPV6: - packet_add(pool_tap6, l2len, p); + packet_add(pool6, l2len, p); break; default: break; @@ -1142,7 +1156,7 @@ static void tap_passt_input(struct ctx *c, const struct timespec *now) p += sizeof(uint32_t); n -= sizeof(uint32_t); - tap_add_packet(c, l2len, p); + tap_add_packet(c, l2len, p, pool_tap4, pool_tap6); p += l2len; n -= l2len; @@ -1151,7 +1165,7 @@ static void tap_passt_input(struct ctx *c, const struct timespec *now) partial_len = n; partial_frame = p; - tap_handler(c, now); + tap_handler(c, now, NULL, NULL); } /** @@ -1207,10 +1221,10 @@ static void tap_pasta_input(struct ctx *c, const struct timespec *now) len > (ssize_t)L2_MAX_LEN_PASTA) continue; - tap_add_packet(c, len, pkt_buf + n); + tap_add_packet(c, len, pkt_buf + n, pool_tap4, pool_tap6); } - tap_handler(c, now); + tap_handler(c, now, NULL, NULL); } /** diff --git a/tap.h b/tap.h index dd39fd8..0b5ad17 100644 --- a/tap.h +++ b/tap.h @@ -118,7 +118,7 @@ void tap_sock_reset(struct ctx *c); void tap_sock_update_pool(void *base, size_t size); void tap_backend_init(struct ctx *c); void tap_flush_pools(void); -void tap_handler(struct ctx *c, const struct timespec *now); -void tap_add_packet(struct ctx *c, ssize_t l2len, char *p); +void tap_handler(struct ctx *c, const struct timespec *now, struct pool *pool4, struct pool *pool6); +void tap_add_packet(struct ctx *c, ssize_t l2len, char *p, struct pool *pool4, struct pool *pool6); #endif /* TAP_H */ diff 
--git a/vu_common.c b/vu_common.c index 686a09b..4fe982f 100644 --- a/vu_common.c +++ b/vu_common.c @@ -191,7 +191,7 @@ static void vu_handle_tx(struct vu_dev *vdev, int index, tap_add_packet(vdev->context, elem[count].out_sg[0].iov_len - hdrlen, (char *)elem[count].out_sg[0].iov_base + - hdrlen); + hdrlen, NULL, NULL); } else { /* vnet header can be in a separate iovec */ if (elem[count].out_num != 2) { @@ -203,13 +203,14 @@ static void vu_handle_tx(struct vu_dev *vdev, int index, } else { tap_add_packet(vdev->context, elem[count].out_sg[1].iov_len, - (char *)elem[count].out_sg[1].iov_base); + (char *)elem[count].out_sg[1].iov_base, + NULL, NULL); } } count++; } - tap_handler(vdev->context, now); + tap_handler(vdev->context, now, NULL, NULL); if (count) { int i; -- 2.49.0
On Tue, Apr 01, 2025 at 01:38:07PM +0200, Eugenio Pérez wrote:In vhost-kernel we need to allocate a storage for rx so kernel can write the buffers async. We need to tell tap_add_packet where to find them. Signed-off-by: Eugenio Pérez <eperezma(a)redhat.com>As discussed on our call, I don't think we need this. We should be able to re-use pool_tap[46] for vhost-net.--- tap.c | 34 ++++++++++++++++++++++++---------- tap.h | 4 ++-- vu_common.c | 7 ++++--- 3 files changed, 30 insertions(+), 15 deletions(-) diff --git a/tap.c b/tap.c index 182a115..ce859ba 100644 --- a/tap.c +++ b/tap.c @@ -1031,10 +1031,16 @@ void tap_flush_pools(void) * @c: Execution context * @now: Current timestamp */ -void tap_handler(struct ctx *c, const struct timespec *now) +void tap_handler(struct ctx *c, const struct timespec *now, struct pool *pool4, + struct pool *pool6) { - tap4_handler(c, pool_tap4, now); - tap6_handler(c, pool_tap6, now); + if (!pool4) + pool4 = pool_tap4; + if (!pool6) + pool6 = pool_tap6; + + tap4_handler(c, pool4, now); + tap6_handler(c, pool6, now); } /** @@ -1042,11 +1048,19 @@ void tap_handler(struct ctx *c, const struct timespec *now) * @c: Execution context * @l2len: Total L2 packet length * @p: Packet buffer + * @pool4 Pool for tap ipv4 packets. If NULL, is pool_tap4 + * @pool6 Pool for tap ipv6 packets. 
If NULL, is pool_tap6 */ -void tap_add_packet(struct ctx *c, ssize_t l2len, char *p) +void tap_add_packet(struct ctx *c, ssize_t l2len, char *p, + struct pool *pool4, struct pool *pool6) { const struct ethhdr *eh; + if (!pool4) + pool4 = pool_tap4; + if (!pool6) + pool6 = pool_tap6; + pcap(p, l2len); eh = (struct ethhdr *)p; @@ -1059,10 +1073,10 @@ void tap_add_packet(struct ctx *c, ssize_t l2len, char *p) switch (ntohs(eh->h_proto)) { case ETH_P_ARP: case ETH_P_IP: - packet_add(pool_tap4, l2len, p); + packet_add(pool4, l2len, p); break; case ETH_P_IPV6: - packet_add(pool_tap6, l2len, p); + packet_add(pool6, l2len, p); break; default: break; @@ -1142,7 +1156,7 @@ static void tap_passt_input(struct ctx *c, const struct timespec *now) p += sizeof(uint32_t); n -= sizeof(uint32_t); - tap_add_packet(c, l2len, p); + tap_add_packet(c, l2len, p, pool_tap4, pool_tap6); p += l2len; n -= l2len; @@ -1151,7 +1165,7 @@ static void tap_passt_input(struct ctx *c, const struct timespec *now) partial_len = n; partial_frame = p; - tap_handler(c, now); + tap_handler(c, now, NULL, NULL); } /** @@ -1207,10 +1221,10 @@ static void tap_pasta_input(struct ctx *c, const struct timespec *now) len > (ssize_t)L2_MAX_LEN_PASTA) continue; - tap_add_packet(c, len, pkt_buf + n); + tap_add_packet(c, len, pkt_buf + n, pool_tap4, pool_tap6); } - tap_handler(c, now); + tap_handler(c, now, NULL, NULL); } /** diff --git a/tap.h b/tap.h index dd39fd8..0b5ad17 100644 --- a/tap.h +++ b/tap.h @@ -118,7 +118,7 @@ void tap_sock_reset(struct ctx *c); void tap_sock_update_pool(void *base, size_t size); void tap_backend_init(struct ctx *c); void tap_flush_pools(void); -void tap_handler(struct ctx *c, const struct timespec *now); -void tap_add_packet(struct ctx *c, ssize_t l2len, char *p); +void tap_handler(struct ctx *c, const struct timespec *now, struct pool *pool4, struct pool *pool6); +void tap_add_packet(struct ctx *c, ssize_t l2len, char *p, struct pool *pool4, struct pool *pool6); #endif /* TAP_H */ diff 
--git a/vu_common.c b/vu_common.c index 686a09b..4fe982f 100644 --- a/vu_common.c +++ b/vu_common.c @@ -191,7 +191,7 @@ static void vu_handle_tx(struct vu_dev *vdev, int index, tap_add_packet(vdev->context, elem[count].out_sg[0].iov_len - hdrlen, (char *)elem[count].out_sg[0].iov_base + - hdrlen); + hdrlen, NULL, NULL); } else { /* vnet header can be in a separate iovec */ if (elem[count].out_num != 2) { @@ -203,13 +203,14 @@ static void vu_handle_tx(struct vu_dev *vdev, int index, } else { tap_add_packet(vdev->context, elem[count].out_sg[1].iov_len, - (char *)elem[count].out_sg[1].iov_base); + (char *)elem[count].out_sg[1].iov_base, + NULL, NULL); } } count++; } - tap_handler(vdev->context, now); + tap_handler(vdev->context, now, NULL, NULL); if (count) { int i;-- David Gibson (he or they) | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you, not the other way | around. http://www.ozlabs.org/~dgibson
On Thu, Apr 3, 2025 at 3:51 AM David Gibson <david(a)gibson.dropbear.id.au> wrote:On Tue, Apr 01, 2025 at 01:38:07PM +0200, Eugenio Pérez wrote:Right, I'm able to transmit from the guest reusing pool_tap[46] now, thanks!In vhost-kernel we need to allocate a storage for rx so kernel can write the buffers async. We need to tell tap_add_packet where to find them. Signed-off-by: Eugenio Pérez <eperezma(a)redhat.com>As discussed on our call, I don't think we need this. We should be able to re-use pool_tap[46] for vhost-net.
This is the main callback when the tap device has processed any buffer. Another possibility is to reuse the tap callback for this, so less code changes are needed. Signed-off-by: Eugenio Pérez <eperezma(a)redhat.com> --- epoll_type.h | 2 + passt.c | 4 + passt.h | 12 +- tap.c | 316 ++++++++++++++++++++++++++++++++++++++++++++++++++- tap.h | 2 + 5 files changed, 334 insertions(+), 2 deletions(-) diff --git a/epoll_type.h b/epoll_type.h index 7f2a121..6284c79 100644 --- a/epoll_type.h +++ b/epoll_type.h @@ -44,6 +44,8 @@ enum epoll_type { EPOLL_TYPE_REPAIR_LISTEN, /* TCP_REPAIR helper socket */ EPOLL_TYPE_REPAIR, + /* vhost-kernel call socket */ + EPOLL_TYPE_VHOST_CALL, EPOLL_NUM_TYPES, }; diff --git a/passt.c b/passt.c index cd06772..19c5d5f 100644 --- a/passt.c +++ b/passt.c @@ -79,6 +79,7 @@ char *epoll_type_str[] = { [EPOLL_TYPE_VHOST_KICK] = "vhost-user kick socket", [EPOLL_TYPE_REPAIR_LISTEN] = "TCP_REPAIR helper listening socket", [EPOLL_TYPE_REPAIR] = "TCP_REPAIR helper socket", + [EPOLL_TYPE_VHOST_CALL] = "vhost-kernel call socket", }; static_assert(ARRAY_SIZE(epoll_type_str) == EPOLL_NUM_TYPES, "epoll_type_str[] doesn't match enum epoll_type"); @@ -357,6 +358,9 @@ loop: case EPOLL_TYPE_REPAIR: repair_handler(&c, eventmask); break; + case EPOLL_TYPE_VHOST_CALL: + vhost_call_cb(&c, ref, &now); + break; default: /* Can't happen */ ASSERT(0); diff --git a/passt.h b/passt.h index 8f45091..eb5aa03 100644 --- a/passt.h +++ b/passt.h @@ -45,7 +45,7 @@ union epoll_ref; * @icmp: ICMP-specific reference part * @data: Data handled by protocol handlers * @nsdir_fd: netns dirfd for fallback timer checking if namespace is gone - * @queue: vhost-user queue index for this fd + * @queue: vhost queue index for this fd * @u64: Opaque reference for epoll_ctl() and epoll_wait() */ union epoll_ref { @@ -271,11 +271,14 @@ struct ctx { int fd_tap; int fd_repair_listen; int fd_repair; + /* TODO document all added fields */ + int fd_vhost; unsigned char our_tap_mac[ETH_ALEN]; unsigned 
char guest_mac[ETH_ALEN]; uint16_t mtu; uint64_t hash_secret[2]; + uint64_t virtio_features; int ifi4; struct ip4_ctx ip4; @@ -299,6 +302,13 @@ struct ctx { int no_icmp; struct icmp_ctx icmp; + struct { + uint16_t last_used_idx; + + int kick_fd; + int call_fd; + } vq[2]; + int no_dns; int no_dns_search; int no_dhcp_dns; diff --git a/tap.c b/tap.c index ce859ba..fbe83aa 100644 --- a/tap.c +++ b/tap.c @@ -31,6 +31,7 @@ #include <sys/types.h> #include <sys/stat.h> #include <fcntl.h> +#include <sys/eventfd.h> #include <sys/uio.h> #include <stdbool.h> #include <stdlib.h> @@ -82,6 +83,46 @@ static PACKET_POOL_NOINIT(pool_tap6, TAP_MSGS, pkt_buf); #define TAP_SEQS 128 /* Different L4 tuples in one batch */ #define FRAGMENT_MSG_RATE 10 /* # seconds between fragment warnings */ +#define VHOST_VIRTIO 0xAF +#define VHOST_GET_FEATURES _IOR(VHOST_VIRTIO, 0x00, __u64) +#define VHOST_SET_FEATURES _IOW(VHOST_VIRTIO, 0x00, __u64) +#define VHOST_SET_OWNER _IO(VHOST_VIRTIO, 0x01) +#define VHOST_SET_MEM_TABLE _IOW(VHOST_VIRTIO, 0x03, struct vhost_memory) +#define VHOST_SET_VRING_NUM _IOW(VHOST_VIRTIO, 0x10, struct vhost_vring_state) +#define VHOST_SET_VRING_ADDR _IOW(VHOST_VIRTIO, 0x11, struct vhost_vring_addr) +#define VHOST_SET_VRING_KICK _IOW(VHOST_VIRTIO, 0x20, struct vhost_vring_file) +#define VHOST_SET_VRING_CALL _IOW(VHOST_VIRTIO, 0x21, struct vhost_vring_file) +#define VHOST_SET_VRING_ERR _IOW(VHOST_VIRTIO, 0x22, struct vhost_vring_file) +#define VHOST_SET_BACKEND_FEATURES _IOW(VHOST_VIRTIO, 0x25, __u64) +#define VHOST_NET_SET_BACKEND _IOW(VHOST_VIRTIO, 0x30, struct vhost_vring_file) + +char virtio_rx_pkt_buf[PKT_BUF_BYTES] __attribute__((aligned(PAGE_SIZE))); +static PACKET_POOL_NOINIT(pool_virtiorx4, TAP_MSGS, virtio_rx_pkt_buf); +static PACKET_POOL_NOINIT(pool_virtiorx6, TAP_MSGS, virtio_rx_pkt_buf); + +/* TODO is it better to have 1024 or 65520 bytes per packet? 
*/ +#define VHOST_NDESCS 1024 +static struct vring_desc vring_desc[2][VHOST_NDESCS] __attribute__((aligned(PAGE_SIZE))); +static union { + struct vring_avail avail; + char buf[offsetof(struct vring_avail, ring[VHOST_NDESCS])]; +} vring_avail_0 __attribute__((aligned(PAGE_SIZE))), vring_avail_1 __attribute__((aligned(PAGE_SIZE))); +static union { + struct vring_used used; + char buf[offsetof(struct vring_used, ring[VHOST_NDESCS])]; +} vring_used_0 __attribute__((aligned(PAGE_SIZE))), vring_used_1 __attribute__((aligned(PAGE_SIZE))); + +/* all descs ring + 2rings * 2vqs + tx pkt buf + rx pkt buf */ +#define N_VHOST_REGIONS 7 +union { + struct vhost_memory mem; + char buf[offsetof(struct vhost_memory, regions[N_VHOST_REGIONS])]; +} vhost_memory = { + .mem = { + .nregions = N_VHOST_REGIONS, + }, +}; + /** * tap_l2_max_len() - Maximum frame size (including L2 header) for current mode * @c: Execution context @@ -1360,6 +1401,89 @@ void tap_listen_handler(struct ctx *c, uint32_t events) tap_start_connection(c); } +static void *virtqueue_get_buf(struct ctx *c, unsigned qid, unsigned *len) +{ + struct vring_used *used = !qid ? 
&vring_used_0.used : &vring_used_1.used; + uint32_t i; + uint16_t used_idx, last_used; + + /* TODO think if this has races with previous eventfd_read */ + /* TODO we could improve performance with a shadow_used_idx */ + used_idx = le16toh(used->idx); + + smp_rmb(); + + if (used_idx == c->vq[qid].last_used_idx) { + *len = 0; + return NULL; + } + + last_used = c->vq[qid].last_used_idx & VHOST_NDESCS; + i = le32toh(used->ring[last_used].id); + *len = le32toh(used->ring[last_used].len); + + if (i > VHOST_NDESCS) { + /* TODO imporove this, it will cause infinite loop */ + warn("vhost: id %u at used position %u out of range (max=%u)", i, last_used, VHOST_NDESCS); + return NULL; + } + + if (*len > PKT_BUF_BYTES/VHOST_NDESCS) { + warn("vhost: id %d len %u > %zu", i, *len, PKT_BUF_BYTES/VHOST_NDESCS); + return NULL; + } + + /* TODO check if the id is valid and it has not been double used */ + c->vq[qid].last_used_idx++; + return virtio_rx_pkt_buf + i * (PKT_BUF_BYTES/VHOST_NDESCS); +} + +/* container is tx but we receive it from vhost POV */ +void vhost_call_cb(struct ctx *c, union epoll_ref ref, const struct timespec *now) +{ + eventfd_read(ref.fd, (eventfd_t[]){ 0 }); + + tap_flush_pools(); + + while (true) { + struct virtio_net_hdr_mrg_rxbuf *hdr; + unsigned len; + + hdr = virtqueue_get_buf(c, ref.queue, &len); + if (!hdr) + break; + + if (len < sizeof(*hdr)) { + warn("vhost: invalid len %u", len); + continue; + } + + /* TODO this will break from this moment */ + if (hdr->num_buffers != 1) { + warn("vhost: Too many buffers %u, %zu bytes should be enough for everybody!", hdr->num_buffers, PKT_BUF_BYTES/VHOST_NDESCS); + continue; + } + + /* TODO fix the v6 pool to support ipv6 */ + tap_add_packet(c, len - sizeof(*hdr), (void *)(hdr+1), pool_virtiorx4, pool_virtiorx6); + } + + tap_handler(c, now, pool_virtiorx4, pool_virtiorx6); +} + +/* TODO: Actually refill */ +static void rx_pkt_refill(int kick_fd) +{ + for (unsigned i = 0; i < VHOST_NDESCS; ++i) { + 
vring_desc[0][i].addr = (uintptr_t)virtio_rx_pkt_buf + i * (PKT_BUF_BYTES/VHOST_NDESCS); + vring_desc[0][i].len = PKT_BUF_BYTES/VHOST_NDESCS; + vring_desc[0][i].flags = VRING_DESC_F_WRITE; + } + + vring_avail_0.avail.idx = VHOST_NDESCS; + eventfd_write(kick_fd, 1); +} + /** * tap_ns_tun() - Get tuntap fd in namespace * @c: Execution context @@ -1370,10 +1494,13 @@ void tap_listen_handler(struct ctx *c, uint32_t events) */ static int tap_ns_tun(void *arg) { + /* TODO we need to check if vhost support VIRTIO_NET_F_MRG_RXBUF and VHOST_NET_F_VIRTIO_NET_HDR actually */ + static const uint64_t features = + (1ULL << VIRTIO_F_VERSION_1) | (1ULL << VIRTIO_NET_F_MRG_RXBUF) | (1ULL << VHOST_NET_F_VIRTIO_NET_HDR); struct ifreq ifr = { .ifr_flags = IFF_TAP | IFF_NO_PI }; int flags = O_RDWR | O_NONBLOCK | O_CLOEXEC; struct ctx *c = (struct ctx *)arg; - int fd, rc; + int fd, vhost_fd, rc; c->fd_tap = -1; memcpy(ifr.ifr_name, c->pasta_ifn, IFNAMSIZ); @@ -1383,6 +1510,175 @@ static int tap_ns_tun(void *arg) if (fd < 0) die_perror("Failed to open() /dev/net/tun"); + vhost_fd = open("/dev/vhost-net", flags); + if (vhost_fd < 0) + die_perror("Failed to open() /dev/vhost-net"); + + rc = ioctl(vhost_fd, VHOST_SET_OWNER, NULL); + if (rc < 0) + die_perror("VHOST_SET_OWNER ioctl on /dev/vhost-net failed"); + + rc = ioctl(vhost_fd, VHOST_GET_FEATURES, &c->virtio_features); + if (rc < 0) + die_perror("VHOST_GET_FEATURES ioctl on /dev/vhost-net failed"); + + /* TODO inform more explicitely */ + fprintf(stderr, "vhost features: %lx\n", c->virtio_features); + fprintf(stderr, "req features: %lx\n", features); + c->virtio_features &= features; + if (c->virtio_features != features) + die("vhost does not support required features"); + + for (int i = 0; i < ARRAY_SIZE(c->vq); i++) { + struct vhost_vring_file file = { + .index = i, + }; + union epoll_ref ref = { .type = EPOLL_TYPE_VHOST_CALL, + .queue = i }; + struct epoll_event ev; + + file.fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC); + ref.fd = 
file.fd; + if (file.fd < 0) + die_perror("Failed to create call eventfd"); + + rc = ioctl(vhost_fd, VHOST_SET_VRING_CALL, &file); + if (rc < 0) + die_perror( + "VHOST_SET_VRING_CALL ioctl on /dev/vhost-net failed"); + + ev = (struct epoll_event){ .data.u64 = ref.u64, .events = EPOLLIN }; + rc = epoll_ctl(c->epollfd, EPOLL_CTL_ADD, ref.fd, &ev); + if (rc < 0) + die_perror("Failed to add call eventfd to epoll"); + c->vq[i].call_fd = file.fd; + } + + /* 1:1 translation */ + vhost_memory.mem.regions[0] = (struct vhost_memory_region){ + .guest_phys_addr = (uintptr_t)&vring_desc, + /* memory size should include the last byte, but we probably never send + * ptrs there so... + */ + .memory_size = sizeof(vring_desc), + .userspace_addr = (uintptr_t)&vring_desc, + }; + vhost_memory.mem.regions[1] = (struct vhost_memory_region){ + .guest_phys_addr = (uintptr_t)&vring_avail_0, + /* memory size should include the last byte, but we probably never send + * ptrs there so... + */ + .memory_size = sizeof(vring_avail_0), + .userspace_addr = (uintptr_t)&vring_avail_0, + }; + vhost_memory.mem.regions[2] = (struct vhost_memory_region){ + .guest_phys_addr = (uintptr_t)&vring_avail_1, + /* memory size should include the last byte, but we probably never send + * ptrs there so... + */ + .memory_size = sizeof(vring_avail_1), + .userspace_addr = (uintptr_t)&vring_avail_1, + }; + vhost_memory.mem.regions[3] = (struct vhost_memory_region){ + .guest_phys_addr = (uintptr_t)&vring_used_0, + /* memory size should include the last byte, but we probably never send + * ptrs there so... + */ + .memory_size = sizeof(vring_avail_0), + .userspace_addr = (uintptr_t)&vring_used_0, + }; + vhost_memory.mem.regions[4] = (struct vhost_memory_region){ + .guest_phys_addr = (uintptr_t)&vring_used_1, + /* memory size should include the last byte, but we probably never send + * ptrs there so... 
+ */ + .memory_size = sizeof(vring_avail_1), + .userspace_addr = (uintptr_t)&vring_used_1, + }; + vhost_memory.mem.regions[5] = (struct vhost_memory_region){ + .guest_phys_addr = (uintptr_t)virtio_rx_pkt_buf, + /* memory size should include the last byte, but we probably never send + * ptrs there so... + */ + .memory_size = sizeof(virtio_rx_pkt_buf), + .userspace_addr = (uintptr_t)virtio_rx_pkt_buf, + }; + vhost_memory.mem.regions[6] = (struct vhost_memory_region){ + .guest_phys_addr = (uintptr_t)pkt_buf, + /* memory size should include the last byte, but we probably never send + * ptrs there so... + */ + .memory_size = sizeof(pkt_buf), + .userspace_addr = (uintptr_t)pkt_buf, + }; + static_assert(6 < N_VHOST_REGIONS); + + rc = ioctl(vhost_fd, VHOST_SET_MEM_TABLE, &vhost_memory.mem); + if (rc < 0) + die_perror( + "VHOST_SET_MEM_TABLE ioctl on /dev/vhost-net failed"); + + /* TODO: probably it increases RX perf */ +#if 0 + struct ifreq ifr; + memset(&ifr, 0, sizeof(ifr)); + + if (ioctl(fd, TUNGETIFF, &ifr) != 0) + die_perror("Unable to query TUNGETIFF on FD %d", fd); + } + + if (ifr.ifr_flags & IFF_VNET_HDR) + net->dev.features &= ~(1ULL << VIRTIO_NET_F_MRG_RXBUF); +#endif + rc = ioctl(vhost_fd, VHOST_SET_FEATURES, &c->virtio_features); + if (rc < 0) + die_perror("VHOST_SET_FEATURES ioctl on /dev/vhost-net failed"); + + /* Duplicating foreach queue to follow the exact order from QEMU */ + for (int i = 0; i < ARRAY_SIZE(c->vq); i++) { + struct vhost_vring_addr addr = { + .index = i, + .desc_user_addr = (unsigned long)vring_desc[i], + .avail_user_addr = i == 0 ? (unsigned long)&vring_avail_0 : + (unsigned long)&vring_avail_1, + .used_user_addr = i == 0 ? (unsigned long)&vring_used_0 : + (unsigned long)&vring_used_1, + /* GPA addr */ + .log_guest_addr = i == 0 ? 
(unsigned long)&vring_used_0 : + (unsigned long)&vring_used_1, + }; + struct vhost_vring_state state = { + .index = i, + .num = VHOST_NDESCS, + }; + struct vhost_vring_file file = { + .index = i, + }; + + rc = ioctl(vhost_fd, VHOST_SET_VRING_NUM, &state); + if (rc < 0) + die_perror( + "VHOST_SET_VRING_NUM ioctl on /dev/vhost-net failed"); + + fprintf(stderr, "qid: %d\n", i); + fprintf(stderr, "vhost desc addr: 0x%llx\n", addr.desc_user_addr); + fprintf(stderr, "vhost avail addr: 0x%llx\n", addr.avail_user_addr); + fprintf(stderr, "vhost used addr: 0x%llx\n", addr.used_user_addr); + rc = ioctl(vhost_fd, VHOST_SET_VRING_ADDR, &addr); + if (rc < 0) + die_perror( + "VHOST_SET_VRING_ADDR ioctl on /dev/vhost-net failed"); + + file.fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC); + if (file.fd < 0) + die_perror("Failed to create kick eventfd"); + rc = ioctl(vhost_fd, VHOST_SET_VRING_KICK, &file); + if (rc < 0) + die_perror( + "VHOST_SET_VRING_KICK ioctl on /dev/vhost-net failed"); + c->vq[i].kick_fd = file.fd; + } + rc = ioctl(fd, (int)TUNSETIFF, &ifr); if (rc < 0) die_perror("TUNSETIFF ioctl on /dev/net/tun failed"); @@ -1390,7 +1686,25 @@ static int tap_ns_tun(void *arg) if (!(c->pasta_ifi = if_nametoindex(c->pasta_ifn))) die("Tap device opened but no network interface found"); + rx_pkt_refill(c->vq[0].kick_fd); + + /* Duplicating foreach queue to follow the exact order from QEMU */ + for (int i = 0; i < ARRAY_SIZE(c->vq); i++) { + struct vhost_vring_file file = { + .index = i, + .fd = fd, + }; + + fprintf(stderr, "qid: %d\n", file.index); + fprintf(stderr, "tap fd: %d\n", file.fd); + rc = ioctl(vhost_fd, VHOST_NET_SET_BACKEND, &file); + if (rc < 0) + die_perror( + "VHOST_NET_SET_BACKEND ioctl on /dev/vhost-net failed"); + } + c->fd_tap = fd; + c->fd_vhost = vhost_fd; return 0; } diff --git a/tap.h b/tap.h index 0b5ad17..91d3f62 100644 --- a/tap.h +++ b/tap.h @@ -69,6 +69,8 @@ static inline void tap_hdr_update(struct tap_hdr *thdr, size_t l2len) thdr->vnet_len = 
htonl(l2len); } +void vhost_call_cb(struct ctx *c, union epoll_ref ref, const struct timespec *now); + unsigned long tap_l2_max_len(const struct ctx *c); void *tap_push_l2h(const struct ctx *c, void *buf, uint16_t proto); void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src, -- 2.49.0
A couple of general notes: - there's no need to Cc: people already on this list unless you need their attention specifically (it can get quite noisy...) - things kind of make sense to me, many are hard to evaluate at this early stage, so I noted below just some specific comments/questions here, but in the sense of "being on the same page" my current answer is... yes, I guess so! - I'm not reviewing 1/3 and 3/3 right away as I guess you'll revisit them anyway, I'm just not sure we need a separate pool... but I'm not sure if that's temporary either. On Tue, 1 Apr 2025 13:38:08 +0200 Eugenio Pérez <eperezma(a)redhat.com> wrote:This is the main callback when the tap device has processed any buffer. Another possibility is to reuse the tap callback for this, so less code changes are needed. Signed-off-by: Eugenio Pérez <eperezma(a)redhat.com> --- epoll_type.h | 2 + passt.c | 4 + passt.h | 12 +- tap.c | 316 ++++++++++++++++++++++++++++++++++++++++++++++++++- tap.h | 2 + 5 files changed, 334 insertions(+), 2 deletions(-) diff --git a/epoll_type.h b/epoll_type.h index 7f2a121..6284c79 100644 --- a/epoll_type.h +++ b/epoll_type.h @@ -44,6 +44,8 @@ enum epoll_type { EPOLL_TYPE_REPAIR_LISTEN, /* TCP_REPAIR helper socket */ EPOLL_TYPE_REPAIR, + /* vhost-kernel call socket */ + EPOLL_TYPE_VHOST_CALL, EPOLL_NUM_TYPES, }; diff --git a/passt.c b/passt.c index cd06772..19c5d5f 100644 --- a/passt.c +++ b/passt.c @@ -79,6 +79,7 @@ char *epoll_type_str[] = { [EPOLL_TYPE_VHOST_KICK] = "vhost-user kick socket", [EPOLL_TYPE_REPAIR_LISTEN] = "TCP_REPAIR helper listening socket", [EPOLL_TYPE_REPAIR] = "TCP_REPAIR helper socket", + [EPOLL_TYPE_VHOST_CALL] = "vhost-kernel call socket", }; static_assert(ARRAY_SIZE(epoll_type_str) == EPOLL_NUM_TYPES, "epoll_type_str[] doesn't match enum epoll_type"); @@ -357,6 +358,9 @@ loop: case EPOLL_TYPE_REPAIR: repair_handler(&c, eventmask); break; + case EPOLL_TYPE_VHOST_CALL: + vhost_call_cb(&c, ref, &now); + break; default: /* Can't happen */ ASSERT(0); 
diff --git a/passt.h b/passt.h index 8f45091..eb5aa03 100644 --- a/passt.h +++ b/passt.h @@ -45,7 +45,7 @@ union epoll_ref; * @icmp: ICMP-specific reference part * @data: Data handled by protocol handlers * @nsdir_fd: netns dirfd for fallback timer checking if namespace is gone - * @queue: vhost-user queue index for this fd + * @queue: vhost queue index for this fd * @u64: Opaque reference for epoll_ctl() and epoll_wait() */ union epoll_ref { @@ -271,11 +271,14 @@ struct ctx { int fd_tap; int fd_repair_listen; int fd_repair; + /* TODO document all added fields */ + int fd_vhost; unsigned char our_tap_mac[ETH_ALEN]; unsigned char guest_mac[ETH_ALEN]; uint16_t mtu; uint64_t hash_secret[2]; + uint64_t virtio_features; int ifi4; struct ip4_ctx ip4; @@ -299,6 +302,13 @@ struct ctx { int no_icmp; struct icmp_ctx icmp; + struct { + uint16_t last_used_idx; + + int kick_fd; + int call_fd; + } vq[2]; + int no_dns; int no_dns_search; int no_dhcp_dns; diff --git a/tap.c b/tap.c index ce859ba..fbe83aa 100644 --- a/tap.c +++ b/tap.c @@ -31,6 +31,7 @@ #include <sys/types.h> #include <sys/stat.h> #include <fcntl.h> +#include <sys/eventfd.h>Why do we need eventfds here? 
Is there anything peculiar in the protocol, or it's all stuff that can be done with "regular" file descriptors plus epoll?#include <sys/uio.h> #include <stdbool.h> #include <stdlib.h> @@ -82,6 +83,46 @@ static PACKET_POOL_NOINIT(pool_tap6, TAP_MSGS, pkt_buf); #define TAP_SEQS 128 /* Different L4 tuples in one batch */ #define FRAGMENT_MSG_RATE 10 /* # seconds between fragment warnings */ +#define VHOST_VIRTIO 0xAF +#define VHOST_GET_FEATURES _IOR(VHOST_VIRTIO, 0x00, __u64) +#define VHOST_SET_FEATURES _IOW(VHOST_VIRTIO, 0x00, __u64) +#define VHOST_SET_OWNER _IO(VHOST_VIRTIO, 0x01) +#define VHOST_SET_MEM_TABLE _IOW(VHOST_VIRTIO, 0x03, struct vhost_memory) +#define VHOST_SET_VRING_NUM _IOW(VHOST_VIRTIO, 0x10, struct vhost_vring_state) +#define VHOST_SET_VRING_ADDR _IOW(VHOST_VIRTIO, 0x11, struct vhost_vring_addr) +#define VHOST_SET_VRING_KICK _IOW(VHOST_VIRTIO, 0x20, struct vhost_vring_file) +#define VHOST_SET_VRING_CALL _IOW(VHOST_VIRTIO, 0x21, struct vhost_vring_file) +#define VHOST_SET_VRING_ERR _IOW(VHOST_VIRTIO, 0x22, struct vhost_vring_file) +#define VHOST_SET_BACKEND_FEATURES _IOW(VHOST_VIRTIO, 0x25, __u64) +#define VHOST_NET_SET_BACKEND _IOW(VHOST_VIRTIO, 0x30, struct vhost_vring_file)Coding style (no strict requirement): align those nicely/table-like if possible.+ +char virtio_rx_pkt_buf[PKT_BUF_BYTES] __attribute__((aligned(PAGE_SIZE))); +static PACKET_POOL_NOINIT(pool_virtiorx4, TAP_MSGS, virtio_rx_pkt_buf); +static PACKET_POOL_NOINIT(pool_virtiorx6, TAP_MSGS, virtio_rx_pkt_buf); + +/* TODO is it better to have 1024 or 65520 bytes per packet? 
*/In general 65535 bytes (including the Ethernet header) appears to be a good idea, but actual profiling would be nice in the long term.+#define VHOST_NDESCS 1024 +static struct vring_desc vring_desc[2][VHOST_NDESCS] __attribute__((aligned(PAGE_SIZE)));Coding style, here and a bit all over the place: wrap to 80 columns ("net" kernel-like style).+static union { + struct vring_avail avail; + char buf[offsetof(struct vring_avail, ring[VHOST_NDESCS])]; +} vring_avail_0 __attribute__((aligned(PAGE_SIZE))), vring_avail_1 __attribute__((aligned(PAGE_SIZE))); +static union { + struct vring_used used; + char buf[offsetof(struct vring_used, ring[VHOST_NDESCS])]; +} vring_used_0 __attribute__((aligned(PAGE_SIZE))), vring_used_1 __attribute__((aligned(PAGE_SIZE))); + +/* all descs ring + 2rings * 2vqs + tx pkt buf + rx pkt buf */What's the "all descs ring"? A short "theory of operation" section might help eventually.+#define N_VHOST_REGIONS 7 +union { + struct vhost_memory mem; + char buf[offsetof(struct vhost_memory, regions[N_VHOST_REGIONS])]; +} vhost_memory = { + .mem = { + .nregions = N_VHOST_REGIONS, + }, +}; + /** * tap_l2_max_len() - Maximum frame size (including L2 header) for current mode * @c: Execution context @@ -1360,6 +1401,89 @@ void tap_listen_handler(struct ctx *c, uint32_t events) tap_start_connection(c); } +static void *virtqueue_get_buf(struct ctx *c, unsigned qid, unsigned *len) +{ + struct vring_used *used = !qid ? 
&vring_used_0.used : &vring_used_1.used; + uint32_t i; + uint16_t used_idx, last_used; + + /* TODO think if this has races with previous eventfd_read */ + /* TODO we could improve performance with a shadow_used_idx */ + used_idx = le16toh(used->idx); + + smp_rmb(); + + if (used_idx == c->vq[qid].last_used_idx) { + *len = 0; + return NULL; + } + + last_used = c->vq[qid].last_used_idx & VHOST_NDESCS; + i = le32toh(used->ring[last_used].id); + *len = le32toh(used->ring[last_used].len); + + if (i > VHOST_NDESCS) { + /* TODO imporove this, it will cause infinite loop */ + warn("vhost: id %u at used position %u out of range (max=%u)", i, last_used, VHOST_NDESCS); + return NULL; + } + + if (*len > PKT_BUF_BYTES/VHOST_NDESCS) { + warn("vhost: id %d len %u > %zu", i, *len, PKT_BUF_BYTES/VHOST_NDESCS); + return NULL; + } + + /* TODO check if the id is valid and it has not been double used */ + c->vq[qid].last_used_idx++; + return virtio_rx_pkt_buf + i * (PKT_BUF_BYTES/VHOST_NDESCS); +} + +/* container is tx but we receive it from vhost POV */ +void vhost_call_cb(struct ctx *c, union epoll_ref ref, const struct timespec *now) +{ + eventfd_read(ref.fd, (eventfd_t[]){ 0 }); + + tap_flush_pools(); + + while (true) { + struct virtio_net_hdr_mrg_rxbuf *hdr; + unsigned len; + + hdr = virtqueue_get_buf(c, ref.queue, &len); + if (!hdr) + break; + + if (len < sizeof(*hdr)) { + warn("vhost: invalid len %u", len); + continue; + } + + /* TODO this will break from this moment */ + if (hdr->num_buffers != 1) { + warn("vhost: Too many buffers %u, %zu bytes should be enough for everybody!", hdr->num_buffers, PKT_BUF_BYTES/VHOST_NDESCS); + continue; + } + + /* TODO fix the v6 pool to support ipv6 */ + tap_add_packet(c, len - sizeof(*hdr), (void *)(hdr+1), pool_virtiorx4, pool_virtiorx6); + } + + tap_handler(c, now, pool_virtiorx4, pool_virtiorx6); +} + +/* TODO: Actually refill */ +static void rx_pkt_refill(int kick_fd) +{ + for (unsigned i = 0; i < VHOST_NDESCS; ++i) { + 
vring_desc[0][i].addr = (uintptr_t)virtio_rx_pkt_buf + i * (PKT_BUF_BYTES/VHOST_NDESCS); + vring_desc[0][i].len = PKT_BUF_BYTES/VHOST_NDESCS; + vring_desc[0][i].flags = VRING_DESC_F_WRITE; + } + + vring_avail_0.avail.idx = VHOST_NDESCS; + eventfd_write(kick_fd, 1); +} + /** * tap_ns_tun() - Get tuntap fd in namespace * @c: Execution context @@ -1370,10 +1494,13 @@ void tap_listen_handler(struct ctx *c, uint32_t events) */ static int tap_ns_tun(void *arg) { + /* TODO we need to check if vhost support VIRTIO_NET_F_MRG_RXBUF and VHOST_NET_F_VIRTIO_NET_HDR actually */ + static const uint64_t features = + (1ULL << VIRTIO_F_VERSION_1) | (1ULL << VIRTIO_NET_F_MRG_RXBUF) | (1ULL << VHOST_NET_F_VIRTIO_NET_HDR); struct ifreq ifr = { .ifr_flags = IFF_TAP | IFF_NO_PI };I kind of wonder, by the way, if IFF_TUN simplifies things here. It's something we should already add, as an option, see also: https://bugs.passt.top/show_bug.cgi?id=49, but if it makes your life easier for any reason you might consider adding it right away.int flags = O_RDWR | O_NONBLOCK | O_CLOEXEC; struct ctx *c = (struct ctx *)arg; - int fd, rc; + int fd, vhost_fd, rc; c->fd_tap = -1; memcpy(ifr.ifr_name, c->pasta_ifn, IFNAMSIZ); @@ -1383,6 +1510,175 @@ static int tap_ns_tun(void *arg) if (fd < 0) die_perror("Failed to open() /dev/net/tun"); + vhost_fd = open("/dev/vhost-net", flags); + if (vhost_fd < 0) + die_perror("Failed to open() /dev/vhost-net");Note pretty much to future self: this will need adjustments to AppArmor and SELinux policies.+ + rc = ioctl(vhost_fd, VHOST_SET_OWNER, NULL); + if (rc < 0) + die_perror("VHOST_SET_OWNER ioctl on /dev/vhost-net failed"); + + rc = ioctl(vhost_fd, VHOST_GET_FEATURES, &c->virtio_features); + if (rc < 0) + die_perror("VHOST_GET_FEATURES ioctl on /dev/vhost-net failed"); + + /* TODO inform more explicitely */ + fprintf(stderr, "vhost features: %lx\n", c->virtio_features); + fprintf(stderr, "req features: %lx\n", features); + c->virtio_features &= features; + if 
(c->virtio_features != features) + die("vhost does not support required features"); + + for (int i = 0; i < ARRAY_SIZE(c->vq); i++) {No declarations directly in loops (it hides them somehow).+ struct vhost_vring_file file = { + .index = i, + }; + union epoll_ref ref = { .type = EPOLL_TYPE_VHOST_CALL, + .queue = i }; + struct epoll_event ev; + + file.fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC); + ref.fd = file.fd; + if (file.fd < 0) + die_perror("Failed to create call eventfd"); + + rc = ioctl(vhost_fd, VHOST_SET_VRING_CALL, &file); + if (rc < 0) + die_perror( + "VHOST_SET_VRING_CALL ioctl on /dev/vhost-net failed");Same as net kernel style: if it's more than a line, even as a single statement, use curly brackets (rationale: somebody later adds another statement without noticing and... oops).+ + ev = (struct epoll_event){ .data.u64 = ref.u64, .events = EPOLLIN }; + rc = epoll_ctl(c->epollfd, EPOLL_CTL_ADD, ref.fd, &ev); + if (rc < 0) + die_perror("Failed to add call eventfd to epoll"); + c->vq[i].call_fd = file.fd; + } + + /* 1:1 translation */ + vhost_memory.mem.regions[0] = (struct vhost_memory_region){Space between cast and initialiser, ") {", for consistency. I'll wait before we have some kind of theory of operation / general description before actually looking into those, I'm not sure about the exact role of those seven regions.+ .guest_phys_addr = (uintptr_t)&vring_desc, + /* memory size should include the last byte, but we probably never send + * ptrs there so... + */ + .memory_size = sizeof(vring_desc), + .userspace_addr = (uintptr_t)&vring_desc, + }; + vhost_memory.mem.regions[1] = (struct vhost_memory_region){ + .guest_phys_addr = (uintptr_t)&vring_avail_0, + /* memory size should include the last byte, but we probably never send + * ptrs there so... 
+ */ + .memory_size = sizeof(vring_avail_0), + .userspace_addr = (uintptr_t)&vring_avail_0, + }; + vhost_memory.mem.regions[2] = (struct vhost_memory_region){ + .guest_phys_addr = (uintptr_t)&vring_avail_1, + /* memory size should include the last byte, but we probably never send + * ptrs there so... + */ + .memory_size = sizeof(vring_avail_1), + .userspace_addr = (uintptr_t)&vring_avail_1, + }; + vhost_memory.mem.regions[3] = (struct vhost_memory_region){ + .guest_phys_addr = (uintptr_t)&vring_used_0, + /* memory size should include the last byte, but we probably never send + * ptrs there so... + */ + .memory_size = sizeof(vring_avail_0), + .userspace_addr = (uintptr_t)&vring_used_0, + }; + vhost_memory.mem.regions[4] = (struct vhost_memory_region){ + .guest_phys_addr = (uintptr_t)&vring_used_1, + /* memory size should include the last byte, but we probably never send + * ptrs there so... + */ + .memory_size = sizeof(vring_avail_1), + .userspace_addr = (uintptr_t)&vring_used_1, + }; + vhost_memory.mem.regions[5] = (struct vhost_memory_region){ + .guest_phys_addr = (uintptr_t)virtio_rx_pkt_buf, + /* memory size should include the last byte, but we probably never send + * ptrs there so... + */ + .memory_size = sizeof(virtio_rx_pkt_buf), + .userspace_addr = (uintptr_t)virtio_rx_pkt_buf, + }; + vhost_memory.mem.regions[6] = (struct vhost_memory_region){ + .guest_phys_addr = (uintptr_t)pkt_buf, + /* memory size should include the last byte, but we probably never send + * ptrs there so... 
+ */ + .memory_size = sizeof(pkt_buf), + .userspace_addr = (uintptr_t)pkt_buf, + }; + static_assert(6 < N_VHOST_REGIONS); + + rc = ioctl(vhost_fd, VHOST_SET_MEM_TABLE, &vhost_memory.mem); + if (rc < 0) + die_perror( + "VHOST_SET_MEM_TABLE ioctl on /dev/vhost-net failed"); + + /* TODO: probably it increases RX perf */ +#if 0 + struct ifreq ifr; + memset(&ifr, 0, sizeof(ifr)); + + if (ioctl(fd, TUNGETIFF, &ifr) != 0) + die_perror("Unable to query TUNGETIFF on FD %d", fd); + } + + if (ifr.ifr_flags & IFF_VNET_HDR) + net->dev.features &= ~(1ULL << VIRTIO_NET_F_MRG_RXBUF); +#endif + rc = ioctl(vhost_fd, VHOST_SET_FEATURES, &c->virtio_features); + if (rc < 0) + die_perror("VHOST_SET_FEATURES ioctl on /dev/vhost-net failed"); + + /* Duplicating foreach queue to follow the exact order from QEMU */ + for (int i = 0; i < ARRAY_SIZE(c->vq); i++) { + struct vhost_vring_addr addr = { + .index = i, + .desc_user_addr = (unsigned long)vring_desc[i], + .avail_user_addr = i == 0 ? (unsigned long)&vring_avail_0 : + (unsigned long)&vring_avail_1, + .used_user_addr = i == 0 ? (unsigned long)&vring_used_0 : + (unsigned long)&vring_used_1, + /* GPA addr */ + .log_guest_addr = i == 0 ? 
(unsigned long)&vring_used_0 : + (unsigned long)&vring_used_1, + }; + struct vhost_vring_state state = { + .index = i, + .num = VHOST_NDESCS, + }; + struct vhost_vring_file file = { + .index = i, + }; + + rc = ioctl(vhost_fd, VHOST_SET_VRING_NUM, &state); + if (rc < 0) + die_perror( + "VHOST_SET_VRING_NUM ioctl on /dev/vhost-net failed"); + + fprintf(stderr, "qid: %d\n", i); + fprintf(stderr, "vhost desc addr: 0x%llx\n", addr.desc_user_addr); + fprintf(stderr, "vhost avail addr: 0x%llx\n", addr.avail_user_addr); + fprintf(stderr, "vhost used addr: 0x%llx\n", addr.used_user_addr); + rc = ioctl(vhost_fd, VHOST_SET_VRING_ADDR, &addr); + if (rc < 0) + die_perror( + "VHOST_SET_VRING_ADDR ioctl on /dev/vhost-net failed"); + + file.fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC); + if (file.fd < 0) + die_perror("Failed to create kick eventfd"); + rc = ioctl(vhost_fd, VHOST_SET_VRING_KICK, &file); + if (rc < 0) + die_perror( + "VHOST_SET_VRING_KICK ioctl on /dev/vhost-net failed"); + c->vq[i].kick_fd = file.fd; + } + rc = ioctl(fd, (int)TUNSETIFF, &ifr); if (rc < 0) die_perror("TUNSETIFF ioctl on /dev/net/tun failed"); @@ -1390,7 +1686,25 @@ static int tap_ns_tun(void *arg) if (!(c->pasta_ifi = if_nametoindex(c->pasta_ifn))) die("Tap device opened but no network interface found"); + rx_pkt_refill(c->vq[0].kick_fd); + + /* Duplicating foreach queue to follow the exact order from QEMU */ + for (int i = 0; i < ARRAY_SIZE(c->vq); i++) { + struct vhost_vring_file file = { + .index = i, + .fd = fd, + }; + + fprintf(stderr, "qid: %d\n", file.index); + fprintf(stderr, "tap fd: %d\n", file.fd); + rc = ioctl(vhost_fd, VHOST_NET_SET_BACKEND, &file); + if (rc < 0) + die_perror( + "VHOST_NET_SET_BACKEND ioctl on /dev/vhost-net failed"); + } + c->fd_tap = fd; + c->fd_vhost = vhost_fd; return 0; } diff --git a/tap.h b/tap.h index 0b5ad17..91d3f62 100644 --- a/tap.h +++ b/tap.h @@ -69,6 +69,8 @@ static inline void tap_hdr_update(struct tap_hdr *thdr, size_t l2len) thdr->vnet_len = 
htonl(l2len); } +void vhost_call_cb(struct ctx *c, union epoll_ref ref, const struct timespec *now); + unsigned long tap_l2_max_len(const struct ctx *c); void *tap_push_l2h(const struct ctx *c, void *buf, uint16_t proto); void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src,

-- 
Stefano
On Wed, Apr 2, 2025 at 9:16 AM Stefano Brivio <sbrivio(a)redhat.com> wrote:A couple of general notes: - there's no need to Cc: people already on this list unless you need their attention specifically (it can get quite noisy...)Got it, I'll do for the next time.- things kind of make sense to me, many are hard to evaluate at this early stage, so I noted below just some specific comments/questions here, but in the sense of "being on the same page" my current answer is... yes, I guess so! - I'm not reviewing 1/3 and 3/3 right away as I guess you'll revisit them anyway, I'm just not sure we need a separate pool... but I'm not sure if that's temporary either. On Tue, 1 Apr 2025 13:38:08 +0200 Eugenio Pérez <eperezma(a)redhat.com> wrote:Yes, vhost-net trusts eventfd for signalling.This is the main callback when the tap device has processed any buffer. Another possibility is to reuse the tap callback for this, so less code changes are needed. Signed-off-by: Eugenio Pérez <eperezma(a)redhat.com> --- epoll_type.h | 2 + passt.c | 4 + passt.h | 12 +- tap.c | 316 ++++++++++++++++++++++++++++++++++++++++++++++++++- tap.h | 2 + 5 files changed, 334 insertions(+), 2 deletions(-) diff --git a/epoll_type.h b/epoll_type.h index 7f2a121..6284c79 100644 --- a/epoll_type.h +++ b/epoll_type.h @@ -44,6 +44,8 @@ enum epoll_type { EPOLL_TYPE_REPAIR_LISTEN, /* TCP_REPAIR helper socket */ EPOLL_TYPE_REPAIR, + /* vhost-kernel call socket */ + EPOLL_TYPE_VHOST_CALL, EPOLL_NUM_TYPES, }; diff --git a/passt.c b/passt.c index cd06772..19c5d5f 100644 --- a/passt.c +++ b/passt.c @@ -79,6 +79,7 @@ char *epoll_type_str[] = { [EPOLL_TYPE_VHOST_KICK] = "vhost-user kick socket", [EPOLL_TYPE_REPAIR_LISTEN] = "TCP_REPAIR helper listening socket", [EPOLL_TYPE_REPAIR] = "TCP_REPAIR helper socket", + [EPOLL_TYPE_VHOST_CALL] = "vhost-kernel call socket", }; static_assert(ARRAY_SIZE(epoll_type_str) == EPOLL_NUM_TYPES, "epoll_type_str[] doesn't match enum epoll_type"); @@ -357,6 +358,9 @@ loop: case 
EPOLL_TYPE_REPAIR: repair_handler(&c, eventmask); break; + case EPOLL_TYPE_VHOST_CALL: + vhost_call_cb(&c, ref, &now); + break; default: /* Can't happen */ ASSERT(0); diff --git a/passt.h b/passt.h index 8f45091..eb5aa03 100644 --- a/passt.h +++ b/passt.h @@ -45,7 +45,7 @@ union epoll_ref; * @icmp: ICMP-specific reference part * @data: Data handled by protocol handlers * @nsdir_fd: netns dirfd for fallback timer checking if namespace is gone - * @queue: vhost-user queue index for this fd + * @queue: vhost queue index for this fd * @u64: Opaque reference for epoll_ctl() and epoll_wait() */ union epoll_ref { @@ -271,11 +271,14 @@ struct ctx { int fd_tap; int fd_repair_listen; int fd_repair; + /* TODO document all added fields */ + int fd_vhost; unsigned char our_tap_mac[ETH_ALEN]; unsigned char guest_mac[ETH_ALEN]; uint16_t mtu; uint64_t hash_secret[2]; + uint64_t virtio_features; int ifi4; struct ip4_ctx ip4; @@ -299,6 +302,13 @@ struct ctx { int no_icmp; struct icmp_ctx icmp; + struct { + uint16_t last_used_idx; + + int kick_fd; + int call_fd; + } vq[2]; + int no_dns; int no_dns_search; int no_dhcp_dns; diff --git a/tap.c b/tap.c index ce859ba..fbe83aa 100644 --- a/tap.c +++ b/tap.c @@ -31,6 +31,7 @@ #include <sys/types.h> #include <sys/stat.h> #include <fcntl.h> +#include <sys/eventfd.h>Why do we need eventfds here? 
Is there anything peculiar in the protocol, or it's all stuff that can be done with "regular" file descriptors plus epoll?I'll fix it (and the rest of code style comments) in the next RFC!#include <sys/uio.h> #include <stdbool.h> #include <stdlib.h> @@ -82,6 +83,46 @@ static PACKET_POOL_NOINIT(pool_tap6, TAP_MSGS, pkt_buf); #define TAP_SEQS 128 /* Different L4 tuples in one batch */ #define FRAGMENT_MSG_RATE 10 /* # seconds between fragment warnings */ +#define VHOST_VIRTIO 0xAF +#define VHOST_GET_FEATURES _IOR(VHOST_VIRTIO, 0x00, __u64) +#define VHOST_SET_FEATURES _IOW(VHOST_VIRTIO, 0x00, __u64) +#define VHOST_SET_OWNER _IO(VHOST_VIRTIO, 0x01) +#define VHOST_SET_MEM_TABLE _IOW(VHOST_VIRTIO, 0x03, struct vhost_memory) +#define VHOST_SET_VRING_NUM _IOW(VHOST_VIRTIO, 0x10, struct vhost_vring_state) +#define VHOST_SET_VRING_ADDR _IOW(VHOST_VIRTIO, 0x11, struct vhost_vring_addr) +#define VHOST_SET_VRING_KICK _IOW(VHOST_VIRTIO, 0x20, struct vhost_vring_file) +#define VHOST_SET_VRING_CALL _IOW(VHOST_VIRTIO, 0x21, struct vhost_vring_file) +#define VHOST_SET_VRING_ERR _IOW(VHOST_VIRTIO, 0x22, struct vhost_vring_file) +#define VHOST_SET_BACKEND_FEATURES _IOW(VHOST_VIRTIO, 0x25, __u64) +#define VHOST_NET_SET_BACKEND _IOW(VHOST_VIRTIO, 0x30, struct vhost_vring_file)Coding style (no strict requirement): align those nicely/table-like if possible.That can be done for sure.+ +char virtio_rx_pkt_buf[PKT_BUF_BYTES] __attribute__((aligned(PAGE_SIZE))); +static PACKET_POOL_NOINIT(pool_virtiorx4, TAP_MSGS, virtio_rx_pkt_buf); +static PACKET_POOL_NOINIT(pool_virtiorx6, TAP_MSGS, virtio_rx_pkt_buf); + +/* TODO is it better to have 1024 or 65520 bytes per packet? */In general 65535 bytes (including the Ethernet header) appears to be a good idea, but actual profiling would be nice in the long term.The vhost kernel module expects these descriptors ring for the buffer addresses & length. 
I can add more docs for sure.+#define VHOST_NDESCS 1024 +static struct vring_desc vring_desc[2][VHOST_NDESCS] __attribute__((aligned(PAGE_SIZE)));Coding style, here and a bit all over the place: wrap to 80 columns ("net" kernel-like style).+static union { + struct vring_avail avail; + char buf[offsetof(struct vring_avail, ring[VHOST_NDESCS])]; +} vring_avail_0 __attribute__((aligned(PAGE_SIZE))), vring_avail_1 __attribute__((aligned(PAGE_SIZE))); +static union { + struct vring_used used; + char buf[offsetof(struct vring_used, ring[VHOST_NDESCS])]; +} vring_used_0 __attribute__((aligned(PAGE_SIZE))), vring_used_1 __attribute__((aligned(PAGE_SIZE))); + +/* all descs ring + 2rings * 2vqs + tx pkt buf + rx pkt buf */What's the "all descs ring"? A short "theory of operation" section might help eventually.I'm not sure, but maybe we can reuse something, yes!+#define N_VHOST_REGIONS 7 +union { + struct vhost_memory mem; + char buf[offsetof(struct vhost_memory, regions[N_VHOST_REGIONS])]; +} vhost_memory = { + .mem = { + .nregions = N_VHOST_REGIONS, + }, +}; + /** * tap_l2_max_len() - Maximum frame size (including L2 header) for current mode * @c: Execution context @@ -1360,6 +1401,89 @@ void tap_listen_handler(struct ctx *c, uint32_t events) tap_start_connection(c); } +static void *virtqueue_get_buf(struct ctx *c, unsigned qid, unsigned *len) +{ + struct vring_used *used = !qid ? 
&vring_used_0.used : &vring_used_1.used; + uint32_t i; + uint16_t used_idx, last_used; + + /* TODO think if this has races with previous eventfd_read */ + /* TODO we could improve performance with a shadow_used_idx */ + used_idx = le16toh(used->idx); + + smp_rmb(); + + if (used_idx == c->vq[qid].last_used_idx) { + *len = 0; + return NULL; + } + + last_used = c->vq[qid].last_used_idx & VHOST_NDESCS; + i = le32toh(used->ring[last_used].id); + *len = le32toh(used->ring[last_used].len); + + if (i > VHOST_NDESCS) { + /* TODO imporove this, it will cause infinite loop */ + warn("vhost: id %u at used position %u out of range (max=%u)", i, last_used, VHOST_NDESCS); + return NULL; + } + + if (*len > PKT_BUF_BYTES/VHOST_NDESCS) { + warn("vhost: id %d len %u > %zu", i, *len, PKT_BUF_BYTES/VHOST_NDESCS); + return NULL; + } + + /* TODO check if the id is valid and it has not been double used */ + c->vq[qid].last_used_idx++; + return virtio_rx_pkt_buf + i * (PKT_BUF_BYTES/VHOST_NDESCS); +} + +/* container is tx but we receive it from vhost POV */ +void vhost_call_cb(struct ctx *c, union epoll_ref ref, const struct timespec *now) +{ + eventfd_read(ref.fd, (eventfd_t[]){ 0 }); + + tap_flush_pools(); + + while (true) { + struct virtio_net_hdr_mrg_rxbuf *hdr; + unsigned len; + + hdr = virtqueue_get_buf(c, ref.queue, &len); + if (!hdr) + break; + + if (len < sizeof(*hdr)) { + warn("vhost: invalid len %u", len); + continue; + } + + /* TODO this will break from this moment */ + if (hdr->num_buffers != 1) { + warn("vhost: Too many buffers %u, %zu bytes should be enough for everybody!", hdr->num_buffers, PKT_BUF_BYTES/VHOST_NDESCS); + continue; + } + + /* TODO fix the v6 pool to support ipv6 */ + tap_add_packet(c, len - sizeof(*hdr), (void *)(hdr+1), pool_virtiorx4, pool_virtiorx6); + } + + tap_handler(c, now, pool_virtiorx4, pool_virtiorx6); +} + +/* TODO: Actually refill */ +static void rx_pkt_refill(int kick_fd) +{ + for (unsigned i = 0; i < VHOST_NDESCS; ++i) { + 
vring_desc[0][i].addr = (uintptr_t)virtio_rx_pkt_buf + i * (PKT_BUF_BYTES/VHOST_NDESCS); + vring_desc[0][i].len = PKT_BUF_BYTES/VHOST_NDESCS; + vring_desc[0][i].flags = VRING_DESC_F_WRITE; + } + + vring_avail_0.avail.idx = VHOST_NDESCS; + eventfd_write(kick_fd, 1); +} + /** * tap_ns_tun() - Get tuntap fd in namespace * @c: Execution context @@ -1370,10 +1494,13 @@ void tap_listen_handler(struct ctx *c, uint32_t events) */ static int tap_ns_tun(void *arg) { + /* TODO we need to check if vhost support VIRTIO_NET_F_MRG_RXBUF and VHOST_NET_F_VIRTIO_NET_HDR actually */ + static const uint64_t features = + (1ULL << VIRTIO_F_VERSION_1) | (1ULL << VIRTIO_NET_F_MRG_RXBUF) | (1ULL << VHOST_NET_F_VIRTIO_NET_HDR); struct ifreq ifr = { .ifr_flags = IFF_TAP | IFF_NO_PI };I kind of wonder, by the way, if IFF_TUN simplifies things here. It's something we should already add, as an option, see also: https://bugs.passt.top/show_bug.cgi?id=49, but if it makes your life easier for any reason you might consider adding it right away.In the case of QEMU, vhost reads the descriptor ring with addresses in the guest address space, as it is a direct communication between them without QEMU intervening. 
These regions tells vhost how to translate these addresses to addresses in the QEMU address space so vhost can use copy_to_user and copy_from_user.int flags = O_RDWR | O_NONBLOCK | O_CLOEXEC; struct ctx *c = (struct ctx *)arg; - int fd, rc; + int fd, vhost_fd, rc; c->fd_tap = -1; memcpy(ifr.ifr_name, c->pasta_ifn, IFNAMSIZ); @@ -1383,6 +1510,175 @@ static int tap_ns_tun(void *arg) if (fd < 0) die_perror("Failed to open() /dev/net/tun"); + vhost_fd = open("/dev/vhost-net", flags); + if (vhost_fd < 0) + die_perror("Failed to open() /dev/vhost-net");Note pretty much to future self: this will need adjustments to AppArmor and SELinux policies.+ + rc = ioctl(vhost_fd, VHOST_SET_OWNER, NULL); + if (rc < 0) + die_perror("VHOST_SET_OWNER ioctl on /dev/vhost-net failed"); + + rc = ioctl(vhost_fd, VHOST_GET_FEATURES, &c->virtio_features); + if (rc < 0) + die_perror("VHOST_GET_FEATURES ioctl on /dev/vhost-net failed"); + + /* TODO inform more explicitely */ + fprintf(stderr, "vhost features: %lx\n", c->virtio_features); + fprintf(stderr, "req features: %lx\n", features); + c->virtio_features &= features; + if (c->virtio_features != features) + die("vhost does not support required features"); + + for (int i = 0; i < ARRAY_SIZE(c->vq); i++) {No declarations directly in loops (it hides them somehow).+ struct vhost_vring_file file = { + .index = i, + }; + union epoll_ref ref = { .type = EPOLL_TYPE_VHOST_CALL, + .queue = i }; + struct epoll_event ev; + + file.fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC); + ref.fd = file.fd; + if (file.fd < 0) + die_perror("Failed to create call eventfd"); + + rc = ioctl(vhost_fd, VHOST_SET_VRING_CALL, &file); + if (rc < 0) + die_perror( + "VHOST_SET_VRING_CALL ioctl on /dev/vhost-net failed");Same as net kernel style: if it's more than a line, even as a single statement, use curly brackets (rationale: somebody later adds another statement without noticing and... 
oops).+ + ev = (struct epoll_event){ .data.u64 = ref.u64, .events = EPOLLIN }; + rc = epoll_ctl(c->epollfd, EPOLL_CTL_ADD, ref.fd, &ev); + if (rc < 0) + die_perror("Failed to add call eventfd to epoll"); + c->vq[i].call_fd = file.fd; + } + + /* 1:1 translation */ + vhost_memory.mem.regions[0] = (struct vhost_memory_region){Space between cast and initialiser, ") {", for consistency. I'll wait before we have some kind of theory of operation / general description before actually looking into those, I'm not sure about the exact role of those seven regions.+ .guest_phys_addr = (uintptr_t)&vring_desc, + /* memory size should include the last byte, but we probably never send + * ptrs there so... + */ + .memory_size = sizeof(vring_desc), + .userspace_addr = (uintptr_t)&vring_desc, + }; + vhost_memory.mem.regions[1] = (struct vhost_memory_region){ + .guest_phys_addr = (uintptr_t)&vring_avail_0, + /* memory size should include the last byte, but we probably never send + * ptrs there so... + */ + .memory_size = sizeof(vring_avail_0), + .userspace_addr = (uintptr_t)&vring_avail_0, + }; + vhost_memory.mem.regions[2] = (struct vhost_memory_region){ + .guest_phys_addr = (uintptr_t)&vring_avail_1, + /* memory size should include the last byte, but we probably never send + * ptrs there so... + */ + .memory_size = sizeof(vring_avail_1), + .userspace_addr = (uintptr_t)&vring_avail_1, + }; + vhost_memory.mem.regions[3] = (struct vhost_memory_region){ + .guest_phys_addr = (uintptr_t)&vring_used_0, + /* memory size should include the last byte, but we probably never send + * ptrs there so... + */ + .memory_size = sizeof(vring_avail_0), + .userspace_addr = (uintptr_t)&vring_used_0, + }; + vhost_memory.mem.regions[4] = (struct vhost_memory_region){ + .guest_phys_addr = (uintptr_t)&vring_used_1, + /* memory size should include the last byte, but we probably never send + * ptrs there so... 
+ */ + .memory_size = sizeof(vring_avail_1), + .userspace_addr = (uintptr_t)&vring_used_1, + }; + vhost_memory.mem.regions[5] = (struct vhost_memory_region){ + .guest_phys_addr = (uintptr_t)virtio_rx_pkt_buf, + /* memory size should include the last byte, but we probably never send + * ptrs there so... + */ + .memory_size = sizeof(virtio_rx_pkt_buf), + .userspace_addr = (uintptr_t)virtio_rx_pkt_buf, + }; + vhost_memory.mem.regions[6] = (struct vhost_memory_region){ + .guest_phys_addr = (uintptr_t)pkt_buf, + /* memory size should include the last byte, but we probably never send + * ptrs there so... + */ + .memory_size = sizeof(pkt_buf), + .userspace_addr = (uintptr_t)pkt_buf, + }; + static_assert(6 < N_VHOST_REGIONS); + + rc = ioctl(vhost_fd, VHOST_SET_MEM_TABLE, &vhost_memory.mem); + if (rc < 0) + die_perror( + "VHOST_SET_MEM_TABLE ioctl on /dev/vhost-net failed"); + + /* TODO: probably it increases RX perf */ +#if 0 + struct ifreq ifr; + memset(&ifr, 0, sizeof(ifr)); + + if (ioctl(fd, TUNGETIFF, &ifr) != 0) + die_perror("Unable to query TUNGETIFF on FD %d", fd); + } + + if (ifr.ifr_flags & IFF_VNET_HDR) + net->dev.features &= ~(1ULL << VIRTIO_NET_F_MRG_RXBUF); +#endif + rc = ioctl(vhost_fd, VHOST_SET_FEATURES, &c->virtio_features); + if (rc < 0) + die_perror("VHOST_SET_FEATURES ioctl on /dev/vhost-net failed"); + + /* Duplicating foreach queue to follow the exact order from QEMU */ + for (int i = 0; i < ARRAY_SIZE(c->vq); i++) { + struct vhost_vring_addr addr = { + .index = i, + .desc_user_addr = (unsigned long)vring_desc[i], + .avail_user_addr = i == 0 ? (unsigned long)&vring_avail_0 : + (unsigned long)&vring_avail_1, + .used_user_addr = i == 0 ? (unsigned long)&vring_used_0 : + (unsigned long)&vring_used_1, + /* GPA addr */ + .log_guest_addr = i == 0 ? 
(unsigned long)&vring_used_0 : + (unsigned long)&vring_used_1, + }; + struct vhost_vring_state state = { + .index = i, + .num = VHOST_NDESCS, + }; + struct vhost_vring_file file = { + .index = i, + }; + + rc = ioctl(vhost_fd, VHOST_SET_VRING_NUM, &state); + if (rc < 0) + die_perror( + "VHOST_SET_VRING_NUM ioctl on /dev/vhost-net failed"); + + fprintf(stderr, "qid: %d\n", i); + fprintf(stderr, "vhost desc addr: 0x%llx\n", addr.desc_user_addr); + fprintf(stderr, "vhost avail addr: 0x%llx\n", addr.avail_user_addr); + fprintf(stderr, "vhost used addr: 0x%llx\n", addr.used_user_addr); + rc = ioctl(vhost_fd, VHOST_SET_VRING_ADDR, &addr); + if (rc < 0) + die_perror( + "VHOST_SET_VRING_ADDR ioctl on /dev/vhost-net failed"); + + file.fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC); + if (file.fd < 0) + die_perror("Failed to create kick eventfd"); + rc = ioctl(vhost_fd, VHOST_SET_VRING_KICK, &file); + if (rc < 0) + die_perror( + "VHOST_SET_VRING_KICK ioctl on /dev/vhost-net failed"); + c->vq[i].kick_fd = file.fd; + } + rc = ioctl(fd, (int)TUNSETIFF, &ifr); if (rc < 0) die_perror("TUNSETIFF ioctl on /dev/net/tun failed"); @@ -1390,7 +1686,25 @@ static int tap_ns_tun(void *arg) if (!(c->pasta_ifi = if_nametoindex(c->pasta_ifn))) die("Tap device opened but no network interface found"); + rx_pkt_refill(c->vq[0].kick_fd); + + /* Duplicating foreach queue to follow the exact order from QEMU */ + for (int i = 0; i < ARRAY_SIZE(c->vq); i++) { + struct vhost_vring_file file = { + .index = i, + .fd = fd, + }; + + fprintf(stderr, "qid: %d\n", file.index); + fprintf(stderr, "tap fd: %d\n", file.fd); + rc = ioctl(vhost_fd, VHOST_NET_SET_BACKEND, &file); + if (rc < 0) + die_perror( + "VHOST_NET_SET_BACKEND ioctl on /dev/vhost-net failed"); + } + c->fd_tap = fd; + c->fd_vhost = vhost_fd; return 0; } diff --git a/tap.h b/tap.h index 0b5ad17..91d3f62 100644 --- a/tap.h +++ b/tap.h @@ -69,6 +69,8 @@ static inline void tap_hdr_update(struct tap_hdr *thdr, size_t l2len) thdr->vnet_len = 
htonl(l2len); } +void vhost_call_cb(struct ctx *c, union epoll_ref ref, const struct timespec *now); + unsigned long tap_l2_max_len(const struct ctx *c); void *tap_push_l2h(const struct ctx *c, void *buf, uint16_t proto); void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src,
On Wed, Apr 02, 2025 at 09:16:22AM +0200, Stefano Brivio wrote:
[snip]

I agree that supporting IFF_TUN would be a good idea in general. I think trying to do it in the middle of this will be unnecessary difficulty though. There are a bunch of places throughout the code that assume we have Ethernet headers, and changing that will require a pretty wide ranging rework.

-- 
David Gibson (he or they) | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you, not the other way
| around. http://www.ozlabs.org/~dgibson

static int tap_ns_tun(void *arg) { + /* TODO we need to check if vhost support VIRTIO_NET_F_MRG_RXBUF and VHOST_NET_F_VIRTIO_NET_HDR actually */ + static const uint64_t features = + (1ULL << VIRTIO_F_VERSION_1) | (1ULL << VIRTIO_NET_F_MRG_RXBUF) | (1ULL << VHOST_NET_F_VIRTIO_NET_HDR); struct ifreq ifr = { .ifr_flags = IFF_TAP | IFF_NO_PI };

I kind of wonder, by the way, if IFF_TUN simplifies things here. It's something we should already add, as an option, see also: https://bugs.passt.top/show_bug.cgi?id=49, but if it makes your life easier for any reason you might consider adding it right away.
On Thu, 3 Apr 2025 12:48:42 +1100 David Gibson <david(a)gibson.dropbear.id.au> wrote:
On Wed, Apr 02, 2025 at 09:16:22AM +0200, Stefano Brivio wrote:
[snip]

Maybe, yes, hence "if it makes your life easier for any reason". If it does not, it's of course out of scope here...

-- 
Stefano

I agree that supporting IFF_TUN would be a good idea in general. I think trying to do it in the middle of this will be unnecessary difficulty though. There are a bunch of places throughout the code that assume we have Ethernet headers, and changing that will require a pretty wide ranging rework.

static int tap_ns_tun(void *arg) { + /* TODO we need to check if vhost support VIRTIO_NET_F_MRG_RXBUF and VHOST_NET_F_VIRTIO_NET_HDR actually */ + static const uint64_t features = + (1ULL << VIRTIO_F_VERSION_1) | (1ULL << VIRTIO_NET_F_MRG_RXBUF) | (1ULL << VHOST_NET_F_VIRTIO_NET_HDR); struct ifreq ifr = { .ifr_flags = IFF_TAP | IFF_NO_PI };

I kind of wonder, by the way, if IFF_TUN simplifies things here. It's something we should already add, as an option, see also: https://bugs.passt.top/show_bug.cgi?id=49, but if it makes your life easier for any reason you might consider adding it right away.
On Tue, Apr 01, 2025 at 01:38:08PM +0200, Eugenio Pérez wrote:This is the main callback when the tap device has processed any buffer. Another possibility is to reuse the tap callback for this, so less code changes are needed.I think using a new epoll_type and a new callback is the right approach here. We mostly try to avoid having multi-layered multiplexing of things.Signed-off-by: Eugenio Pérez <eperezma(a)redhat.com> --- epoll_type.h | 2 + passt.c | 4 + passt.h | 12 +- tap.c | 316 ++++++++++++++++++++++++++++++++++++++++++++++++++- tap.h | 2 + 5 files changed, 334 insertions(+), 2 deletions(-) diff --git a/epoll_type.h b/epoll_type.h index 7f2a121..6284c79 100644 --- a/epoll_type.h +++ b/epoll_type.h @@ -44,6 +44,8 @@ enum epoll_type { EPOLL_TYPE_REPAIR_LISTEN, /* TCP_REPAIR helper socket */ EPOLL_TYPE_REPAIR, + /* vhost-kernel call socket */ + EPOLL_TYPE_VHOST_CALL, EPOLL_NUM_TYPES, }; diff --git a/passt.c b/passt.c index cd06772..19c5d5f 100644 --- a/passt.c +++ b/passt.c @@ -79,6 +79,7 @@ char *epoll_type_str[] = { [EPOLL_TYPE_VHOST_KICK] = "vhost-user kick socket", [EPOLL_TYPE_REPAIR_LISTEN] = "TCP_REPAIR helper listening socket", [EPOLL_TYPE_REPAIR] = "TCP_REPAIR helper socket", + [EPOLL_TYPE_VHOST_CALL] = "vhost-kernel call socket", }; static_assert(ARRAY_SIZE(epoll_type_str) == EPOLL_NUM_TYPES, "epoll_type_str[] doesn't match enum epoll_type"); @@ -357,6 +358,9 @@ loop: case EPOLL_TYPE_REPAIR: repair_handler(&c, eventmask); break; + case EPOLL_TYPE_VHOST_CALL: + vhost_call_cb(&c, ref, &now); + break; default: /* Can't happen */ ASSERT(0); diff --git a/passt.h b/passt.h index 8f45091..eb5aa03 100644 --- a/passt.h +++ b/passt.h @@ -45,7 +45,7 @@ union epoll_ref; * @icmp: ICMP-specific reference part * @data: Data handled by protocol handlers * @nsdir_fd: netns dirfd for fallback timer checking if namespace is gone - * @queue: vhost-user queue index for this fd + * @queue: vhost queue index for this fd * @u64: Opaque reference for epoll_ctl() and epoll_wait() 
*/ union epoll_ref { @@ -271,11 +271,14 @@ struct ctx { int fd_tap; int fd_repair_listen; int fd_repair; + /* TODO document all added fields */ + int fd_vhost; unsigned char our_tap_mac[ETH_ALEN]; unsigned char guest_mac[ETH_ALEN]; uint16_t mtu; uint64_t hash_secret[2]; + uint64_t virtio_features; int ifi4; struct ip4_ctx ip4; @@ -299,6 +302,13 @@ struct ctx { int no_icmp; struct icmp_ctx icmp; + struct { + uint16_t last_used_idx; + + int kick_fd; + int call_fd; + } vq[2];It might be possible to share some of these with vhost-user as well, but that's a detail.int no_dns; int no_dns_search; int no_dhcp_dns; diff --git a/tap.c b/tap.c index ce859ba..fbe83aa 100644 --- a/tap.c +++ b/tap.c @@ -31,6 +31,7 @@ #include <sys/types.h> #include <sys/stat.h> #include <fcntl.h> +#include <sys/eventfd.h> #include <sys/uio.h> #include <stdbool.h> #include <stdlib.h> @@ -82,6 +83,46 @@ static PACKET_POOL_NOINIT(pool_tap6, TAP_MSGS, pkt_buf); #define TAP_SEQS 128 /* Different L4 tuples in one batch */ #define FRAGMENT_MSG_RATE 10 /* # seconds between fragment warnings */ +#define VHOST_VIRTIO 0xAF +#define VHOST_GET_FEATURES _IOR(VHOST_VIRTIO, 0x00, __u64) +#define VHOST_SET_FEATURES _IOW(VHOST_VIRTIO, 0x00, __u64) +#define VHOST_SET_OWNER _IO(VHOST_VIRTIO, 0x01) +#define VHOST_SET_MEM_TABLE _IOW(VHOST_VIRTIO, 0x03, struct vhost_memory) +#define VHOST_SET_VRING_NUM _IOW(VHOST_VIRTIO, 0x10, struct vhost_vring_state) +#define VHOST_SET_VRING_ADDR _IOW(VHOST_VIRTIO, 0x11, struct vhost_vring_addr) +#define VHOST_SET_VRING_KICK _IOW(VHOST_VIRTIO, 0x20, struct vhost_vring_file) +#define VHOST_SET_VRING_CALL _IOW(VHOST_VIRTIO, 0x21, struct vhost_vring_file) +#define VHOST_SET_VRING_ERR _IOW(VHOST_VIRTIO, 0x22, struct vhost_vring_file) +#define VHOST_SET_BACKEND_FEATURES _IOW(VHOST_VIRTIO, 0x25, __u64) +#define VHOST_NET_SET_BACKEND _IOW(VHOST_VIRTIO, 0x30, struct vhost_vring_file)I think there's enough complexity in the vhost stuff that it's probably worth putting it into new files 
rather than adding to tap.c+char virtio_rx_pkt_buf[PKT_BUF_BYTES] __attribute__((aligned(PAGE_SIZE)));Again as discussed on the call, I think we can just re-use the existing pkt_buf for this.+static PACKET_POOL_NOINIT(pool_virtiorx4, TAP_MSGS, virtio_rx_pkt_buf); +static PACKET_POOL_NOINIT(pool_virtiorx6, TAP_MSGS, virtio_rx_pkt_buf); + +/* TODO is it better to have 1024 or 65520 bytes per packet? */I'm a little confused by this comment. We certainly want to allow 64k packets/frames, but the define below seems like it's a number of descriptors not a size of frame.+#define VHOST_NDESCS 1024 +static struct vring_desc vring_desc[2][VHOST_NDESCS] __attribute__((aligned(PAGE_SIZE))); +static union { + struct vring_avail avail; + char buf[offsetof(struct vring_avail, ring[VHOST_NDESCS])]; +} vring_avail_0 __attribute__((aligned(PAGE_SIZE))), vring_avail_1 __attribute__((aligned(PAGE_SIZE))); +static union { + struct vring_used used; + char buf[offsetof(struct vring_used, ring[VHOST_NDESCS])]; +} vring_used_0 __attribute__((aligned(PAGE_SIZE))), vring_used_1 __attribute__((aligned(PAGE_SIZE)));Some comments describing what these unions do would be helpful.+/* all descs ring + 2rings * 2vqs + tx pkt buf + rx pkt buf */ +#define N_VHOST_REGIONS 7 +union { + struct vhost_memory mem; + char buf[offsetof(struct vhost_memory, regions[N_VHOST_REGIONS])]; +} vhost_memory = { + .mem = { + .nregions = N_VHOST_REGIONS, + }, +}; + /** * tap_l2_max_len() - Maximum frame size (including L2 header) for current mode * @c: Execution context @@ -1360,6 +1401,89 @@ void tap_listen_handler(struct ctx *c, uint32_t events) tap_start_connection(c); } +static void *virtqueue_get_buf(struct ctx *c, unsigned qid, unsigned *len) +{ + struct vring_used *used = !qid ? 
&vring_used_0.used : &vring_used_1.used; + uint32_t i; + uint16_t used_idx, last_used; + + /* TODO think if this has races with previous eventfd_read */ + /* TODO we could improve performance with a shadow_used_idx */ + used_idx = le16toh(used->idx); + + smp_rmb(); + if (used_idx == c->vq[qid].last_used_idx) { + *len = 0; + return NULL; + } + + last_used = c->vq[qid].last_used_idx & VHOST_NDESCS; + i = le32toh(used->ring[last_used].id); + *len = le32toh(used->ring[last_used].len); + + if (i > VHOST_NDESCS) { + /* TODO improve this, it will cause infinite loop */ + warn("vhost: id %u at used position %u out of range (max=%u)", i, last_used, VHOST_NDESCS);Does this indicate the kernel has done something wrong / unexpected? If so, it's probably ok to just die().+ return NULL; + } + + if (*len > PKT_BUF_BYTES/VHOST_NDESCS) { + warn("vhost: id %d len %u > %zu", i, *len, PKT_BUF_BYTES/VHOST_NDESCS); + return NULL;As discussed on the call, we always want there to be room for maximum size of frame. You can actually define that maximum frame size to suit yourself, say L2_MAX_LEN_VHOST similar to the existing L2_MAX_LEN_PASTA, L2_MAX_LEN_VU etc, but it should be in the vicinity of 64k.
Then you should set the number of descriptors based on how many frames you can fit in the buffer, rather than changing the maximum frame size based on the number of descriptors.+ } + + /* TODO check if the id is valid and it has not been double used */ + c->vq[qid].last_used_idx++; + return virtio_rx_pkt_buf + i * (PKT_BUF_BYTES/VHOST_NDESCS); +} + +/* container is tx but we receive it from vhost POV */You're probably already aware, but we're pretty strict about having kerneldoc style function header comments.+void vhost_call_cb(struct ctx *c, union epoll_ref ref, const struct timespec *now)For consistency with the other backends, I'd suggest calling this "tap_vhost_input()".+{ + eventfd_read(ref.fd, (eventfd_t[]){ 0 }); + + tap_flush_pools(); + + while (true) { + struct virtio_net_hdr_mrg_rxbuf *hdr; + unsigned len; + + hdr = virtqueue_get_buf(c, ref.queue, &len); + if (!hdr) + break; + + if (len < sizeof(*hdr)) { + warn("vhost: invalid len %u", len); + continue; + } + + /* TODO this will break from this moment */ + if (hdr->num_buffers != 1) { + warn("vhost: Too many buffers %u, %zu bytes should be enough for everybody!", hdr->num_buffers, PKT_BUF_BYTES/VHOST_NDESCS); + continue; + } + + /* TODO fix the v6 pool to support ipv6 */ + tap_add_packet(c, len - sizeof(*hdr), (void *)(hdr+1), pool_virtiorx4, pool_virtiorx6); + } + + tap_handler(c, now, pool_virtiorx4, pool_virtiorx6); +} + +/* TODO: Actually refill */ +static void rx_pkt_refill(int kick_fd) +{ + for (unsigned i = 0; i < VHOST_NDESCS; ++i) { + vring_desc[0][i].addr = (uintptr_t)virtio_rx_pkt_buf + i * (PKT_BUF_BYTES/VHOST_NDESCS); + vring_desc[0][i].len = PKT_BUF_BYTES/VHOST_NDESCS; + vring_desc[0][i].flags = VRING_DESC_F_WRITE; + } + + vring_avail_0.avail.idx = VHOST_NDESCS; + eventfd_write(kick_fd, 1); +} + /** * tap_ns_tun() - Get tuntap fd in namespace * @c: Execution context @@ -1370,10 +1494,13 @@ void tap_listen_handler(struct ctx *c, uint32_t events) */ static int tap_ns_tun(void *arg) { + /* TODO
we need to check if vhost support VIRTIO_NET_F_MRG_RXBUF and VHOST_NET_F_VIRTIO_NET_HDR actually */ + static const uint64_t features = + (1ULL << VIRTIO_F_VERSION_1) | (1ULL << VIRTIO_NET_F_MRG_RXBUF) | (1ULL << VHOST_NET_F_VIRTIO_NET_HDR); struct ifreq ifr = { .ifr_flags = IFF_TAP | IFF_NO_PI }; int flags = O_RDWR | O_NONBLOCK | O_CLOEXEC; struct ctx *c = (struct ctx *)arg; - int fd, rc; + int fd, vhost_fd, rc; c->fd_tap = -1; memcpy(ifr.ifr_name, c->pasta_ifn, IFNAMSIZ); @@ -1383,6 +1510,175 @@ static int tap_ns_tun(void *arg) if (fd < 0) die_perror("Failed to open() /dev/net/tun");Probably worth moving most of the vhost user specific stuff into a separate function.+ vhost_fd = open("/dev/vhost-net", flags); + if (vhost_fd < 0) + die_perror("Failed to open() /dev/vhost-net"); + + rc = ioctl(vhost_fd, VHOST_SET_OWNER, NULL); + if (rc < 0) + die_perror("VHOST_SET_OWNER ioctl on /dev/vhost-net failed"); + + rc = ioctl(vhost_fd, VHOST_GET_FEATURES, &c->virtio_features); + if (rc < 0) + die_perror("VHOST_GET_FEATURES ioctl on /dev/vhost-net failed"); + + /* TODO inform more explicitely */ + fprintf(stderr, "vhost features: %lx\n", c->virtio_features); + fprintf(stderr, "req features: %lx\n", features); + c->virtio_features &= features; + if (c->virtio_features != features) + die("vhost does not support required features"); + + for (int i = 0; i < ARRAY_SIZE(c->vq); i++) { + struct vhost_vring_file file = { + .index = i, + }; + union epoll_ref ref = { .type = EPOLL_TYPE_VHOST_CALL, + .queue = i }; + struct epoll_event ev; + + file.fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC); + ref.fd = file.fd; + if (file.fd < 0) + die_perror("Failed to create call eventfd"); + + rc = ioctl(vhost_fd, VHOST_SET_VRING_CALL, &file); + if (rc < 0) + die_perror( + "VHOST_SET_VRING_CALL ioctl on /dev/vhost-net failed"); + + ev = (struct epoll_event){ .data.u64 = ref.u64, .events = EPOLLIN }; + rc = epoll_ctl(c->epollfd, EPOLL_CTL_ADD, ref.fd, &ev); + if (rc < 0) + die_perror("Failed to add 
call eventfd to epoll"); + c->vq[i].call_fd = file.fd; + } + + /* 1:1 translation */ + vhost_memory.mem.regions[0] = (struct vhost_memory_region){ + .guest_phys_addr = (uintptr_t)&vring_desc, + /* memory size should include the last byte, but we probably never send + * ptrs there so...Not sure what this comment is getting at.+ */ + .memory_size = sizeof(vring_desc), + .userspace_addr = (uintptr_t)&vring_desc, + }; + vhost_memory.mem.regions[1] = (struct vhost_memory_region){ + .guest_phys_addr = (uintptr_t)&vring_avail_0, + /* memory size should include the last byte, but we probably never send + * ptrs there so... + */ + .memory_size = sizeof(vring_avail_0), + .userspace_addr = (uintptr_t)&vring_avail_0, + }; + vhost_memory.mem.regions[2] = (struct vhost_memory_region){ + .guest_phys_addr = (uintptr_t)&vring_avail_1, + /* memory size should include the last byte, but we probably never send + * ptrs there so... + */ + .memory_size = sizeof(vring_avail_1), + .userspace_addr = (uintptr_t)&vring_avail_1, + }; + vhost_memory.mem.regions[3] = (struct vhost_memory_region){ + .guest_phys_addr = (uintptr_t)&vring_used_0, + /* memory size should include the last byte, but we probably never send + * ptrs there so... + */ + .memory_size = sizeof(vring_avail_0), + .userspace_addr = (uintptr_t)&vring_used_0, + }; + vhost_memory.mem.regions[4] = (struct vhost_memory_region){ + .guest_phys_addr = (uintptr_t)&vring_used_1, + /* memory size should include the last byte, but we probably never send + * ptrs there so... + */ + .memory_size = sizeof(vring_avail_1), + .userspace_addr = (uintptr_t)&vring_used_1, + }; + vhost_memory.mem.regions[5] = (struct vhost_memory_region){ + .guest_phys_addr = (uintptr_t)virtio_rx_pkt_buf, + /* memory size should include the last byte, but we probably never send + * ptrs there so... 
+ */ + .memory_size = sizeof(virtio_rx_pkt_buf), + .userspace_addr = (uintptr_t)virtio_rx_pkt_buf, + }; + vhost_memory.mem.regions[6] = (struct vhost_memory_region){ + .guest_phys_addr = (uintptr_t)pkt_buf, + /* memory size should include the last byte, but we probably never send + * ptrs there so... + */ + .memory_size = sizeof(pkt_buf), + .userspace_addr = (uintptr_t)pkt_buf, + }; + static_assert(6 < N_VHOST_REGIONS); + + rc = ioctl(vhost_fd, VHOST_SET_MEM_TABLE, &vhost_memory.mem); + if (rc < 0) + die_perror( + "VHOST_SET_MEM_TABLE ioctl on /dev/vhost-net failed"); + + /* TODO: probably it increases RX perf */ +#if 0 + struct ifreq ifr; + memset(&ifr, 0, sizeof(ifr)); + + if (ioctl(fd, TUNGETIFF, &ifr) != 0) + die_perror("Unable to query TUNGETIFF on FD %d", fd); + } + + if (ifr.ifr_flags & IFF_VNET_HDR) + net->dev.features &= ~(1ULL << VIRTIO_NET_F_MRG_RXBUF); +#endif + rc = ioctl(vhost_fd, VHOST_SET_FEATURES, &c->virtio_features); + if (rc < 0) + die_perror("VHOST_SET_FEATURES ioctl on /dev/vhost-net failed"); + + /* Duplicating foreach queue to follow the exact order from QEMU */ + for (int i = 0; i < ARRAY_SIZE(c->vq); i++) { + struct vhost_vring_addr addr = { + .index = i, + .desc_user_addr = (unsigned long)vring_desc[i], + .avail_user_addr = i == 0 ? (unsigned long)&vring_avail_0 : + (unsigned long)&vring_avail_1, + .used_user_addr = i == 0 ? (unsigned long)&vring_used_0 : + (unsigned long)&vring_used_1, + /* GPA addr */ + .log_guest_addr = i == 0 ? 
(unsigned long)&vring_used_0 : + (unsigned long)&vring_used_1, + }; + struct vhost_vring_state state = { + .index = i, + .num = VHOST_NDESCS, + }; + struct vhost_vring_file file = { + .index = i, + }; + + rc = ioctl(vhost_fd, VHOST_SET_VRING_NUM, &state); + if (rc < 0) + die_perror( + "VHOST_SET_VRING_NUM ioctl on /dev/vhost-net failed"); + + fprintf(stderr, "qid: %d\n", i); + fprintf(stderr, "vhost desc addr: 0x%llx\n", addr.desc_user_addr); + fprintf(stderr, "vhost avail addr: 0x%llx\n", addr.avail_user_addr); + fprintf(stderr, "vhost used addr: 0x%llx\n", addr.used_user_addr); + rc = ioctl(vhost_fd, VHOST_SET_VRING_ADDR, &addr); + if (rc < 0) + die_perror( + "VHOST_SET_VRING_ADDR ioctl on /dev/vhost-net failed"); + + file.fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC); + if (file.fd < 0) + die_perror("Failed to create kick eventfd"); + rc = ioctl(vhost_fd, VHOST_SET_VRING_KICK, &file); + if (rc < 0) + die_perror( + "VHOST_SET_VRING_KICK ioctl on /dev/vhost-net failed"); + c->vq[i].kick_fd = file.fd; + } + rc = ioctl(fd, (int)TUNSETIFF, &ifr); if (rc < 0) die_perror("TUNSETIFF ioctl on /dev/net/tun failed"); @@ -1390,7 +1686,25 @@ static int tap_ns_tun(void *arg) if (!(c->pasta_ifi = if_nametoindex(c->pasta_ifn))) die("Tap device opened but no network interface found"); + rx_pkt_refill(c->vq[0].kick_fd); + + /* Duplicating foreach queue to follow the exact order from QEMU */ + for (int i = 0; i < ARRAY_SIZE(c->vq); i++) { + struct vhost_vring_file file = { + .index = i, + .fd = fd, + }; + + fprintf(stderr, "qid: %d\n", file.index); + fprintf(stderr, "tap fd: %d\n", file.fd); + rc = ioctl(vhost_fd, VHOST_NET_SET_BACKEND, &file); + if (rc < 0) + die_perror( + "VHOST_NET_SET_BACKEND ioctl on /dev/vhost-net failed"); + } + c->fd_tap = fd; + c->fd_vhost = vhost_fd; return 0; } diff --git a/tap.h b/tap.h index 0b5ad17..91d3f62 100644 --- a/tap.h +++ b/tap.h @@ -69,6 +69,8 @@ static inline void tap_hdr_update(struct tap_hdr *thdr, size_t l2len) thdr->vnet_len = 
htonl(l2len); } +void vhost_call_cb(struct ctx *c, union epoll_ref ref, const struct timespec *now); + unsigned long tap_l2_max_len(const struct ctx *c); void *tap_push_l2h(const struct ctx *c, void *buf, uint16_t proto); void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src,-- David Gibson (he or they) | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you, not the other way | around. http://www.ozlabs.org/~dgibson
In case the kernel needs to signal an error. Signed-off-by: Eugenio Pérez <eperezma(a)redhat.com> --- epoll_type.h | 2 ++ passt.c | 4 ++++ passt.h | 1 + tap.c | 16 ++++++++++++++++ 4 files changed, 23 insertions(+) diff --git a/epoll_type.h b/epoll_type.h index 6284c79..6b320db 100644 --- a/epoll_type.h +++ b/epoll_type.h @@ -46,6 +46,8 @@ enum epoll_type { EPOLL_TYPE_REPAIR, /* vhost-kernel call socket */ EPOLL_TYPE_VHOST_CALL, + /* vhost-kernel error socket */ + EPOLL_TYPE_VHOST_ERROR, EPOLL_NUM_TYPES, }; diff --git a/passt.c b/passt.c index 19c5d5f..2779e0b 100644 --- a/passt.c +++ b/passt.c @@ -80,6 +80,7 @@ char *epoll_type_str[] = { [EPOLL_TYPE_REPAIR_LISTEN] = "TCP_REPAIR helper listening socket", [EPOLL_TYPE_REPAIR] = "TCP_REPAIR helper socket", [EPOLL_TYPE_VHOST_CALL] = "vhost-kernel call socket", + [EPOLL_TYPE_VHOST_ERROR] = "vhost-kernel error socket", }; static_assert(ARRAY_SIZE(epoll_type_str) == EPOLL_NUM_TYPES, "epoll_type_str[] doesn't match enum epoll_type"); @@ -361,6 +362,9 @@ loop: case EPOLL_TYPE_VHOST_CALL: vhost_call_cb(&c, ref, &now); break; + case EPOLL_TYPE_VHOST_ERROR: + die("Error on vhost-kernel socket"); + break; default: /* Can't happen */ ASSERT(0); diff --git a/passt.h b/passt.h index eb5aa03..9e42f3b 100644 --- a/passt.h +++ b/passt.h @@ -307,6 +307,7 @@ struct ctx { int kick_fd; int call_fd; + int err_fd; } vq[2]; int no_dns; diff --git a/tap.c b/tap.c index fbe83aa..b02d3da 100644 --- a/tap.c +++ b/tap.c @@ -1552,6 +1552,22 @@ static int tap_ns_tun(void *arg) if (rc < 0) die_perror("Failed to add call eventfd to epoll"); c->vq[i].call_fd = file.fd; + + file.fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC); + if (file.fd < 0) + die_perror("Failed to create error eventfd"); + + rc = ioctl(vhost_fd, VHOST_SET_VRING_ERR, &file); + if (rc < 0) + die_perror( + "VHOST_SET_VRING_ERR ioctl on /dev/vhost-net failed"); + + ref.type = EPOLL_TYPE_VHOST_ERROR, ref.fd = file.fd; + ev.data.u64 = ref.u64; + rc = epoll_ctl(c->epollfd, EPOLL_CTL_ADD, 
ref.fd, &ev); + if (rc < 0) + die_perror("Failed to add error eventfd to epoll"); + c->vq[i].err_fd = file.fd; } /* 1:1 translation */ -- 2.49.0
On Tue, Apr 01, 2025 at 01:38:09PM +0200, Eugenio Pérez wrote:In case the kernel needs to signal an error. Signed-off-by: Eugenio Pérez <eperezma(a)redhat.com> --- epoll_type.h | 2 ++ passt.c | 4 ++++ passt.h | 1 + tap.c | 16 ++++++++++++++++ 4 files changed, 23 insertions(+) diff --git a/epoll_type.h b/epoll_type.h index 6284c79..6b320db 100644 --- a/epoll_type.h +++ b/epoll_type.h @@ -46,6 +46,8 @@ enum epoll_type { EPOLL_TYPE_REPAIR, /* vhost-kernel call socket */ EPOLL_TYPE_VHOST_CALL, + /* vhost-kernel error socket */ + EPOLL_TYPE_VHOST_ERROR, EPOLL_NUM_TYPES, }; diff --git a/passt.c b/passt.c index 19c5d5f..2779e0b 100644 --- a/passt.c +++ b/passt.c @@ -80,6 +80,7 @@ char *epoll_type_str[] = { [EPOLL_TYPE_REPAIR_LISTEN] = "TCP_REPAIR helper listening socket", [EPOLL_TYPE_REPAIR] = "TCP_REPAIR helper socket", [EPOLL_TYPE_VHOST_CALL] = "vhost-kernel call socket", + [EPOLL_TYPE_VHOST_ERROR] = "vhost-kernel error socket", }; static_assert(ARRAY_SIZE(epoll_type_str) == EPOLL_NUM_TYPES, "epoll_type_str[] doesn't match enum epoll_type"); @@ -361,6 +362,9 @@ loop: case EPOLL_TYPE_VHOST_CALL: vhost_call_cb(&c, ref, &now); break; + case EPOLL_TYPE_VHOST_ERROR: + die("Error on vhost-kernel socket"); + break; default: /* Can't happen */ ASSERT(0); diff --git a/passt.h b/passt.h index eb5aa03..9e42f3b 100644 --- a/passt.h +++ b/passt.h @@ -307,6 +307,7 @@ struct ctx { int kick_fd; int call_fd; + int err_fd; } vq[2]; int no_dns; diff --git a/tap.c b/tap.c index fbe83aa..b02d3da 100644 --- a/tap.c +++ b/tap.c @@ -1552,6 +1552,22 @@ static int tap_ns_tun(void *arg) if (rc < 0) die_perror("Failed to add call eventfd to epoll"); c->vq[i].call_fd = file.fd; + + file.fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC); + if (file.fd < 0) + die_perror("Failed to create error eventfd"); + + rc = ioctl(vhost_fd, VHOST_SET_VRING_ERR, &file); + if (rc < 0) + die_perror( + "VHOST_SET_VRING_ERR ioctl on /dev/vhost-net failed"); + + ref.type = EPOLL_TYPE_VHOST_ERROR, ref.fd = file.fd; + 
ev.data.u64 = ref.u64; + rc = epoll_ctl(c->epollfd, EPOLL_CTL_ADD, ref.fd, &ev); + if (rc < 0) + die_perror("Failed to add error eventfd to epoll"); + c->vq[i].err_fd = file.fd;If it's possible to re-initialize vhost, it would be nice to do so eventually, but for now die()ing is perfectly reasonable.} /* 1:1 translation */-- David Gibson (he or they) | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you, not the other way | around. http://www.ozlabs.org/~dgibson
On Thu, Apr 3, 2025 at 3:51 AM David Gibson <david(a)gibson.dropbear.id.au> wrote:On Tue, Apr 01, 2025 at 01:38:09PM +0200, Eugenio Pérez wrote:That's a good point, adding TODO!In case the kernel needs to signal an error. Signed-off-by: Eugenio Pérez <eperezma(a)redhat.com> --- epoll_type.h | 2 ++ passt.c | 4 ++++ passt.h | 1 + tap.c | 16 ++++++++++++++++ 4 files changed, 23 insertions(+) diff --git a/epoll_type.h b/epoll_type.h index 6284c79..6b320db 100644 --- a/epoll_type.h +++ b/epoll_type.h @@ -46,6 +46,8 @@ enum epoll_type { EPOLL_TYPE_REPAIR, /* vhost-kernel call socket */ EPOLL_TYPE_VHOST_CALL, + /* vhost-kernel error socket */ + EPOLL_TYPE_VHOST_ERROR, EPOLL_NUM_TYPES, }; diff --git a/passt.c b/passt.c index 19c5d5f..2779e0b 100644 --- a/passt.c +++ b/passt.c @@ -80,6 +80,7 @@ char *epoll_type_str[] = { [EPOLL_TYPE_REPAIR_LISTEN] = "TCP_REPAIR helper listening socket", [EPOLL_TYPE_REPAIR] = "TCP_REPAIR helper socket", [EPOLL_TYPE_VHOST_CALL] = "vhost-kernel call socket", + [EPOLL_TYPE_VHOST_ERROR] = "vhost-kernel error socket", }; static_assert(ARRAY_SIZE(epoll_type_str) == EPOLL_NUM_TYPES, "epoll_type_str[] doesn't match enum epoll_type"); @@ -361,6 +362,9 @@ loop: case EPOLL_TYPE_VHOST_CALL: vhost_call_cb(&c, ref, &now); break; + case EPOLL_TYPE_VHOST_ERROR: + die("Error on vhost-kernel socket"); + break; default: /* Can't happen */ ASSERT(0); diff --git a/passt.h b/passt.h index eb5aa03..9e42f3b 100644 --- a/passt.h +++ b/passt.h @@ -307,6 +307,7 @@ struct ctx { int kick_fd; int call_fd; + int err_fd; } vq[2]; int no_dns; diff --git a/tap.c b/tap.c index fbe83aa..b02d3da 100644 --- a/tap.c +++ b/tap.c @@ -1552,6 +1552,22 @@ static int tap_ns_tun(void *arg) if (rc < 0) die_perror("Failed to add call eventfd to epoll"); c->vq[i].call_fd = file.fd; + + file.fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC); + if (file.fd < 0) + die_perror("Failed to create error eventfd"); + + rc = ioctl(vhost_fd, VHOST_SET_VRING_ERR, &file); + if (rc < 0) + die_perror( + 
"VHOST_SET_VRING_ERR ioctl on /dev/vhost-net failed"); + + ref.type = EPOLL_TYPE_VHOST_ERROR, ref.fd = file.fd; + ev.data.u64 = ref.u64; + rc = epoll_ctl(c->epollfd, EPOLL_CTL_ADD, ref.fd, &ev); + if (rc < 0) + die_perror("Failed to add error eventfd to epoll"); + c->vq[i].err_fd = file.fd;If it's possible to re-initialize vhost, it would be nice to do so eventually, but for now die()ing is perfectly reasonable.