On Wed, Apr 2, 2025 at 9:16 AM Stefano Brivio <sbrivio(a)redhat.com> wrote:
A couple of general notes:
- there's no need to Cc: people already on this list unless you need
their attention specifically (it can get quite noisy...)
Got it, I'll do that next time.
- things kind of make sense to me, many are hard to
evaluate at this
early stage, so I noted below just some specific comments/questions
here, but in the sense of "being on the same page" my current answer
is... yes, I guess so!
- I'm not reviewing 1/3 and 3/3 right away as I guess you'll revisit
them anyway, I'm just not sure we need a separate pool... but I'm not
sure if that's temporary either.
On Tue, 1 Apr 2025 13:38:08 +0200
Eugenio Pérez <eperezma(a)redhat.com> wrote:
This is the main callback when the tap device has
processed any buffer.
Another possibility is to reuse the tap callback for this, so less code
changes are needed.
Signed-off-by: Eugenio Pérez <eperezma(a)redhat.com>
---
epoll_type.h | 2 +
passt.c | 4 +
passt.h | 12 +-
tap.c | 316 ++++++++++++++++++++++++++++++++++++++++++++++++++-
tap.h | 2 +
5 files changed, 334 insertions(+), 2 deletions(-)
diff --git a/epoll_type.h b/epoll_type.h
index 7f2a121..6284c79 100644
--- a/epoll_type.h
+++ b/epoll_type.h
@@ -44,6 +44,8 @@ enum epoll_type {
EPOLL_TYPE_REPAIR_LISTEN,
/* TCP_REPAIR helper socket */
EPOLL_TYPE_REPAIR,
+ /* vhost-kernel call socket */
+ EPOLL_TYPE_VHOST_CALL,
EPOLL_NUM_TYPES,
};
diff --git a/passt.c b/passt.c
index cd06772..19c5d5f 100644
--- a/passt.c
+++ b/passt.c
@@ -79,6 +79,7 @@ char *epoll_type_str[] = {
[EPOLL_TYPE_VHOST_KICK] = "vhost-user kick socket",
[EPOLL_TYPE_REPAIR_LISTEN] = "TCP_REPAIR helper listening socket",
[EPOLL_TYPE_REPAIR] = "TCP_REPAIR helper socket",
+ [EPOLL_TYPE_VHOST_CALL] = "vhost-kernel call socket",
};
static_assert(ARRAY_SIZE(epoll_type_str) == EPOLL_NUM_TYPES,
"epoll_type_str[] doesn't match enum epoll_type");
@@ -357,6 +358,9 @@ loop:
case EPOLL_TYPE_REPAIR:
repair_handler(&c, eventmask);
break;
+ case EPOLL_TYPE_VHOST_CALL:
+ vhost_call_cb(&c, ref, &now);
+ break;
default:
/* Can't happen */
ASSERT(0);
diff --git a/passt.h b/passt.h
index 8f45091..eb5aa03 100644
--- a/passt.h
+++ b/passt.h
@@ -45,7 +45,7 @@ union epoll_ref;
* @icmp: ICMP-specific reference part
* @data: Data handled by protocol handlers
* @nsdir_fd: netns dirfd for fallback timer checking if namespace is gone
- * @queue: vhost-user queue index for this fd
+ * @queue: vhost queue index for this fd
* @u64: Opaque reference for epoll_ctl() and epoll_wait()
*/
union epoll_ref {
@@ -271,11 +271,14 @@ struct ctx {
int fd_tap;
int fd_repair_listen;
int fd_repair;
+ /* TODO document all added fields */
+ int fd_vhost;
unsigned char our_tap_mac[ETH_ALEN];
unsigned char guest_mac[ETH_ALEN];
uint16_t mtu;
uint64_t hash_secret[2];
+ uint64_t virtio_features;
int ifi4;
struct ip4_ctx ip4;
@@ -299,6 +302,13 @@ struct ctx {
int no_icmp;
struct icmp_ctx icmp;
+ struct {
+ uint16_t last_used_idx;
+
+ int kick_fd;
+ int call_fd;
+ } vq[2];
+
int no_dns;
int no_dns_search;
int no_dhcp_dns;
diff --git a/tap.c b/tap.c
index ce859ba..fbe83aa 100644
--- a/tap.c
+++ b/tap.c
@@ -31,6 +31,7 @@
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
+#include <sys/eventfd.h>
Why do we need eventfds here? Is there anything peculiar in the
protocol, or it's all stuff that can be done with "regular" file
descriptors plus epoll?
Yes, vhost-net relies on eventfd for signalling.
#include
<sys/uio.h>
#include <stdbool.h>
#include <stdlib.h>
@@ -82,6 +83,46 @@ static PACKET_POOL_NOINIT(pool_tap6, TAP_MSGS, pkt_buf);
#define TAP_SEQS 128 /* Different L4 tuples in one batch */
#define FRAGMENT_MSG_RATE 10 /* # seconds between fragment warnings */
+#define VHOST_VIRTIO 0xAF
+#define VHOST_GET_FEATURES _IOR(VHOST_VIRTIO, 0x00, __u64)
+#define VHOST_SET_FEATURES _IOW(VHOST_VIRTIO, 0x00, __u64)
+#define VHOST_SET_OWNER _IO(VHOST_VIRTIO, 0x01)
+#define VHOST_SET_MEM_TABLE _IOW(VHOST_VIRTIO, 0x03, struct vhost_memory)
+#define VHOST_SET_VRING_NUM _IOW(VHOST_VIRTIO, 0x10, struct vhost_vring_state)
+#define VHOST_SET_VRING_ADDR _IOW(VHOST_VIRTIO, 0x11, struct vhost_vring_addr)
+#define VHOST_SET_VRING_KICK _IOW(VHOST_VIRTIO, 0x20, struct vhost_vring_file)
+#define VHOST_SET_VRING_CALL _IOW(VHOST_VIRTIO, 0x21, struct vhost_vring_file)
+#define VHOST_SET_VRING_ERR _IOW(VHOST_VIRTIO, 0x22, struct vhost_vring_file)
+#define VHOST_SET_BACKEND_FEATURES _IOW(VHOST_VIRTIO, 0x25, __u64)
+#define VHOST_NET_SET_BACKEND _IOW(VHOST_VIRTIO, 0x30, struct vhost_vring_file)
Coding style (no strict requirement): align those nicely/table-like if
possible.
I'll fix it (and the rest of code style comments) in the next RFC!
+
+char virtio_rx_pkt_buf[PKT_BUF_BYTES] __attribute__((aligned(PAGE_SIZE)));
+static PACKET_POOL_NOINIT(pool_virtiorx4, TAP_MSGS, virtio_rx_pkt_buf);
+static PACKET_POOL_NOINIT(pool_virtiorx6, TAP_MSGS, virtio_rx_pkt_buf);
+
+/* TODO is it better to have 1024 or 65520 bytes per packet? */
In general 65535 bytes (including the Ethernet header) appears to be a
good idea, but actual profiling would be nice in the long term.
That can be done for sure.
+#define
VHOST_NDESCS 1024
+static struct vring_desc vring_desc[2][VHOST_NDESCS]
__attribute__((aligned(PAGE_SIZE)));
Coding style, here and a bit all over the place: wrap to 80 columns
("net" kernel-like style).
+static union {
+ struct vring_avail avail;
+ char buf[offsetof(struct vring_avail, ring[VHOST_NDESCS])];
+} vring_avail_0 __attribute__((aligned(PAGE_SIZE))), vring_avail_1
__attribute__((aligned(PAGE_SIZE)));
+static union {
+ struct vring_used used;
+ char buf[offsetof(struct vring_used, ring[VHOST_NDESCS])];
+} vring_used_0 __attribute__((aligned(PAGE_SIZE))), vring_used_1
__attribute__((aligned(PAGE_SIZE)));
+
+/* all descs ring + 2rings * 2vqs + tx pkt buf + rx pkt buf */
What's the "all descs ring"? A short "theory of operation"
section
might help eventually.
The vhost kernel module expects these descriptor rings for the buffer
addresses & lengths. I can add more docs for sure.
+#define
N_VHOST_REGIONS 7
+union {
+ struct vhost_memory mem;
+ char buf[offsetof(struct vhost_memory, regions[N_VHOST_REGIONS])];
+} vhost_memory = {
+ .mem = {
+ .nregions = N_VHOST_REGIONS,
+ },
+};
+
/**
* tap_l2_max_len() - Maximum frame size (including L2 header) for current mode
* @c: Execution context
@@ -1360,6 +1401,89 @@ void tap_listen_handler(struct ctx *c, uint32_t events)
tap_start_connection(c);
}
+static void *virtqueue_get_buf(struct ctx *c, unsigned qid, unsigned *len)
+{
+ struct vring_used *used = !qid ? &vring_used_0.used : &vring_used_1.used;
+ uint32_t i;
+ uint16_t used_idx, last_used;
+
+ /* TODO think if this has races with previous eventfd_read */
+ /* TODO we could improve performance with a shadow_used_idx */
+ used_idx = le16toh(used->idx);
+
+ smp_rmb();
+
+ if (used_idx == c->vq[qid].last_used_idx) {
+ *len = 0;
+ return NULL;
+ }
+
+ last_used = c->vq[qid].last_used_idx & VHOST_NDESCS;
+ i = le32toh(used->ring[last_used].id);
+ *len = le32toh(used->ring[last_used].len);
+
+ if (i > VHOST_NDESCS) {
+ /* TODO imporove this, it will cause infinite loop */
+ warn("vhost: id %u at used position %u out of range (max=%u)", i,
last_used, VHOST_NDESCS);
+ return NULL;
+ }
+
+ if (*len > PKT_BUF_BYTES/VHOST_NDESCS) {
+ warn("vhost: id %d len %u > %zu", i, *len,
PKT_BUF_BYTES/VHOST_NDESCS);
+ return NULL;
+ }
+
+ /* TODO check if the id is valid and it has not been double used */
+ c->vq[qid].last_used_idx++;
+ return virtio_rx_pkt_buf + i * (PKT_BUF_BYTES/VHOST_NDESCS);
+}
+
+/* container is tx but we receive it from vhost POV */
+void vhost_call_cb(struct ctx *c, union epoll_ref ref, const struct timespec *now)
+{
+ eventfd_read(ref.fd, (eventfd_t[]){ 0 });
+
+ tap_flush_pools();
+
+ while (true) {
+ struct virtio_net_hdr_mrg_rxbuf *hdr;
+ unsigned len;
+
+ hdr = virtqueue_get_buf(c, ref.queue, &len);
+ if (!hdr)
+ break;
+
+ if (len < sizeof(*hdr)) {
+ warn("vhost: invalid len %u", len);
+ continue;
+ }
+
+ /* TODO this will break from this moment */
+ if (hdr->num_buffers != 1) {
+ warn("vhost: Too many buffers %u, %zu bytes should be enough
for everybody!", hdr->num_buffers, PKT_BUF_BYTES/VHOST_NDESCS);
+ continue;
+ }
+
+ /* TODO fix the v6 pool to support ipv6 */
+ tap_add_packet(c, len - sizeof(*hdr), (void *)(hdr+1), pool_virtiorx4,
pool_virtiorx6);
+ }
+
+ tap_handler(c, now, pool_virtiorx4, pool_virtiorx6);
+}
+
+/* TODO: Actually refill */
+static void rx_pkt_refill(int kick_fd)
+{
+ for (unsigned i = 0; i < VHOST_NDESCS; ++i) {
+ vring_desc[0][i].addr = (uintptr_t)virtio_rx_pkt_buf + i *
(PKT_BUF_BYTES/VHOST_NDESCS);
+ vring_desc[0][i].len = PKT_BUF_BYTES/VHOST_NDESCS;
+ vring_desc[0][i].flags = VRING_DESC_F_WRITE;
+ }
+
+ vring_avail_0.avail.idx = VHOST_NDESCS;
+ eventfd_write(kick_fd, 1);
+}
+
/**
* tap_ns_tun() - Get tuntap fd in namespace
* @c: Execution context
@@ -1370,10 +1494,13 @@ void tap_listen_handler(struct ctx *c, uint32_t events)
*/
static int tap_ns_tun(void *arg)
{
+ /* TODO we need to check if vhost support VIRTIO_NET_F_MRG_RXBUF and
VHOST_NET_F_VIRTIO_NET_HDR actually */
+ static const uint64_t features =
+ (1ULL << VIRTIO_F_VERSION_1) | (1ULL << VIRTIO_NET_F_MRG_RXBUF)
| (1ULL << VHOST_NET_F_VIRTIO_NET_HDR);
struct ifreq ifr = { .ifr_flags = IFF_TAP | IFF_NO_PI };
I kind of wonder, by the way, if IFF_TUN simplifies things here. It's
something we should already add, as an option, see also:
https://bugs.passt.top/show_bug.cgi?id=49, but if it makes your life
easier for any reason you might consider adding it right away.
I'm not sure, but maybe we can reuse something, yes!
int
flags = O_RDWR | O_NONBLOCK | O_CLOEXEC;
struct ctx *c = (struct ctx *)arg;
- int fd, rc;
+ int fd, vhost_fd, rc;
c->fd_tap = -1;
memcpy(ifr.ifr_name, c->pasta_ifn, IFNAMSIZ);
@@ -1383,6 +1510,175 @@ static int tap_ns_tun(void *arg)
if (fd < 0)
die_perror("Failed to open() /dev/net/tun");
+ vhost_fd = open("/dev/vhost-net", flags);
+ if (vhost_fd < 0)
+ die_perror("Failed to open() /dev/vhost-net");
Note pretty much to future self: this will need adjustments to AppArmor
and SELinux policies.
+
+ rc = ioctl(vhost_fd, VHOST_SET_OWNER, NULL);
+ if (rc < 0)
+ die_perror("VHOST_SET_OWNER ioctl on /dev/vhost-net failed");
+
+ rc = ioctl(vhost_fd, VHOST_GET_FEATURES, &c->virtio_features);
+ if (rc < 0)
+ die_perror("VHOST_GET_FEATURES ioctl on /dev/vhost-net failed");
+
+ /* TODO inform more explicitely */
+ fprintf(stderr, "vhost features: %lx\n", c->virtio_features);
+ fprintf(stderr, "req features: %lx\n", features);
+ c->virtio_features &= features;
+ if (c->virtio_features != features)
+ die("vhost does not support required features");
+
+ for (int i = 0; i < ARRAY_SIZE(c->vq); i++) {
No declarations directly in loops (it hides them somehow).
+ struct vhost_vring_file file = {
+ .index = i,
+ };
+ union epoll_ref ref = { .type = EPOLL_TYPE_VHOST_CALL,
+ .queue = i };
+ struct epoll_event ev;
+
+ file.fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
+ ref.fd = file.fd;
+ if (file.fd < 0)
+ die_perror("Failed to create call eventfd");
+
+ rc = ioctl(vhost_fd, VHOST_SET_VRING_CALL, &file);
+ if (rc < 0)
+ die_perror(
+ "VHOST_SET_VRING_CALL ioctl on /dev/vhost-net
failed");
Same as net kernel style: if it's more than a line, even as a single
statement, use curly brackets (rationale: somebody later adds another
statement without noticing and... oops).
+
+ ev = (struct epoll_event){ .data.u64 = ref.u64, .events = EPOLLIN };
+ rc = epoll_ctl(c->epollfd, EPOLL_CTL_ADD, ref.fd, &ev);
+ if (rc < 0)
+ die_perror("Failed to add call eventfd to epoll");
+ c->vq[i].call_fd = file.fd;
+ }
+
+ /* 1:1 translation */
+ vhost_memory.mem.regions[0] = (struct vhost_memory_region){
Space between cast and initialiser, ") {", for consistency. I'll wait
before we have some kind of theory of operation / general description
before actually looking into those, I'm not sure about the exact role of
those seven regions.
In the case of QEMU, vhost reads the descriptor ring with addresses in
the guest address space, as it is a direct communication between them
without QEMU intervening. These regions tell vhost how to translate
those addresses into addresses in the QEMU address space, so vhost can
use copy_to_user and copy_from_user.
+
.guest_phys_addr = (uintptr_t)&vring_desc,
+ /* memory size should include the last byte, but we probably never send
+ * ptrs there so...
+ */
+ .memory_size = sizeof(vring_desc),
+ .userspace_addr = (uintptr_t)&vring_desc,
+ };
+ vhost_memory.mem.regions[1] = (struct vhost_memory_region){
+ .guest_phys_addr = (uintptr_t)&vring_avail_0,
+ /* memory size should include the last byte, but we probably never send
+ * ptrs there so...
+ */
+ .memory_size = sizeof(vring_avail_0),
+ .userspace_addr = (uintptr_t)&vring_avail_0,
+ };
+ vhost_memory.mem.regions[2] = (struct vhost_memory_region){
+ .guest_phys_addr = (uintptr_t)&vring_avail_1,
+ /* memory size should include the last byte, but we probably never send
+ * ptrs there so...
+ */
+ .memory_size = sizeof(vring_avail_1),
+ .userspace_addr = (uintptr_t)&vring_avail_1,
+ };
+ vhost_memory.mem.regions[3] = (struct vhost_memory_region){
+ .guest_phys_addr = (uintptr_t)&vring_used_0,
+ /* memory size should include the last byte, but we probably never send
+ * ptrs there so...
+ */
+ .memory_size = sizeof(vring_avail_0),
+ .userspace_addr = (uintptr_t)&vring_used_0,
+ };
+ vhost_memory.mem.regions[4] = (struct vhost_memory_region){
+ .guest_phys_addr = (uintptr_t)&vring_used_1,
+ /* memory size should include the last byte, but we probably never send
+ * ptrs there so...
+ */
+ .memory_size = sizeof(vring_avail_1),
+ .userspace_addr = (uintptr_t)&vring_used_1,
+ };
+ vhost_memory.mem.regions[5] = (struct vhost_memory_region){
+ .guest_phys_addr = (uintptr_t)virtio_rx_pkt_buf,
+ /* memory size should include the last byte, but we probably never send
+ * ptrs there so...
+ */
+ .memory_size = sizeof(virtio_rx_pkt_buf),
+ .userspace_addr = (uintptr_t)virtio_rx_pkt_buf,
+ };
+ vhost_memory.mem.regions[6] = (struct vhost_memory_region){
+ .guest_phys_addr = (uintptr_t)pkt_buf,
+ /* memory size should include the last byte, but we probably never send
+ * ptrs there so...
+ */
+ .memory_size = sizeof(pkt_buf),
+ .userspace_addr = (uintptr_t)pkt_buf,
+ };
+ static_assert(6 < N_VHOST_REGIONS);
+
+ rc = ioctl(vhost_fd, VHOST_SET_MEM_TABLE, &vhost_memory.mem);
+ if (rc < 0)
+ die_perror(
+ "VHOST_SET_MEM_TABLE ioctl on /dev/vhost-net failed");
+
+ /* TODO: probably it increases RX perf */
+#if 0
+ struct ifreq ifr;
+ memset(&ifr, 0, sizeof(ifr));
+
+ if (ioctl(fd, TUNGETIFF, &ifr) != 0)
+ die_perror("Unable to query TUNGETIFF on FD %d", fd);
+ }
+
+ if (ifr.ifr_flags & IFF_VNET_HDR)
+ net->dev.features &= ~(1ULL << VIRTIO_NET_F_MRG_RXBUF);
+#endif
+ rc = ioctl(vhost_fd, VHOST_SET_FEATURES, &c->virtio_features);
+ if (rc < 0)
+ die_perror("VHOST_SET_FEATURES ioctl on /dev/vhost-net failed");
+
+ /* Duplicating foreach queue to follow the exact order from QEMU */
+ for (int i = 0; i < ARRAY_SIZE(c->vq); i++) {
+ struct vhost_vring_addr addr = {
+ .index = i,
+ .desc_user_addr = (unsigned long)vring_desc[i],
+ .avail_user_addr = i == 0 ? (unsigned long)&vring_avail_0 :
+ (unsigned
long)&vring_avail_1,
+ .used_user_addr = i == 0 ? (unsigned long)&vring_used_0 :
+ (unsigned
long)&vring_used_1,
+ /* GPA addr */
+ .log_guest_addr = i == 0 ? (unsigned long)&vring_used_0 :
+ (unsigned
long)&vring_used_1,
+ };
+ struct vhost_vring_state state = {
+ .index = i,
+ .num = VHOST_NDESCS,
+ };
+ struct vhost_vring_file file = {
+ .index = i,
+ };
+
+ rc = ioctl(vhost_fd, VHOST_SET_VRING_NUM, &state);
+ if (rc < 0)
+ die_perror(
+ "VHOST_SET_VRING_NUM ioctl on /dev/vhost-net
failed");
+
+ fprintf(stderr, "qid: %d\n", i);
+ fprintf(stderr, "vhost desc addr: 0x%llx\n",
addr.desc_user_addr);
+ fprintf(stderr, "vhost avail addr: 0x%llx\n",
addr.avail_user_addr);
+ fprintf(stderr, "vhost used addr: 0x%llx\n",
addr.used_user_addr);
+ rc = ioctl(vhost_fd, VHOST_SET_VRING_ADDR, &addr);
+ if (rc < 0)
+ die_perror(
+ "VHOST_SET_VRING_ADDR ioctl on /dev/vhost-net
failed");
+
+ file.fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
+ if (file.fd < 0)
+ die_perror("Failed to create kick eventfd");
+ rc = ioctl(vhost_fd, VHOST_SET_VRING_KICK, &file);
+ if (rc < 0)
+ die_perror(
+ "VHOST_SET_VRING_KICK ioctl on /dev/vhost-net
failed");
+ c->vq[i].kick_fd = file.fd;
+ }
+
rc = ioctl(fd, (int)TUNSETIFF, &ifr);
if (rc < 0)
die_perror("TUNSETIFF ioctl on /dev/net/tun failed");
@@ -1390,7 +1686,25 @@ static int tap_ns_tun(void *arg)
if (!(c->pasta_ifi = if_nametoindex(c->pasta_ifn)))
die("Tap device opened but no network interface found");
+ rx_pkt_refill(c->vq[0].kick_fd);
+
+ /* Duplicating foreach queue to follow the exact order from QEMU */
+ for (int i = 0; i < ARRAY_SIZE(c->vq); i++) {
+ struct vhost_vring_file file = {
+ .index = i,
+ .fd = fd,
+ };
+
+ fprintf(stderr, "qid: %d\n", file.index);
+ fprintf(stderr, "tap fd: %d\n", file.fd);
+ rc = ioctl(vhost_fd, VHOST_NET_SET_BACKEND, &file);
+ if (rc < 0)
+ die_perror(
+ "VHOST_NET_SET_BACKEND ioctl on /dev/vhost-net
failed");
+ }
+
c->fd_tap = fd;
+ c->fd_vhost = vhost_fd;
return 0;
}
diff --git a/tap.h b/tap.h
index 0b5ad17..91d3f62 100644
--- a/tap.h
+++ b/tap.h
@@ -69,6 +69,8 @@ static inline void tap_hdr_update(struct tap_hdr *thdr, size_t l2len)
thdr->vnet_len = htonl(l2len);
}
+void vhost_call_cb(struct ctx *c, union epoll_ref ref, const struct timespec *now);
+
unsigned long tap_l2_max_len(const struct ctx *c);
void *tap_push_l2h(const struct ctx *c, void *buf, uint16_t proto);
void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src,