This is the main callback when the tap device has processed any buffer. Another possibility is to reuse the tap callback for this, so less code changes are needed. Signed-off-by: Eugenio Pérez <eperezma(a)redhat.com> --- epoll_type.h | 2 + passt.c | 4 + passt.h | 12 +- tap.c | 316 ++++++++++++++++++++++++++++++++++++++++++++++++++- tap.h | 2 + 5 files changed, 334 insertions(+), 2 deletions(-) diff --git a/epoll_type.h b/epoll_type.h index 7f2a121..6284c79 100644 --- a/epoll_type.h +++ b/epoll_type.h @@ -44,6 +44,8 @@ enum epoll_type { EPOLL_TYPE_REPAIR_LISTEN, /* TCP_REPAIR helper socket */ EPOLL_TYPE_REPAIR, + /* vhost-kernel call socket */ + EPOLL_TYPE_VHOST_CALL, EPOLL_NUM_TYPES, }; diff --git a/passt.c b/passt.c index cd06772..19c5d5f 100644 --- a/passt.c +++ b/passt.c @@ -79,6 +79,7 @@ char *epoll_type_str[] = { [EPOLL_TYPE_VHOST_KICK] = "vhost-user kick socket", [EPOLL_TYPE_REPAIR_LISTEN] = "TCP_REPAIR helper listening socket", [EPOLL_TYPE_REPAIR] = "TCP_REPAIR helper socket", + [EPOLL_TYPE_VHOST_CALL] = "vhost-kernel call socket", }; static_assert(ARRAY_SIZE(epoll_type_str) == EPOLL_NUM_TYPES, "epoll_type_str[] doesn't match enum epoll_type"); @@ -357,6 +358,9 @@ loop: case EPOLL_TYPE_REPAIR: repair_handler(&c, eventmask); break; + case EPOLL_TYPE_VHOST_CALL: + vhost_call_cb(&c, ref, &now); + break; default: /* Can't happen */ ASSERT(0); diff --git a/passt.h b/passt.h index 8f45091..eb5aa03 100644 --- a/passt.h +++ b/passt.h @@ -45,7 +45,7 @@ union epoll_ref; * @icmp: ICMP-specific reference part * @data: Data handled by protocol handlers * @nsdir_fd: netns dirfd for fallback timer checking if namespace is gone - * @queue: vhost-user queue index for this fd + * @queue: vhost queue index for this fd * @u64: Opaque reference for epoll_ctl() and epoll_wait() */ union epoll_ref { @@ -271,11 +271,14 @@ struct ctx { int fd_tap; int fd_repair_listen; int fd_repair; + /* TODO document all added fields */ + int fd_vhost; unsigned char our_tap_mac[ETH_ALEN]; unsigned 
char guest_mac[ETH_ALEN]; uint16_t mtu; uint64_t hash_secret[2]; + uint64_t virtio_features; int ifi4; struct ip4_ctx ip4; @@ -299,6 +302,13 @@ struct ctx { int no_icmp; struct icmp_ctx icmp; + struct { + uint16_t last_used_idx; + + int kick_fd; + int call_fd; + } vq[2]; + int no_dns; int no_dns_search; int no_dhcp_dns; diff --git a/tap.c b/tap.c index ce859ba..fbe83aa 100644 --- a/tap.c +++ b/tap.c @@ -31,6 +31,7 @@ #include <sys/types.h> #include <sys/stat.h> #include <fcntl.h> +#include <sys/eventfd.h> #include <sys/uio.h> #include <stdbool.h> #include <stdlib.h> @@ -82,6 +83,46 @@ static PACKET_POOL_NOINIT(pool_tap6, TAP_MSGS, pkt_buf); #define TAP_SEQS 128 /* Different L4 tuples in one batch */ #define FRAGMENT_MSG_RATE 10 /* # seconds between fragment warnings */ +#define VHOST_VIRTIO 0xAF +#define VHOST_GET_FEATURES _IOR(VHOST_VIRTIO, 0x00, __u64) +#define VHOST_SET_FEATURES _IOW(VHOST_VIRTIO, 0x00, __u64) +#define VHOST_SET_OWNER _IO(VHOST_VIRTIO, 0x01) +#define VHOST_SET_MEM_TABLE _IOW(VHOST_VIRTIO, 0x03, struct vhost_memory) +#define VHOST_SET_VRING_NUM _IOW(VHOST_VIRTIO, 0x10, struct vhost_vring_state) +#define VHOST_SET_VRING_ADDR _IOW(VHOST_VIRTIO, 0x11, struct vhost_vring_addr) +#define VHOST_SET_VRING_KICK _IOW(VHOST_VIRTIO, 0x20, struct vhost_vring_file) +#define VHOST_SET_VRING_CALL _IOW(VHOST_VIRTIO, 0x21, struct vhost_vring_file) +#define VHOST_SET_VRING_ERR _IOW(VHOST_VIRTIO, 0x22, struct vhost_vring_file) +#define VHOST_SET_BACKEND_FEATURES _IOW(VHOST_VIRTIO, 0x25, __u64) +#define VHOST_NET_SET_BACKEND _IOW(VHOST_VIRTIO, 0x30, struct vhost_vring_file) + +char virtio_rx_pkt_buf[PKT_BUF_BYTES] __attribute__((aligned(PAGE_SIZE))); +static PACKET_POOL_NOINIT(pool_virtiorx4, TAP_MSGS, virtio_rx_pkt_buf); +static PACKET_POOL_NOINIT(pool_virtiorx6, TAP_MSGS, virtio_rx_pkt_buf); + +/* TODO is it better to have 1024 or 65520 bytes per packet? 
*/ +#define VHOST_NDESCS 1024 +static struct vring_desc vring_desc[2][VHOST_NDESCS] __attribute__((aligned(PAGE_SIZE))); +static union { + struct vring_avail avail; + char buf[offsetof(struct vring_avail, ring[VHOST_NDESCS])]; +} vring_avail_0 __attribute__((aligned(PAGE_SIZE))), vring_avail_1 __attribute__((aligned(PAGE_SIZE))); +static union { + struct vring_used used; + char buf[offsetof(struct vring_used, ring[VHOST_NDESCS])]; +} vring_used_0 __attribute__((aligned(PAGE_SIZE))), vring_used_1 __attribute__((aligned(PAGE_SIZE))); + +/* all descs ring + 2rings * 2vqs + tx pkt buf + rx pkt buf */ +#define N_VHOST_REGIONS 7 +union { + struct vhost_memory mem; + char buf[offsetof(struct vhost_memory, regions[N_VHOST_REGIONS])]; +} vhost_memory = { + .mem = { + .nregions = N_VHOST_REGIONS, + }, +}; + /** * tap_l2_max_len() - Maximum frame size (including L2 header) for current mode * @c: Execution context @@ -1360,6 +1401,89 @@ void tap_listen_handler(struct ctx *c, uint32_t events) tap_start_connection(c); } +static void *virtqueue_get_buf(struct ctx *c, unsigned qid, unsigned *len) +{ + struct vring_used *used = !qid ? 
&vring_used_0.used : &vring_used_1.used;
+	uint32_t i;
+	uint16_t used_idx, last_used;
+
+	/* TODO think if this has races with previous eventfd_read */
+	/* TODO we could improve performance with a shadow_used_idx */
+	used_idx = le16toh(used->idx);
+
+	smp_rmb();
+
+	if (used_idx == c->vq[qid].last_used_idx) {
+		*len = 0;
+		return NULL;
+	}
+
+	last_used = c->vq[qid].last_used_idx % VHOST_NDESCS; /* "& VHOST_NDESCS" was wrong: 1024 is not a mask, it yields only 0 or 1024 */
+	i = le32toh(used->ring[last_used].id);
+	*len = le32toh(used->ring[last_used].len);
+
+	if (i >= VHOST_NDESCS) { /* valid descriptor ids are 0..VHOST_NDESCS - 1 */
+		/* TODO improve this, it will cause infinite loop */
+		warn("vhost: id %u at used position %u out of range (max=%u)", i, last_used, VHOST_NDESCS);
+		return NULL;
+	}
+
+	if (*len > PKT_BUF_BYTES/VHOST_NDESCS) {
+		warn("vhost: id %u len %u > %zu", i, *len, PKT_BUF_BYTES/VHOST_NDESCS);
+		return NULL;
+	}
+
+	/* TODO check if the id is valid and it has not been double used */
+	c->vq[qid].last_used_idx++;
+	return virtio_rx_pkt_buf + i * (PKT_BUF_BYTES/VHOST_NDESCS);
+}
+
+/* container is tx but we receive it from vhost POV */
+void vhost_call_cb(struct ctx *c, union epoll_ref ref, const struct timespec *now)
+{
+	eventfd_read(ref.fd, (eventfd_t[]){ 0 }); /* best effort: just clear the notification */
+
+	tap_flush_pools();
+
+	while (true) {
+		struct virtio_net_hdr_mrg_rxbuf *hdr;
+		unsigned len;
+
+		hdr = virtqueue_get_buf(c, ref.queue, &len);
+		if (!hdr)
+			break;
+
+		if (len < sizeof(*hdr)) {
+			warn("vhost: invalid len %u", len);
+			continue;
+		}
+
+		/* TODO this will break from this moment */
+		if (hdr->num_buffers != 1) { /* NOTE(review): virtio 1.0 fields are LE; needs le16toh() on BE hosts */
+			warn("vhost: Too many buffers %u, %zu bytes should be enough for everybody!", hdr->num_buffers, PKT_BUF_BYTES/VHOST_NDESCS);
+			continue;
+		}
+
+		/* TODO fix the v6 pool to support ipv6 */
+		tap_add_packet(c, len - sizeof(*hdr), (void *)(hdr+1), pool_virtiorx4, pool_virtiorx6);
+	}
+
+	tap_handler(c, now, pool_virtiorx4, pool_virtiorx6);
+}
+
+/* TODO: Actually refill */
+static void rx_pkt_refill(int kick_fd)
+{
+	for (unsigned i = 0; i < VHOST_NDESCS; ++i) {
+
vring_desc[0][i].addr = (uintptr_t)virtio_rx_pkt_buf + i * (PKT_BUF_BYTES/VHOST_NDESCS); + vring_desc[0][i].len = PKT_BUF_BYTES/VHOST_NDESCS; + vring_desc[0][i].flags = VRING_DESC_F_WRITE; + } + + vring_avail_0.avail.idx = VHOST_NDESCS; + eventfd_write(kick_fd, 1); +} + /** * tap_ns_tun() - Get tuntap fd in namespace * @c: Execution context @@ -1370,10 +1494,13 @@ void tap_listen_handler(struct ctx *c, uint32_t events) */ static int tap_ns_tun(void *arg) { + /* TODO we need to check if vhost support VIRTIO_NET_F_MRG_RXBUF and VHOST_NET_F_VIRTIO_NET_HDR actually */ + static const uint64_t features = + (1ULL << VIRTIO_F_VERSION_1) | (1ULL << VIRTIO_NET_F_MRG_RXBUF) | (1ULL << VHOST_NET_F_VIRTIO_NET_HDR); struct ifreq ifr = { .ifr_flags = IFF_TAP | IFF_NO_PI }; int flags = O_RDWR | O_NONBLOCK | O_CLOEXEC; struct ctx *c = (struct ctx *)arg; - int fd, rc; + int fd, vhost_fd, rc; c->fd_tap = -1; memcpy(ifr.ifr_name, c->pasta_ifn, IFNAMSIZ); @@ -1383,6 +1510,175 @@ static int tap_ns_tun(void *arg) if (fd < 0) die_perror("Failed to open() /dev/net/tun"); + vhost_fd = open("/dev/vhost-net", flags); + if (vhost_fd < 0) + die_perror("Failed to open() /dev/vhost-net"); + + rc = ioctl(vhost_fd, VHOST_SET_OWNER, NULL); + if (rc < 0) + die_perror("VHOST_SET_OWNER ioctl on /dev/vhost-net failed"); + + rc = ioctl(vhost_fd, VHOST_GET_FEATURES, &c->virtio_features); + if (rc < 0) + die_perror("VHOST_GET_FEATURES ioctl on /dev/vhost-net failed"); + + /* TODO inform more explicitely */ + fprintf(stderr, "vhost features: %lx\n", c->virtio_features); + fprintf(stderr, "req features: %lx\n", features); + c->virtio_features &= features; + if (c->virtio_features != features) + die("vhost does not support required features"); + + for (int i = 0; i < ARRAY_SIZE(c->vq); i++) { + struct vhost_vring_file file = { + .index = i, + }; + union epoll_ref ref = { .type = EPOLL_TYPE_VHOST_CALL, + .queue = i }; + struct epoll_event ev; + + file.fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC); + ref.fd = 
file.fd;
+		if (file.fd < 0)
+			die_perror("Failed to create call eventfd");
+
+		rc = ioctl(vhost_fd, VHOST_SET_VRING_CALL, &file);
+		if (rc < 0)
+			die_perror(
+				"VHOST_SET_VRING_CALL ioctl on /dev/vhost-net failed");
+
+		ev = (struct epoll_event){ .data.u64 = ref.u64, .events = EPOLLIN };
+		rc = epoll_ctl(c->epollfd, EPOLL_CTL_ADD, ref.fd, &ev);
+		if (rc < 0)
+			die_perror("Failed to add call eventfd to epoll");
+		c->vq[i].call_fd = file.fd;
+	}
+
+	/* 1:1 translation */
+	vhost_memory.mem.regions[0] = (struct vhost_memory_region){
+		.guest_phys_addr = (uintptr_t)&vring_desc,
+		/* memory size should include the last byte, but we probably never send
+		 * ptrs there so...
+		 */
+		.memory_size = sizeof(vring_desc),
+		.userspace_addr = (uintptr_t)&vring_desc,
+	};
+	vhost_memory.mem.regions[1] = (struct vhost_memory_region){
+		.guest_phys_addr = (uintptr_t)&vring_avail_0,
+		/* memory size should include the last byte, but we probably never send
+		 * ptrs there so...
+		 */
+		.memory_size = sizeof(vring_avail_0),
+		.userspace_addr = (uintptr_t)&vring_avail_0,
+	};
+	vhost_memory.mem.regions[2] = (struct vhost_memory_region){
+		.guest_phys_addr = (uintptr_t)&vring_avail_1,
+		/* memory size should include the last byte, but we probably never send
+		 * ptrs there so...
+		 */
+		.memory_size = sizeof(vring_avail_1),
+		.userspace_addr = (uintptr_t)&vring_avail_1,
+	};
+	vhost_memory.mem.regions[3] = (struct vhost_memory_region){
+		.guest_phys_addr = (uintptr_t)&vring_used_0,
+		/* memory size should include the last byte, but we probably never send
+		 * ptrs there so...
+		 */
+		.memory_size = sizeof(vring_used_0), /* was sizeof(vring_avail_0): used ring is larger, region was under-sized */
+		.userspace_addr = (uintptr_t)&vring_used_0,
+	};
+	vhost_memory.mem.regions[4] = (struct vhost_memory_region){
+		.guest_phys_addr = (uintptr_t)&vring_used_1,
+		/* memory size should include the last byte, but we probably never send
+		 * ptrs there so...
+		 */
+		.memory_size = sizeof(vring_used_1), /* was sizeof(vring_avail_1): used ring is larger, region was under-sized */
+		.userspace_addr = (uintptr_t)&vring_used_1,
+	};
+	vhost_memory.mem.regions[5] = (struct vhost_memory_region){
+		.guest_phys_addr = (uintptr_t)virtio_rx_pkt_buf,
+		/* memory size should include the last byte, but we probably never send
+		 * ptrs there so...
+		 */
+		.memory_size = sizeof(virtio_rx_pkt_buf),
+		.userspace_addr = (uintptr_t)virtio_rx_pkt_buf,
+	};
+	vhost_memory.mem.regions[6] = (struct vhost_memory_region){
+		.guest_phys_addr = (uintptr_t)pkt_buf,
+		/* memory size should include the last byte, but we probably never send
+		 * ptrs there so...
+		 */
+		.memory_size = sizeof(pkt_buf),
+		.userspace_addr = (uintptr_t)pkt_buf,
+	};
+	static_assert(6 < N_VHOST_REGIONS, "vhost_memory regions[] overflow"); /* single-arg static_assert is C23-only */
+
+	rc = ioctl(vhost_fd, VHOST_SET_MEM_TABLE, &vhost_memory.mem);
+	if (rc < 0)
+		die_perror(
+			"VHOST_SET_MEM_TABLE ioctl on /dev/vhost-net failed");
+
+	/* TODO: probably it increases RX perf */
+#if 0
+	struct ifreq ifr;
+	memset(&ifr, 0, sizeof(ifr));
+
+	if (ioctl(fd, TUNGETIFF, &ifr) != 0)
+		die_perror("Unable to query TUNGETIFF on FD %d", fd);
+	}
+
+	if (ifr.ifr_flags & IFF_VNET_HDR)
+		net->dev.features &= ~(1ULL << VIRTIO_NET_F_MRG_RXBUF);
+#endif
+	rc = ioctl(vhost_fd, VHOST_SET_FEATURES, &c->virtio_features);
+	if (rc < 0)
+		die_perror("VHOST_SET_FEATURES ioctl on /dev/vhost-net failed");
+
+	/* Duplicating foreach queue to follow the exact order from QEMU */
+	for (int i = 0; i < ARRAY_SIZE(c->vq); i++) {
+		struct vhost_vring_addr addr = {
+			.index = i,
+			.desc_user_addr = (unsigned long)vring_desc[i],
+			.avail_user_addr = i == 0 ? (unsigned long)&vring_avail_0 :
+						    (unsigned long)&vring_avail_1,
+			.used_user_addr = i == 0 ? (unsigned long)&vring_used_0 :
+						   (unsigned long)&vring_used_1,
+			/* GPA addr */
+			.log_guest_addr = i == 0 ?
(unsigned long)&vring_used_0 : + (unsigned long)&vring_used_1, + }; + struct vhost_vring_state state = { + .index = i, + .num = VHOST_NDESCS, + }; + struct vhost_vring_file file = { + .index = i, + }; + + rc = ioctl(vhost_fd, VHOST_SET_VRING_NUM, &state); + if (rc < 0) + die_perror( + "VHOST_SET_VRING_NUM ioctl on /dev/vhost-net failed"); + + fprintf(stderr, "qid: %d\n", i); + fprintf(stderr, "vhost desc addr: 0x%llx\n", addr.desc_user_addr); + fprintf(stderr, "vhost avail addr: 0x%llx\n", addr.avail_user_addr); + fprintf(stderr, "vhost used addr: 0x%llx\n", addr.used_user_addr); + rc = ioctl(vhost_fd, VHOST_SET_VRING_ADDR, &addr); + if (rc < 0) + die_perror( + "VHOST_SET_VRING_ADDR ioctl on /dev/vhost-net failed"); + + file.fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC); + if (file.fd < 0) + die_perror("Failed to create kick eventfd"); + rc = ioctl(vhost_fd, VHOST_SET_VRING_KICK, &file); + if (rc < 0) + die_perror( + "VHOST_SET_VRING_KICK ioctl on /dev/vhost-net failed"); + c->vq[i].kick_fd = file.fd; + } + rc = ioctl(fd, (int)TUNSETIFF, &ifr); if (rc < 0) die_perror("TUNSETIFF ioctl on /dev/net/tun failed"); @@ -1390,7 +1686,25 @@ static int tap_ns_tun(void *arg) if (!(c->pasta_ifi = if_nametoindex(c->pasta_ifn))) die("Tap device opened but no network interface found"); + rx_pkt_refill(c->vq[0].kick_fd); + + /* Duplicating foreach queue to follow the exact order from QEMU */ + for (int i = 0; i < ARRAY_SIZE(c->vq); i++) { + struct vhost_vring_file file = { + .index = i, + .fd = fd, + }; + + fprintf(stderr, "qid: %d\n", file.index); + fprintf(stderr, "tap fd: %d\n", file.fd); + rc = ioctl(vhost_fd, VHOST_NET_SET_BACKEND, &file); + if (rc < 0) + die_perror( + "VHOST_NET_SET_BACKEND ioctl on /dev/vhost-net failed"); + } + c->fd_tap = fd; + c->fd_vhost = vhost_fd; return 0; } diff --git a/tap.h b/tap.h index 0b5ad17..91d3f62 100644 --- a/tap.h +++ b/tap.h @@ -69,6 +69,8 @@ static inline void tap_hdr_update(struct tap_hdr *thdr, size_t l2len) thdr->vnet_len = 
htonl(l2len); } +void vhost_call_cb(struct ctx *c, union epoll_ref ref, const struct timespec *now); + unsigned long tap_l2_max_len(const struct ctx *c); void *tap_push_l2h(const struct ctx *c, void *buf, uint16_t proto); void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src, -- 2.49.0