On Fri, Jun 21, 2024 at 04:56:37PM +0200, Laurent Vivier wrote:Add virtio.c and virtio.h that define the functions needed to manage virtqueues. Signed-off-by: Laurent Vivier <lvivier(a)redhat.com> --- Makefile | 4 +- util.h | 11 ++ virtio.c | 446 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ virtio.h | 123 +++++++++++++++ 4 files changed, 582 insertions(+), 2 deletions(-) create mode 100644 virtio.c create mode 100644 virtio.h diff --git a/Makefile b/Makefile index 09fc461d087e..39613a7cf1f2 100644 --- a/Makefile +++ b/Makefile @@ -47,7 +47,7 @@ FLAGS += -DDUAL_STACK_SOCKETS=$(DUAL_STACK_SOCKETS) PASST_SRCS = arch.c arp.c checksum.c conf.c dhcp.c dhcpv6.c flow.c fwd.c \ icmp.c igmp.c inany.c iov.c ip.c isolation.c lineread.c log.c mld.c \ ndp.c netlink.c packet.c passt.c pasta.c pcap.c pif.c tap.c tcp.c \ - tcp_buf.c tcp_splice.c udp.c util.c + tcp_buf.c tcp_splice.c udp.c util.c virtio.c QRAP_SRCS = qrap.c SRCS = $(PASST_SRCS) $(QRAP_SRCS) @@ -57,7 +57,7 @@ PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h flow.h fwd.h \ flow_table.h icmp.h icmp_flow.h inany.h iov.h ip.h isolation.h \ lineread.h log.h ndp.h netlink.h packet.h passt.h pasta.h pcap.h pif.h \ siphash.h tap.h tcp.h tcp_buf.h tcp_conn.h tcp_internal.h tcp_splice.h \ - udp.h util.h + udp.h util.h virtio.h HEADERS = $(PASST_HEADERS) seccomp.h C := \#include <linux/tcp.h>\nstruct tcp_info x = { .tcpi_snd_wnd = 0 }; diff --git a/util.h b/util.h index eebb027be487..56c4e2e7b4fe 100644 --- a/util.h +++ b/util.h @@ -48,6 +48,9 @@ #define ROUND_DOWN(x, y) ((x) & ~((y) - 1)) #define ROUND_UP(x, y) (((x) + (y) - 1) & ~((y) - 1)) +#define ALIGN_DOWN(n, m) ((n) / (m) * (m)) +#define ALIGN_UP(n, m) ALIGN_DOWN((n) + (m) - 1, (m)) + #define MAX_FROM_BITS(n) (((1U << (n)) - 1)) #define BIT(n) (1UL << (n)) @@ -116,6 +119,14 @@ #define htonl_constant(x) (__bswap_constant_32(x)) #endif +static inline void barrier(void) { __asm__ __volatile__("" ::: "memory"); } +#define smp_mb() do { barrier(); 
__atomic_thread_fence(__ATOMIC_SEQ_CST); } while (0) +#define smp_mb_release() do { barrier(); __atomic_thread_fence(__ATOMIC_RELEASE); } while (0) +#define smp_mb_acquire() do { barrier(); __atomic_thread_fence(__ATOMIC_ACQUIRE); } while (0) + +#define smp_wmb() smp_mb_release() +#define smp_rmb() smp_mb_acquire() + #define NS_FN_STACK_SIZE (RLIMIT_STACK_VAL * 1024 / 8) int do_clone(int (*fn)(void *), char *stack_area, size_t stack_size, int flags, void *arg); diff --git a/virtio.c b/virtio.c new file mode 100644 index 000000000000..50ec8b5119ed --- /dev/null +++ b/virtio.c @@ -0,0 +1,446 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +
Needs an actual "Copyright" invocation as well as the SPDX stuff. Which, yes, is a bit fiddly given that it's largely taken from qemu.
+/* some parts copied from QEMU subprojects/libvhost-user/libvhost-user.c */
So, there are obvious stylistic differences between this and the rest of the passt code for that reason. As I think I said on an earlier draft, I think we need to go fully one way or the other: either a) rewrite this entirely in passt style or b) change the whole thing so little that it's trivial to pull in new versions from qemu. This seems to be somewhere in the middle.
+ +#include <stddef.h> +#include <endian.h> +#include <string.h> +#include <errno.h> +#include <sys/eventfd.h> +#include <sys/socket.h> + +#include "util.h" +#include "virtio.h" + +#define VIRTQUEUE_MAX_SIZE 1024 + +/* Translate guest physical address to our virtual address. */ +static void *vu_gpa_to_va(VuDev *dev, uint64_t *plen, uint64_t guest_addr) +{ + unsigned int i; + + if (*plen == 0) + return NULL; + + /* Find matching memory region. 
*/ + for (i = 0; i < dev->nregions; i++) { + const VuDevRegion *r = &dev->regions[i]; + + if ((guest_addr >= r->gpa) && (guest_addr < (r->gpa + r->size))) { + if ((guest_addr + *plen) > (r->gpa + r->size)) + *plen = r->gpa + r->size - guest_addr; + /* NOLINTNEXTLINE(performance-no-int-to-ptr) */ + return (void *)(guest_addr - r->gpa + r->mmap_addr + r->mmap_offset); + } + } + + return NULL; +} + +static inline uint16_t vring_avail_flags(const VuVirtq *vq) +{ + return le16toh(vq->vring.avail->flags); +} + +static inline uint16_t vring_avail_idx(VuVirtq *vq) +{ + vq->shadow_avail_idx = le16toh(vq->vring.avail->idx); + + return vq->shadow_avail_idx; +} + +static inline uint16_t vring_avail_ring(const VuVirtq *vq, int i) +{ + return le16toh(vq->vring.avail->ring[i]); +} + +static inline uint16_t vring_get_used_event(const VuVirtq *vq) +{ + return vring_avail_ring(vq, vq->vring.num); +} + +static bool virtqueue_get_head(VuDev *dev, const VuVirtq *vq, + unsigned int idx, unsigned int *head) +{ + /* Grab the next descriptor number they're advertising, and increment + * the index we've seen. + */ + *head = vring_avail_ring(vq, idx % vq->vring.num); + + /* If their number is silly, that's a fatal mistake. 
*/ + if (*head >= vq->vring.num) { + vu_panic(dev, "Guest says index %u is available", *head); + return false; + } + + return true; +} + +static int +virtqueue_read_indirect_desc(VuDev *dev, struct vring_desc *desc, + uint64_t addr, size_t len) +{ + uint64_t read_len; + + if (len > (VIRTQUEUE_MAX_SIZE * sizeof(struct vring_desc))) + return -1; + + if (len == 0) + return -1; + + while (len) { + const struct vring_desc *ori_desc; + + read_len = len; + ori_desc = vu_gpa_to_va(dev, &read_len, addr); + if (!ori_desc) + return -1; + + memcpy(desc, ori_desc, read_len); + len -= read_len; + addr += read_len; + desc += read_len; + } + + return 0; +} + +enum { + VIRTQUEUE_READ_DESC_ERROR = -1, + VIRTQUEUE_READ_DESC_DONE = 0, /* end of chain */ + VIRTQUEUE_READ_DESC_MORE = 1, /* more buffers in chain */ +}; + +static int +virtqueue_read_next_desc(VuDev *dev, const struct vring_desc *desc, + int i, unsigned int max, unsigned int *next) +{ + /* If this descriptor says it doesn't chain, we're done. */ + if (!(le16toh(desc[i].flags) & VRING_DESC_F_NEXT)) + return VIRTQUEUE_READ_DESC_DONE; + + /* Check they're not leading us off end of descriptors. */ + *next = le16toh(desc[i].next); + /* Make sure compiler knows to grab that: we don't want it changing! */ + smp_wmb(); + + if (*next >= max) { + vu_panic(dev, "Desc next is %u", *next); + return VIRTQUEUE_READ_DESC_ERROR; + } + + return VIRTQUEUE_READ_DESC_MORE; +} + +bool vu_queue_empty(const VuDev *dev, VuVirtq *vq) +{ + if (dev->broken || + !vq->vring.avail) { + return true; + } + + if (vq->shadow_avail_idx != vq->last_avail_idx) + return false; + + return vring_avail_idx(vq) == vq->last_avail_idx; +} + +static bool vring_notify(const VuDev *dev, VuVirtq *vq) +{ + uint16_t old, new; + bool v; + + /* We need to expose used array entries before checking used event. 
*/ + smp_mb(); + + /* Always notify when queue is empty (when feature acknowledge) */ + if (vu_has_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY) && + !vq->inuse && vu_queue_empty(dev, vq)) { + return true; + } + + if (!vu_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) + return !(vring_avail_flags(vq) & VRING_AVAIL_F_NO_INTERRUPT); + + v = vq->signalled_used_valid; + vq->signalled_used_valid = true; + old = vq->signalled_used; + new = vq->signalled_used = vq->used_idx; + return !v || vring_need_event(vring_get_used_event(vq), new, old); +} + +/* cppcheck-suppress unusedFunction */ +void vu_queue_notify(VuDev *dev, VuVirtq *vq) +{ + if (dev->broken || !vq->vring.avail) + return; + + if (!vring_notify(dev, vq)) { + debug("skipped notify..."); + return; + } + + if (eventfd_write(vq->call_fd, 1) < 0) + vu_panic(dev, "Error writing eventfd: %s", strerror(errno)); +} + +static inline void vring_set_avail_event(VuVirtq *vq, uint16_t val) +{ + uint16_t val_le = htole16(val); + + if (!vq->notification) + return; + + memcpy(&vq->vring.used->ring[vq->vring.num], &val_le, sizeof(uint16_t)); +} + +static bool virtqueue_map_desc(VuDev *dev, + unsigned int *p_num_sg, struct iovec *iov, + unsigned int max_num_sg, + uint64_t pa, size_t sz) +{ + unsigned int num_sg = *p_num_sg; + + ASSERT(num_sg <= max_num_sg); + + if (!sz) { + vu_panic(dev, "virtio: zero sized buffers are not allowed"); + return false; + } + + while (sz) { + uint64_t len = sz; + + if (num_sg == max_num_sg) { + vu_panic(dev, "virtio: too many descriptors in indirect table"); + return false; + } + + iov[num_sg].iov_base = vu_gpa_to_va(dev, &len, pa); + if (iov[num_sg].iov_base == NULL) { + vu_panic(dev, "virtio: invalid address for buffers"); + return false; + } + iov[num_sg].iov_len = len; + num_sg++; + sz -= len; + pa += len; + } + + *p_num_sg = num_sg; + return true; +} + +static int +vu_queue_map_desc(VuDev *dev, VuVirtq *vq, unsigned int idx, + VuVirtqElement *elem) +{ + const struct vring_desc *desc = vq->vring.desc; + 
struct vring_desc desc_buf[VIRTQUEUE_MAX_SIZE]; + unsigned int out_num = 0, in_num = 0; + unsigned int max = vq->vring.num; + unsigned int i = idx; + uint64_t read_len; + int rc; + + if (le16toh(desc[i].flags) & VRING_DESC_F_INDIRECT) { + unsigned int desc_len; + uint64_t desc_addr; + + if (le32toh(desc[i].len) % sizeof(struct vring_desc)) { + vu_panic(dev, "Invalid size for indirect buffer table"); + return -1; + } + + /* loop over the indirect descriptor table */ + desc_addr = le64toh(desc[i].addr); + desc_len = le32toh(desc[i].len); + max = desc_len / sizeof(struct vring_desc); + read_len = desc_len; + desc = vu_gpa_to_va(dev, &read_len, desc_addr); + if (desc && read_len != desc_len) { + /* Failed to use zero copy */ + desc = NULL; + if (!virtqueue_read_indirect_desc(dev, desc_buf, desc_addr, desc_len)) + desc = desc_buf; + } + if (!desc) { + vu_panic(dev, "Invalid indirect buffer table"); + return -1; + } + i = 0; + } + + /* Collect all the descriptors */ + do { + if (le16toh(desc[i].flags) & VRING_DESC_F_WRITE) { + if (!virtqueue_map_desc(dev, &in_num, elem->in_sg, + elem->in_num, + le64toh(desc[i].addr), + le32toh(desc[i].len))) { + return -1; + } + } else { + if (in_num) { + vu_panic(dev, "Incorrect order for descriptors"); + return -1; + } + if (!virtqueue_map_desc(dev, &out_num, elem->out_sg, + elem->out_num, + le64toh(desc[i].addr), + le32toh(desc[i].len))) { + return -1; + } + } + + /* If we've got too many, that implies a descriptor loop. 
*/ + if ((in_num + out_num) > max) { + vu_panic(dev, "Looped descriptor"); + return -1; + } + rc = virtqueue_read_next_desc(dev, desc, i, max, &i); + } while (rc == VIRTQUEUE_READ_DESC_MORE); + + if (rc == VIRTQUEUE_READ_DESC_ERROR) { + vu_panic(dev, "read descriptor error"); + return -1; + } + + elem->index = idx; + elem->in_num = in_num; + elem->out_num = out_num; + + return 0; +} + +/* cppcheck-suppress unusedFunction */ +int vu_queue_pop(VuDev *dev, VuVirtq *vq, VuVirtqElement *elem) +{ + unsigned int head; + int ret; + + if (dev->broken || !vq->vring.avail) + return -1; + + if (vu_queue_empty(dev, vq)) + return -1; + + /* + * Needed after virtio_queue_empty(), see comment in + * virtqueue_num_heads(). + */ + smp_rmb(); + + if (vq->inuse >= vq->vring.num) { + vu_panic(dev, "Virtqueue size exceeded"); + return -1; + } + + if (!virtqueue_get_head(dev, vq, vq->last_avail_idx++, &head)) + return -1; + + if (vu_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) + vring_set_avail_event(vq, vq->last_avail_idx); + + ret = vu_queue_map_desc(dev, vq, head, elem); + + if (ret < 0) + return ret; + + vq->inuse++; + + return 0; +} + +void vu_queue_detach_element(VuDev *dev, VuVirtq *vq, + unsigned int index, size_t len) +{ + (void)dev; + (void)index; + (void)len; + + vq->inuse--; + /* unmap, when DMA support is added */ +} + +/* cppcheck-suppress unusedFunction */ +void vu_queue_unpop(VuDev *dev, VuVirtq *vq, unsigned int index, size_t len) +{ + vq->last_avail_idx--; + vu_queue_detach_element(dev, vq, index, len); +} + +/* cppcheck-suppress unusedFunction */ +bool vu_queue_rewind(VuDev *dev, VuVirtq *vq, unsigned int num) +{ + (void)dev; + if (num > vq->inuse) + return false; + + vq->last_avail_idx -= num; + vq->inuse -= num; + return true; +} + +static inline void vring_used_write(VuVirtq *vq, + const struct vring_used_elem *uelem, int i) +{ + struct vring_used *used = vq->vring.used; + + used->ring[i] = *uelem; +} + +void vu_queue_fill_by_index(const VuDev *dev, VuVirtq *vq, 
unsigned int index, + unsigned int len, unsigned int idx) +{ + struct vring_used_elem uelem; + + if (dev->broken || !vq->vring.avail) + return; + + idx = (idx + vq->used_idx) % vq->vring.num; + + uelem.id = htole32(index); + uelem.len = htole32(len); + vring_used_write(vq, &uelem, idx); +} + +void vu_queue_fill(const VuDev *dev, VuVirtq *vq, const VuVirtqElement *elem, + unsigned int len, unsigned int idx) +{ + vu_queue_fill_by_index(dev, vq, elem->index, len, idx); +} + +static inline void vring_used_idx_set(VuVirtq *vq, uint16_t val) +{ + vq->vring.used->idx = htole16(val); + + vq->used_idx = val; +} + +void vu_queue_flush(const VuDev *dev, VuVirtq *vq, unsigned int count) +{ + uint16_t old, new; + + if (dev->broken || !vq->vring.avail) + return; + + /* Make sure buffer is written before we update index. */ + smp_wmb(); + + old = vq->used_idx; + new = old + count; + vring_used_idx_set(vq, new); + vq->inuse -= count; + if ((int16_t)(new - vq->signalled_used) < (uint16_t)(new - old)) + vq->signalled_used_valid = false; +} + +/* cppcheck-suppress unusedFunction */ +void vu_queue_push(const VuDev *dev, VuVirtq *vq, + const VuVirtqElement *elem, unsigned int len) +{ + vu_queue_fill(dev, vq, elem, len, 0); + vu_queue_flush(dev, vq, 1); +} + diff --git a/virtio.h b/virtio.h new file mode 100644 index 000000000000..61398bb432bc --- /dev/null +++ b/virtio.h @@ -0,0 +1,123 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +// +/* come parts copied from QEMU subprojects/libvhost-user/libvhost-user.h */ + +#ifndef VIRTIO_H +#define VIRTIO_H + +#include <stdbool.h> +#include <linux/vhost_types.h> + +#define VIRTQUEUE_MAX_SIZE 1024 + +#define vu_panic(vdev, ...) \ + do { \ + (vdev)->broken = true; \ + err( __VA_ARGS__ ); \
Wouldn't it be simpler to just use die() in place of vu_panic()? 
This is trying to keep the program running even if the vu device is broken, but if our channel to the guest is broken, I don't think passt is really worth saving.
+ } while (0) + +typedef struct VuRing { + unsigned int num; + struct vring_desc *desc; + struct vring_avail *avail; + struct vring_used *used; + uint64_t log_guest_addr; + uint32_t flags; +} VuRing; + +typedef struct VuVirtq { + VuRing vring; + + /* Next head to pop */ + uint16_t last_avail_idx; + + /* Last avail_idx read from VQ. */ + uint16_t shadow_avail_idx; + + uint16_t used_idx; + + /* Last used index value we have signalled on */ + uint16_t signalled_used; + + /* Last used index value we have signalled on */ + bool signalled_used_valid; + + bool notification; + + unsigned int inuse; + + int call_fd; + int kick_fd; + int err_fd; + unsigned int enable; + bool started; + + /* Guest addresses of our ring */ + struct vhost_vring_addr vra; +} VuVirtq; + +typedef struct VuDevRegion { + uint64_t gpa; + uint64_t size; + uint64_t qva; + uint64_t mmap_offset; + uint64_t mmap_addr; +} VuDevRegion; + +#define VHOST_USER_MAX_QUEUES 2 + +/* + * Set a reasonable maximum number of ram slots, which will be supported by + * any architecture. 
+ */ +#define VHOST_USER_MAX_RAM_SLOTS 32 + +typedef struct VuDev { + uint32_t nregions; + VuDevRegion regions[VHOST_USER_MAX_RAM_SLOTS]; + VuVirtq vq[VHOST_USER_MAX_QUEUES]; + uint64_t features; + uint64_t protocol_features; + bool broken; + int hdrlen; +} VuDev; + +typedef struct VuVirtqElement { + unsigned int index; + unsigned int out_num; + unsigned int in_num; + struct iovec *in_sg; + struct iovec *out_sg; +} VuVirtqElement; + +static inline bool has_feature(uint64_t features, unsigned int fbit) +{ + return !!(features & (1ULL << fbit)); +} + +static inline bool vu_has_feature(const VuDev *vdev, unsigned int fbit) +{ + return has_feature(vdev->features, fbit); +} + +/* cppcheck-suppress unusedFunction */ +static inline bool vu_has_protocol_feature(const VuDev *vdev, unsigned int fbit) +{ + return has_feature(vdev->protocol_features, fbit); +} + +bool vu_queue_empty(const VuDev *dev, VuVirtq *vq); +void vu_queue_notify(VuDev *dev, VuVirtq *vq); +int vu_queue_pop(VuDev *dev, VuVirtq *vq, VuVirtqElement *elem); +void vu_queue_detach_element(VuDev *dev, VuVirtq *vq, unsigned int index, size_t len); +void vu_queue_unpop(VuDev *dev, VuVirtq *vq, unsigned int index, size_t len); +bool vu_queue_rewind(VuDev *dev, VuVirtq *vq, unsigned int num); + +void vu_queue_fill_by_index(const VuDev *dev, VuVirtq *vq, unsigned int index, + unsigned int len, unsigned int idx); +void vu_queue_fill(const VuDev *dev, VuVirtq *vq, const VuVirtqElement *elem, + unsigned int len, unsigned int idx); +void vu_queue_flush(const VuDev *dev, VuVirtq *vq, unsigned int count); +void vu_queue_push(const VuDev *dev, VuVirtq *vq, const VuVirtqElement *elem, + unsigned int len); +#endif /* VIRTIO_H */-- David Gibson (he or they) | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you, not the other way | around. http://www.ozlabs.org/~dgibson