Re: [PATCH v3 3/4] vhost-user: introduce vhost-user API

27 Aug 2024


      On Mon, 26 Aug 2024 15:26:44 +1000
David Gibson  wrote:
...
On Thu, Aug 15, 2024 at 05:50:22PM +0200, Laurent Vivier wrote:
...
Add vhost_user.c and vhost_user.h that define the functions needed
to implement vhost-user backend.
[...]
+static int vu_message_read_default(int conn_fd, struct vhost_user_msg *vmsg)
+{
+	char control[CMSG_SPACE(VHOST_MEMORY_BASELINE_NREGIONS *
+		     sizeof(int))] = { 0 };
+	struct iovec iov = {
+		.iov_base = (char *)vmsg,
+		.iov_len = VHOST_USER_HDR_SIZE,
+	};
+	struct msghdr msg = {
+		.msg_iov = &iov,
+		.msg_iovlen = 1,
+		.msg_control = control,
+		.msg_controllen = sizeof(control),
+	};
+	ssize_t ret, sz_payload;
+	struct cmsghdr *cmsg;
+	size_t fd_size;
+
+	ret = recvmsg(conn_fd, &msg, MSG_DONTWAIT);
+	if (ret < 0) {
+		if (errno == EINTR || errno == EAGAIN || errno == EWOULDBLOCK)
+			return 0;
+		return -1;
+	}
+
+	vmsg->fd_num = 0;
+	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg != NULL;
+	     cmsg = CMSG_NXTHDR(&msg, cmsg)) {
+		if (cmsg->cmsg_level == SOL_SOCKET &&
+		    cmsg->cmsg_type == SCM_RIGHTS) {
+			fd_size = cmsg->cmsg_len - CMSG_LEN(0);
+			ASSERT(fd_size / sizeof(int) <=
+			       VHOST_MEMORY_BASELINE_NREGIONS);
IIUC, this could be tripped by a bug in the peer (qemu?) rather than
in our own code.  In which case I think a die() would be more
appropriate than an ASSERT().
Ah, right, it wouldn't be our issue... what about neither, so that we
don't crash if QEMU has an issue we could easily recover from?
...
...
[...]
+/**
+ * vu_set_mem_table_exec() - Sets the memory map regions to be able to
+ * 			     translate the vring addresses.
+ * @vdev:	Vhost-user device
+ * @vmsg:	Vhost-user message
+ *
+ * Return: false as no reply is requested
+ *
+ * #syscalls:vu mmap munmap
+ */
+static bool vu_set_mem_table_exec(struct vu_dev *vdev,
+				  struct vhost_user_msg *msg)
+{
+	struct vhost_user_memory m = msg->payload.memory, *memory = &m;
Is there a reason to take a copy of the message, rather than just
referencing into msg as passed?
...
+	unsigned int i;
+
+	for (i = 0; i < vdev->nregions; i++) {
+		struct vu_dev_region *r = &vdev->regions[i];
+		/* NOLINTNEXTLINE(performance-no-int-to-ptr) */
+		void *mm = (void *)r->mmap_addr;
+
+		if (mm)
+			munmap(mm, r->size + r->mmap_offset);
Do we actually ever need to change the mapping of the regions?  If not
we can avoid this unmapping loop.
...
+	}
+	vdev->nregions = memory->nregions;
+
+	debug("Nregions: %u", memory->nregions);
+	for (i = 0; i < vdev->nregions; i++) {
+		struct vhost_user_memory_region *msg_region = &memory->regions[i];
+		struct vu_dev_region *dev_region = &vdev->regions[i];
+		void *mmap_addr;
+
+		debug("Region %d", i);
+		debug("    guest_phys_addr: 0x%016"PRIx64,
+		      msg_region->guest_phys_addr);
+		debug("    memory_size:     0x%016"PRIx64,
+		      msg_region->memory_size);
+		debug("    userspace_addr   0x%016"PRIx64,
+		      msg_region->userspace_addr);
+		debug("    mmap_offset      0x%016"PRIx64,
+		      msg_region->mmap_offset);
+
+		dev_region->gpa = msg_region->guest_phys_addr;
+		dev_region->size = msg_region->memory_size;
+		dev_region->qva = msg_region->userspace_addr;
+		dev_region->mmap_offset = msg_region->mmap_offset;
+
+		/* We don't use offset argument of mmap() since the
+		 * mapped address has to be page aligned, and we use huge
+		 * pages.
We do what now?
We do madvise(pkt_buf, TAP_BUF_BYTES, MADV_HUGEPAGE) in main(), but
we're not using pkt_buf in this case, so I guess it's not relevant. I'm
not sure if _passt_ calling madvise(..., MADV_HUGEPAGE) on the memory
regions we get would have any effect, by the way.
...
...
[...]
+/**
+ * vu_send() - Send a buffer to the front-end using the RX virtqueue
+ * @vdev:	vhost-user device
+ * @buf:	address of the buffer
+ * @size:	size of the buffer
+ *
+ * Return: number of bytes sent, -1 if there is an error
+ */
+/* cppcheck-suppress unusedFunction */
+int vu_send(struct vu_dev *vdev, const void *buf, size_t size)
+{
+	struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
+	struct vu_virtq_element elem[VIRTQUEUE_MAX_SIZE];
+	struct iovec in_sg[VIRTQUEUE_MAX_SIZE];
+	size_t lens[VIRTQUEUE_MAX_SIZE];
+	__virtio16 *num_buffers_ptr = NULL;
+	size_t hdrlen = vdev->hdrlen;
+	int in_sg_count = 0;
+	size_t offset = 0;
+	int i = 0, j;
+
+	debug("vu_send size %zu hdrlen %zu", size, hdrlen);
+
+	if (!vu_queue_enabled(vq) || !vu_queue_started(vq)) {
+		err("Got packet, but no available descriptors on RX virtq.");
+		return 0;
+	}
+
+	while (offset < size) {
+		size_t len;
+		int total;
+		int ret;
+
+		total = 0;
+
+		if (i == ARRAY_SIZE(elem) ||
+		    in_sg_count == ARRAY_SIZE(in_sg)) {
+			err("virtio-net unexpected long buffer chain");
+			goto err;
+		}
+
+		elem[i].out_num = 0;
+		elem[i].out_sg = NULL;
+		elem[i].in_num = ARRAY_SIZE(in_sg) - in_sg_count;
+		elem[i].in_sg = &in_sg[in_sg_count];
+
+		ret = vu_queue_pop(vdev, vq, &elem[i]);
+		if (ret < 0) {
+			if (vu_wait_queue(vq) != -1)
+				continue;
+			if (i) {
+				err("virtio-net unexpected empty queue: "
+				    "i %d mergeable %d offset %zd, size %zd, "
+				    "features 0x%" PRIx64,
+				    i, vu_has_feature(vdev,
+						      VIRTIO_NET_F_MRG_RXBUF),
+				    offset, size, vdev->features);
+			}
+			offset = -1;
+			goto err;
+		}
+		in_sg_count += elem[i].in_num;
Initially I thought this would consume the entire in_sg array on the
first loop iteration, but I guess vu_queue_pop() reduces in_num from
the value we initialise above.
...
+		if (elem[i].in_num < 1) {
I realise it doesn't really matter in this context, but it makes more
sense to me for this check to go _before_ we use in_num to update
in_sg_cuont.
...
+			err("virtio-net receive queue contains no in buffers");
+			vu_queue_detach_element(vq);
+			offset = -1;
+			goto err;
+		}
+
+		if (i == 0) {
+			struct virtio_net_hdr hdr = {
+				.flags = VIRTIO_NET_HDR_F_DATA_VALID,
+				.gso_type = VIRTIO_NET_HDR_GSO_NONE,
+			};
+
+			ASSERT(offset == 0);
+			ASSERT(elem[i].in_sg[0].iov_len >= hdrlen);
Is this necessarily our bug, or could it be cause by the peer giving
unreasonably small buffers?  If the latter, then a die() would make
more sense.
...same here.

-- 
Stefano

Re: [PATCH v3 3/4] vhost-user: introduce vhost-user API

Stefano Brivio