Nothing enables vhost yet; this patch only defines the functions.
The use of the virtqueue is similar to the rx case: pasta fills the descriptor
table with the packet data it wants to send to the namespace. Each
descriptor points to a buffer in memory, with an address and a length.
The number of descriptors is again defined by VHOST_NDESCS.
Afterwards it writes the descriptor index into the avail->ring[] array,
then increments avail->idx to make it visible to the kernel, then kicks
the virtqueue 1 event fd.
When the kernel does not need the buffer anymore it writes its id into
the used_ring->ring[], and increments used_ring->idx. Normally, the
kernel also notifies pasta through the call eventfd of virtqueue 1.
But we don't monitor that eventfd. Instead, we only check whether the
buffers can be reused at the time we produce, which makes the code simpler
and more performant.
Like on the rx path, we assume descriptors are used in the same order
they were made available. This is also consistent with behavior seen in
QEMU's virtio-net implementation.
Signed-off-by: Eugenio Pérez
---
arp.c | 2 +-
tap.c | 84 +++++++++++++++++++++++++++++++++++++++++++++++--------
tap.h | 4 +--
tcp.c | 2 +-
tcp_buf.c | 2 +-
udp.c | 2 +-
6 files changed, 79 insertions(+), 17 deletions(-)
diff --git a/arp.c b/arp.c
index fc482bb..ea786a0 100644
--- a/arp.c
+++ b/arp.c
@@ -80,7 +80,7 @@ int arp(const struct ctx *c, const struct pool *p)
memcpy(eh->h_dest, eh->h_source, sizeof(eh->h_dest));
memcpy(eh->h_source, c->our_tap_mac, sizeof(eh->h_source));
- tap_send_single(c, eh, l2len);
+ tap_send_single(c, eh, l2len, false);
return 1;
}
diff --git a/tap.c b/tap.c
index 5667fbe..7ccac86 100644
--- a/tap.c
+++ b/tap.c
@@ -121,11 +121,19 @@ static PACKET_POOL_NOINIT(pool_tap6, TAP_MSGS_IP6, pkt_buf);
static_assert(!(VHOST_NDESCS & (VHOST_NDESCS - 1)),
"Number of vhost descs must be a power of two by standard");
static struct {
+ /* Descriptor index we're using. This is not the same as avail idx in
+ * split: this takes into account the chained descs */
+ uint16_t vring_idx;
+
/* Number of free descriptors */
uint16_t num_free;
/* Last used idx processed */
uint16_t last_used_idx;
+
+ /* Descriptors in use */
+ /* desc info: number of descriptors in the chain */
+ uint16_t ndescs[VHOST_NDESCS];
} vqs[2];
static struct vring_desc vring_desc[2][VHOST_NDESCS] __attribute__((aligned(PAGE_SIZE)));
@@ -176,7 +184,7 @@ unsigned long tap_l2_max_len(const struct ctx *c)
* @data: Packet buffer
* @l2len: Total L2 packet length
*/
-void tap_send_single(const struct ctx *c, const void *data, size_t l2len)
+void tap_send_single(const struct ctx *c, const void *data, size_t l2len, bool vhost)
{
uint32_t vnet_len = htonl(l2len);
struct iovec iov[2];
@@ -192,7 +200,7 @@ void tap_send_single(const struct ctx *c, const void *data, size_t l2len)
iov[iovcnt].iov_len = l2len;
iovcnt++;
- tap_send_frames(c, iov, iovcnt, 1);
+ tap_send_frames(c, iov, iovcnt, 1, vhost);
break;
case MODE_VU:
vu_send_single(c, data, l2len);
@@ -314,7 +322,7 @@ void tap_udp4_send(const struct ctx *c, struct in_addr src, in_port_t sport,
char *data = tap_push_uh4(uh, src, sport, dst, dport, in, dlen);
memcpy(data, in, dlen);
- tap_send_single(c, buf, dlen + (data - buf));
+ tap_send_single(c, buf, dlen + (data - buf), false);
}
/**
@@ -336,7 +344,7 @@ void tap_icmp4_send(const struct ctx *c, struct in_addr src, struct in_addr dst,
memcpy(icmp4h, in, l4len);
csum_icmp4(icmp4h, icmp4h + 1, l4len - sizeof(*icmp4h));
- tap_send_single(c, buf, l4len + ((char *)icmp4h - buf));
+ tap_send_single(c, buf, l4len + ((char *)icmp4h - buf), false);
}
/**
@@ -421,7 +429,7 @@ void tap_udp6_send(const struct ctx *c,
char *data = tap_push_uh6(uh, src, sport, dst, dport, in, dlen);
memcpy(data, in, dlen);
- tap_send_single(c, buf, dlen + (data - buf));
+ tap_send_single(c, buf, dlen + (data - buf), false);
}
/**
@@ -444,7 +452,7 @@ void tap_icmp6_send(const struct ctx *c,
memcpy(icmp6h, in, l4len);
csum_icmp6(icmp6h, src, dst, icmp6h + 1, l4len - sizeof(*icmp6h));
- tap_send_single(c, buf, l4len + ((char *)icmp6h - buf));
+ tap_send_single(c, buf, l4len + ((char *)icmp6h - buf), false);
}
static void vhost_kick(struct vring_used *used, int kick_fd) {
@@ -459,8 +467,9 @@ static void vhost_kick(struct vring_used *used, int kick_fd) {
eventfd_write(kick_fd, 1);
}
+
/**
- * tap_send_frames_pasta() - Send multiple frames to the pasta tap
+ * tap_send_frames_vhost() - Queue multiple frames to the namespace via vhost-kernel
* @c: Execution context
* @iov: Array of buffers
* @bufs_per_frame: Number of buffers (iovec entries) per frame
@@ -470,16 +479,68 @@ static void vhost_kick(struct vring_used *used, int kick_fd) {
* @bufs_per_frame contiguous buffers representing a single frame.
*
* Return: number of frames successfully sent
+ */
+static size_t tap_send_frames_vhost(const struct ctx *c,
+				    const struct iovec *iov,
+				    size_t bufs_per_frame, size_t nframes)
+{
+	size_t i;
+
+	for (i = 0; i < nframes; i++) {
+		size_t j;
+
+		if (vqs[1].num_free < bufs_per_frame)
+			break;	/* ring full: still publish the frames queued so far */
+
+		vring_avail_1.avail.ring[(le16toh(vring_avail_1.avail.idx) + i) % VHOST_NDESCS] = htole16(vqs[1].vring_idx % VHOST_NDESCS);
+		vqs[1].ndescs[(le16toh(vring_avail_1.avail.idx) + i) % VHOST_NDESCS] = bufs_per_frame;
+		vqs[1].num_free -= bufs_per_frame;
+
+		for (j = 0; j < bufs_per_frame; ++j) {
+			struct vring_desc *desc = &vring_desc[1][vqs[1].vring_idx % VHOST_NDESCS];
+			const struct iovec *iov_i = &iov[i * bufs_per_frame + j];
+
+			desc->addr = htole64((uint64_t)(uintptr_t)iov_i->iov_base);
+			desc->len = htole32(iov_i->iov_len);
+			desc->flags = (j == bufs_per_frame - 1) ? 0 : htole16(VRING_DESC_F_NEXT);
+			vqs[1].vring_idx++;
+		}
+	}
+
+	smp_wmb();	/* descriptors and ring entries must be written before idx */
+	vring_avail_1.avail.idx = htole16(le16toh(vring_avail_1.avail.idx) + i);
+	if (i)		/* nothing new to announce if not even one frame fit */
+		vhost_kick(&vring_used_1.used, c->vq[1].kick_fd);
+
+	return i;	/* frames actually made available to the kernel */
+}
+
+
+/**
+ * tap_send_frames_pasta() - Send multiple frames to the pasta tap
+ * @c: Execution context
+ * @iov: Array of buffers
+ * @bufs_per_frame: Number of buffers (iovec entries) per frame
+ * @nframes: Number of frames to send
+ * @vhost: Use vhost-kernel or not
+ *
+ * @iov must have total length @bufs_per_frame * @nframes, with each set of
+ * @bufs_per_frame contiguous buffers representing a single frame.
+ *
+ * Return: number of frames successfully sent (or queued)
*
* #syscalls:pasta write
*/
static size_t tap_send_frames_pasta(const struct ctx *c,
const struct iovec *iov,
- size_t bufs_per_frame, size_t nframes)
+ size_t bufs_per_frame, size_t nframes, bool vhost)
{
size_t nbufs = bufs_per_frame * nframes;
size_t i;
+ if (vhost)
+ return tap_send_frames_vhost(c, iov, bufs_per_frame, nframes);
+
for (i = 0; i < nbufs; i += bufs_per_frame) {
ssize_t rc = writev(c->fd_tap, iov + i, bufs_per_frame);
size_t framelen = iov_size(iov + i, bufs_per_frame);
@@ -563,14 +624,15 @@ static size_t tap_send_frames_passt(const struct ctx *c,
* @iov: Array of buffers, each containing one frame (with L2 headers)
* @bufs_per_frame: Number of buffers (iovec entries) per frame
* @nframes: Number of frames to send
+ * @vhost: Use vhost-kernel or not
*
* @iov must have total length @bufs_per_frame * @nframes, with each set of
* @bufs_per_frame contiguous buffers representing a single frame.
*
- * Return: number of frames actually sent
+ * Return: number of frames actually sent (or queued)
*/
size_t tap_send_frames(const struct ctx *c, const struct iovec *iov,
- size_t bufs_per_frame, size_t nframes)
+ size_t bufs_per_frame, size_t nframes, bool vhost)
{
size_t m;
@@ -579,7 +641,7 @@ size_t tap_send_frames(const struct ctx *c, const struct iovec *iov,
switch (c->mode) {
case MODE_PASTA:
- m = tap_send_frames_pasta(c, iov, bufs_per_frame, nframes);
+ m = tap_send_frames_pasta(c, iov, bufs_per_frame, nframes, vhost);
break;
case MODE_PASST:
m = tap_send_frames_passt(c, iov, bufs_per_frame, nframes);
diff --git a/tap.h b/tap.h
index ff8cee5..e924dfb 100644
--- a/tap.h
+++ b/tap.h
@@ -111,9 +111,9 @@ void tap_udp6_send(const struct ctx *c,
void tap_icmp6_send(const struct ctx *c,
const struct in6_addr *src, const struct in6_addr *dst,
const void *in, size_t l4len);
-void tap_send_single(const struct ctx *c, const void *data, size_t l2len);
+void tap_send_single(const struct ctx *c, const void *data, size_t l2len, bool vhost);
size_t tap_send_frames(const struct ctx *c, const struct iovec *iov,
- size_t bufs_per_frame, size_t nframes);
+ size_t bufs_per_frame, size_t nframes, bool vhost);
void eth_update_mac(struct ethhdr *eh,
const unsigned char *eth_d, const unsigned char *eth_s);
void tap_listen_handler(struct ctx *c, uint32_t events);
diff --git a/tcp.c b/tcp.c
index f43c1e2..05f5b4c 100644
--- a/tcp.c
+++ b/tcp.c
@@ -1935,7 +1935,7 @@ static void tcp_rst_no_conn(const struct ctx *c, int af,
tcp_update_csum(psum, rsth, &payload);
rst_l2len = ((char *)rsth - buf) + sizeof(*rsth);
- tap_send_single(c, buf, rst_l2len);
+ tap_send_single(c, buf, rst_l2len, false);
}
/**
diff --git a/tcp_buf.c b/tcp_buf.c
index 6d79d67..242086d 100644
--- a/tcp_buf.c
+++ b/tcp_buf.c
@@ -141,7 +141,7 @@ void tcp_payload_flush(const struct ctx *c)
size_t m;
m = tap_send_frames(c, &tcp_l2_iov[0][0], TCP_NUM_IOVS,
- tcp_payload_used);
+ tcp_payload_used, false);
if (m != tcp_payload_used) {
tcp_revert_seq(c, &tcp_frame_conns[m], &tcp_l2_iov[m],
tcp_payload_used - m);
diff --git a/udp.c b/udp.c
index 65a52e0..d017d99 100644
--- a/udp.c
+++ b/udp.c
@@ -809,7 +809,7 @@ static void udp_buf_sock_to_tap(const struct ctx *c, int s, int n,
for (i = 0; i < n; i++)
udp_tap_prepare(udp_mh_recv, i, toside, false);
- tap_send_frames(c, &udp_l2_iov[0][0], UDP_NUM_IOVS, n);
+ tap_send_frames(c, &udp_l2_iov[0][0], UDP_NUM_IOVS, n, false);
}
/**
--
2.50.0