On Tue, Jan 28, 2025 at 07:50:01AM +0100, Stefano Brivio wrote:
On Tue, 28 Jan 2025 12:40:12 +1100
David Gibson <david(a)gibson.dropbear.id.au> wrote:
On Tue, Jan 28, 2025 at 12:15:31AM +0100, Stefano
Brivio wrote:
Add two sets (source or target) of three
functions each for passt in
vhost-user mode, triggered by activity on the file descriptor passed
via VHOST_USER_PROTOCOL_F_DEVICE_STATE:
- migrate_source_pre() and migrate_target_pre() are called to prepare
for migration, before data is transferred
- migrate_source() sends, and migrate_target() receives migration data
- migrate_source_post() and migrate_target_post() are responsible for
any post-migration task
Callbacks are added to these functions with arrays of function
pointers in migrate.c. Migration handlers are versioned.
Versioned descriptions of data sections will be added to the
data_versions array, which points to versioned iovec arrays. Version
1 is currently empty and will be filled in in subsequent patches.
The source announces the data version to be used and informs the peer
about endianness, and the size of void *, time_t, flow entries and
flow hash table entries.
The target checks if the version of the source is still supported. If
it's not, it aborts the migration.
Signed-off-by: Stefano Brivio <sbrivio(a)redhat.com>
---
Makefile | 12 +--
migrate.c | 259 ++++++++++++++++++++++++++++++++++++++++++++++++++++
migrate.h | 90 ++++++++++++++++++
passt.c | 2 +-
vu_common.c | 122 ++++++++++++++++---------
vu_common.h | 2 +-
6 files changed, 438 insertions(+), 49 deletions(-)
create mode 100644 migrate.c
create mode 100644 migrate.h
diff --git a/Makefile b/Makefile
index 464eef1..1383875 100644
--- a/Makefile
+++ b/Makefile
@@ -38,8 +38,8 @@ FLAGS += -DDUAL_STACK_SOCKETS=$(DUAL_STACK_SOCKETS)
PASST_SRCS = arch.c arp.c checksum.c conf.c dhcp.c dhcpv6.c flow.c fwd.c \
icmp.c igmp.c inany.c iov.c ip.c isolation.c lineread.c log.c mld.c \
- ndp.c netlink.c packet.c passt.c pasta.c pcap.c pif.c tap.c tcp.c \
- tcp_buf.c tcp_splice.c tcp_vu.c udp.c udp_flow.c udp_vu.c util.c \
+ ndp.c netlink.c migrate.c packet.c passt.c pasta.c pcap.c pif.c tap.c \
+ tcp.c tcp_buf.c tcp_splice.c tcp_vu.c udp.c udp_flow.c udp_vu.c util.c \
vhost_user.c virtio.c vu_common.c
QRAP_SRCS = qrap.c
SRCS = $(PASST_SRCS) $(QRAP_SRCS)
@@ -48,10 +48,10 @@ MANPAGES = passt.1 pasta.1 qrap.1
PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h flow.h fwd.h \
flow_table.h icmp.h icmp_flow.h inany.h iov.h ip.h isolation.h \
- lineread.h log.h ndp.h netlink.h packet.h passt.h pasta.h pcap.h pif.h \
- siphash.h tap.h tcp.h tcp_buf.h tcp_conn.h tcp_internal.h tcp_splice.h \
- tcp_vu.h udp.h udp_flow.h udp_internal.h udp_vu.h util.h vhost_user.h \
- virtio.h vu_common.h
+ lineread.h log.h migrate.h ndp.h netlink.h packet.h passt.h pasta.h \
+ pcap.h pif.h siphash.h tap.h tcp.h tcp_buf.h tcp_conn.h tcp_internal.h \
+ tcp_splice.h tcp_vu.h udp.h udp_flow.h udp_internal.h udp_vu.h util.h \
+ vhost_user.h virtio.h vu_common.h
HEADERS = $(PASST_HEADERS) seccomp.h
C := \#include <sys/random.h>\nint main(){int a=getrandom(0, 0, 0);}
diff --git a/migrate.c b/migrate.c
new file mode 100644
index 0000000..bee9653
--- /dev/null
+++ b/migrate.c
@@ -0,0 +1,259 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+/* PASST - Plug A Simple Socket Transport
+ * for qemu/UNIX domain socket mode
+ *
+ * PASTA - Pack A Subtle Tap Abstraction
+ * for network namespace/tap device mode
+ *
+ * migrate.c - Migration sections, layout, and routines
+ *
+ * Copyright (c) 2025 Red Hat GmbH
+ * Author: Stefano Brivio <sbrivio(a)redhat.com>
+ */
+
+#include <errno.h>
+#include <sys/uio.h>
+
+#include "util.h"
+#include "ip.h"
+#include "passt.h"
+#include "inany.h"
+#include "flow.h"
+#include "flow_table.h"
+
+#include "migrate.h"
+
+/* Current version of migration data */
+#define MIGRATE_VERSION 1
+
+/* Magic as we see it and as seen with reverse endianness */
+#define MIGRATE_MAGIC 0xB1BB1D1B0BB1D1B0
+#define MIGRATE_MAGIC_SWAPPED 0xB0D1B10B1B1DBBB1
As noted, I'm hoping we can get rid of "either endian" migration. But
if this stays, we should define it using __bswap_constant_32() to
avoid embarrassing mistakes.
Those always give me issues on musl,
What sort of issues? We're already using them, and have fallback
versions defined in util.h
so I'd rather test things on
big-endian and realise it's actually 0xB0D1B1B01B1DBBB1 (0x0b bitswap).
Feel free to post a different proposal if tested.
+
+/* Migration header to send from source */
+static union migrate_header header = {
+ .magic = MIGRATE_MAGIC,
+ .version = htonl_constant(MIGRATE_VERSION),
+ .time_t_size = htonl_constant(sizeof(time_t)),
+ .flow_size = htonl_constant(sizeof(union flow)),
+ .flow_sidx_size = htonl_constant(sizeof(struct flow_sidx)),
+ .voidp_size = htonl_constant(sizeof(void *)),
+};
+
+/* Data sections for version 1 */
+static struct iovec sections_v1[] = {
+ { &header, sizeof(header) },
+};
+
+/* Set of data versions */
+static struct migrate_data data_versions[] = {
+ {
+ 1, sections_v1,
+ },
+ { 0 },
+};
+
+/* Handlers to call in source before sending data */
+struct migrate_handler handlers_source_pre[] = {
+ { 0 },
+};
+
+/* Handlers to call in source after sending data */
+struct migrate_handler handlers_source_post[] = {
+ { 0 },
+};
+
+/* Handlers to call in target before receiving data with version 1 */
+struct migrate_handler handlers_target_pre_v1[] = {
+ { 0 },
+};
+
+/* Handlers to call in target after receiving data with version 1 */
+struct migrate_handler handlers_target_post_v1[] = {
+ { 0 },
+};
+
+/* Versioned sets of migration handlers */
+struct migrate_target_handlers target_handlers[] = {
+ {
+ 1,
+ handlers_target_pre_v1,
+ handlers_target_post_v1,
+ },
+ { 0 },
+};
+
+/**
+ * migrate_source_pre() - Pre-migration tasks as source
+ * @m: Migration metadata
+ *
+ * Return: 0 on success, error code on failure
+ */
+int migrate_source_pre(struct migrate_meta *m)
+{
+ struct migrate_handler *h;
+
+ for (h = handlers_source_pre; h->fn; h++) {
+ int rc;
+
+ if ((rc = h->fn(m, h->data)))
+ return rc;
+ }
+
+ return 0;
+}
+
+/**
+ * migrate_source() - Perform migration as source: send state to hypervisor
+ * @fd: Descriptor for state transfer
+ * @m: Migration metadata
+ *
+ * Return: 0 on success, error code on failure
+ */
+int migrate_source(int fd, const struct migrate_meta *m)
+{
+ static struct migrate_data *d;
+ unsigned count;
+ int rc;
+
+ for (d = data_versions; d->v != MIGRATE_VERSION; d++);
Should ASSERT() if we don't find the version within the array.
This looks a bit unnecessary, MIGRATE_VERSION is defined just above...
it's just a readability killer to me.
+ for
(count = 0; d->sections[count].iov_len; count++);
+
+ debug("Writing %u migration sections", count - 1 /* minus header */);
+ rc = write_remainder(fd, d->sections, count, 0);
+ if (rc < 0)
+ return errno;
+
+ return 0;
+}
+
+/**
+ * migrate_source_post() - Post-migration tasks as source
+ * @m: Migration metadata
+ *
+ * Return: 0 on success, error code on failure
+ */
+void migrate_source_post(struct migrate_meta *m)
+{
+ struct migrate_handler *h;
+
+ for (h = handlers_source_post; h->fn; h++)
+ h->fn(m, h->data);
Is there actually anything we might need to do on the source after a
successful migration, other than exit?
We might want to log a couple of things, which would warrant these
handlers.
But let's say we need to do something *similar* to "updating the
network" such as the RARP announcement that QEMU is requesting (this is
IIUC, that's on the target end, not the source end...
intended for OVN-Kubernetes, so go figure), or that we
need a
workaround for a kernel issue with implicit close() with TCP_REPAIR
on... I would leave this in for completeness.
...but sure, point taken.
+}
+
+/**
+ * migrate_target_read_header() - Set metadata in target from source header
+ * @fd: Descriptor for state transfer
+ * @m: Migration metadata, filled on return
+ *
+ * Return: 0 on success, error code on failure
We nearly always use negative error codes. Why not here?
Because the reply to VHOST_USER_SET_DEVICE_STATE_FD is unsigned:
https://qemu-project.gitlab.io/qemu/interop/vhost-user.html#front-end-messa…
and I want to keep this consistent/untranslated.
Ok.
+ */
+int migrate_target_read_header(int fd, struct migrate_meta *m)
+{
+ static struct migrate_data *d;
+ union migrate_header h;
+
+ if (read_all_buf(fd, &h, sizeof(h)))
+ return errno;
+
+ debug("Source magic: 0x%016" PRIx64 ", sizeof(void *): %u, version:
%u",
+ h.magic, ntohl(h.voidp_size), ntohl(h.version));
+
+ for (d = data_versions; d->v != ntohl(h.version); d++);
+ if (!d->v)
+ return ENOTSUP;
This is too late. The loop doesn't check it, so you've already
overrun the data_versions table if the version wasn't in there.
Ah, yes, I forgot the '&& d->v' part (see migrate_target()).
Easier to use an ARRAY_SIZE() limit in the loop,
I think.
I'd rather keep that as a one-liner, and NULL-terminate the arrays.
+ m->v
= d->v;
+
+ if (h.magic == MIGRATE_MAGIC)
+ m->bswap = false;
+ else if (h.magic == MIGRATE_MAGIC_SWAPPED)
+ m->bswap = true;
+ else
+ return ENOTSUP;
+
+ if (ntohl(h.voidp_size) == 4)
+ m->source_64b = false;
+ else if (ntohl(h.voidp_size) == 8)
+ m->source_64b = true;
+ else
+ return ENOTSUP;
+
+ if (ntohl(h.time_t_size) == 4)
+ m->time_64b = false;
+ else if (ntohl(h.time_t_size) == 8)
+ m->time_64b = true;
+ else
+ return ENOTSUP;
+
+ m->flow_size = ntohl(h.flow_size);
+ m->flow_sidx_size = ntohl(h.flow_sidx_size);
+
+ return 0;
+}
+
+/**
+ * migrate_target_pre() - Pre-migration tasks as target
+ * @m: Migration metadata
+ *
+ * Return: 0 on success, error code on failure
+ */
+int migrate_target_pre(struct migrate_meta *m)
+{
+ struct migrate_target_handlers *th;
+ struct migrate_handler *h;
+
+ for (th = target_handlers; th->v != m->v && th->v; th++);
+
+ for (h = th->pre; h->fn; h++) {
+ int rc;
+
+ if ((rc = h->fn(m, h->data)))
+ return rc;
+ }
+
+ return 0;
+}
+
+/**
+ * migrate_target() - Perform migration as target: receive state from hypervisor
+ * @fd: Descriptor for state transfer
+ * @m: Migration metadata
+ *
+ * Return: 0 on success, error code on failure
+ *
+ * #syscalls:vu readv
+ */
+int migrate_target(int fd, const struct migrate_meta *m)
+{
+ static struct migrate_data *d;
+ unsigned cnt;
+ int rc;
+
+ for (d = data_versions; d->v != m->v && d->v; d++);
+
+ for (cnt = 0; d->sections[cnt + 1 /* skip header */].iov_len; cnt++);
+
+ debug("Reading %u migration sections", cnt);
+ rc = read_remainder(fd, d->sections + 1, cnt, 0);
+ if (rc < 0)
+ return errno;
+
+ return 0;
+}
+
+/**
+ * migrate_target_post() - Post-migration tasks as target
+ * @m: Migration metadata
+ */
+void migrate_target_post(struct migrate_meta *m)
+{
+ struct migrate_target_handlers *th;
+ struct migrate_handler *h;
+
+ for (th = target_handlers; th->v != m->v && th->v; th++);
+
+ for (h = th->post; h->fn; h++)
+ h->fn(m, h->data);
+}
diff --git a/migrate.h b/migrate.h
new file mode 100644
index 0000000..5582f75
--- /dev/null
+++ b/migrate.h
@@ -0,0 +1,90 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later
+ * Copyright (c) 2025 Red Hat GmbH
+ * Author: Stefano Brivio <sbrivio(a)redhat.com>
+ */
+
+#ifndef MIGRATE_H
+#define MIGRATE_H
+
+/**
+ * struct migrate_meta - Migration metadata
+ * @v: Chosen migration data version, host order
+ * @bswap: Source has opposite endianness
+ * @peer_64b: Source uses 64-bit void *
+ * @time_64b: Source uses 64-bit time_t
+ * @flow_size: Size of union flow in source
+ * @flow_sidx_size: Size of struct flow_sidx in source
+ */
+struct migrate_meta {
+ uint32_t v;
+ bool bswap;
+ bool source_64b;
+ bool time_64b;
+ size_t flow_size;
+ size_t flow_sidx_size;
+};
+
+/**
+ * union migrate_header - Migration header from source
+ * @magic: 0xB1BB1D1B0BB1D1B0, host order
+ * @version: Source sends highest known, target aborts if unsupported
+ * @voidp_size: sizeof(void *), network order
+ * @time_t_size: sizeof(time_t), network order
+ * @flow_size: sizeof(union flow), network order
+ * @flow_sidx_size: sizeof(struct flow_sidx_t), network order
+ * @unused: Go figure
+ */
+union migrate_header {
+ struct {
+ uint64_t magic;
+ uint32_t version;
+ uint32_t voidp_size;
+ uint32_t time_t_size;
+ uint32_t flow_size;
+ uint32_t flow_sidx_size;
+ };
+ uint8_t unused[65536];
So, having looked at this, I no longer think padding the header to 64kiB
is a good idea. The structure means we're basically stuck always
having that chunky header. Instead, I think the header should be
absolutely minimal: basically magic and version only. v1 (and maybe
others) can add a "metadata" or whatever section for additional
information like this they need.
The header is processed by the target in a separate, preliminary step,
though.
That's why I added metadata right in the header: if the target needs to
abort the migration because, say, the size of a flow entry is too big
to handle for a particular version, then we should know that before
migrate_target_pre().
Ah, yes, I missed that, we'd need a more complex design to do
additional transfers and checks before making the target_pre
callbacks.
As long as we check the version first, we can always
shrink the header
later on.
*thinks*.. I guess so, though it's kind of awkward; a future version
would have to read the "header of the header", check the version, then
if it's the old one, read the remainder of the 64kiB block.
I still think we should clearly separate the part that we're
committing to being in every future version (which I think should just
be magic and version), from the stuff that's just v1.
But having 64 KiB reserved looks more robust because
it's a
safe place to add this kind of metadata.
Note that 64 KiB is typically transferred in a single read/write
from/to the vhost-user back-end.
Ok, but it also has to go over the qemu migration channel, which will
often be a physical link, not a super-fast local/virtual one, and may
be bandwidth capped as well. I'm not actually certain if 64kiB is
likely to be a problem there, but it *is* large compared to the state
blobs of most qemu devices (usually only a few hundred bytes).
+};
+
+/**
+ * struct migrate_data - Data sections for given source version
+ * @v: Source version this applies to, host order
+ * @sections: Array of data sections, NULL-terminated
+ */
+struct migrate_data {
+ uint32_t v;
+ struct iovec *sections;
+};
+
+/**
+ * struct migrate_handler - Function to handle a specific data section
+ * @fn: Function pointer taking pointer to data section
+ * @data: Associated data section
+ */
+struct migrate_handler {
+ int (*fn)(struct migrate_meta *m, void *data);
+ void *data;
+};
+
+/**
+ * struct migrate_target_handlers - Versioned sets of migration target handlers
+ * @v: Source version this applies to, host order
+ * @pre: Set of functions to execute in target before data copy
+ * @post: Set of functions to execute in target after data copy
+ */
+struct migrate_target_handlers {
+ uint32_t v;
+ struct migrate_handler *pre;
+ struct migrate_handler *post;
+};
+
+int migrate_source_pre(struct migrate_meta *m);
+int migrate_source(int fd, const struct migrate_meta *m);
+void migrate_source_post(struct migrate_meta *m);
+
+int migrate_target_read_header(int fd, struct migrate_meta *m);
+int migrate_target_pre(struct migrate_meta *m);
+int migrate_target(int fd, const struct migrate_meta *m);
+void migrate_target_post(struct migrate_meta *m);
+
+#endif /* MIGRATE_H */
diff --git a/passt.c b/passt.c
index b1c8ab6..184d4e5 100644
--- a/passt.c
+++ b/passt.c
@@ -358,7 +358,7 @@ loop:
vu_kick_cb(c.vdev, ref, &now);
break;
case EPOLL_TYPE_VHOST_MIGRATION:
- vu_migrate(c.vdev, eventmask);
+ vu_migrate(&c, eventmask);
break;
default:
/* Can't happen */
diff --git a/vu_common.c b/vu_common.c
index f43d8ac..0c67bd0 100644
--- a/vu_common.c
+++ b/vu_common.c
@@ -5,6 +5,7 @@
* common_vu.c - vhost-user common UDP and TCP functions
*/
+#include <errno.h>
#include <unistd.h>
#include <sys/uio.h>
#include <sys/eventfd.h>
@@ -17,6 +18,7 @@
#include "vhost_user.h"
#include "pcap.h"
#include "vu_common.h"
+#include "migrate.h"
#define VU_MAX_TX_BUFFER_NB 2
@@ -305,50 +307,88 @@ err:
}
/**
- * vu_migrate() - Send/receive passt insternal state to/from QEMU
- * @vdev: vhost-user device
+ * vu_migrate_source() - Migration as source, send state to hypervisor
+ * @fd: File descriptor for state transfer
+ *
+ * Return: 0 on success, positive error code on failure
+ */
+static int vu_migrate_source(int fd)
+{
+ struct migrate_meta m;
+ int rc;
+
+ if ((rc = migrate_source_pre(&m))) {
+ err("Source pre-migration failed: %s, abort", strerror_(rc));
+ return rc;
+ }
+
+ debug("Saving backend state");
+
+ rc = migrate_source(fd, &m);
+ if (rc)
+ err("Source migration failed: %s", strerror_(rc));
+ else
+ migrate_source_post(&m);
+
+ return rc;
After a successful source migration shouldn't we exit, or at least
quiesce ourselves so we don't accidentally mess with anything the
target is now doing?
Maybe, yes. Pending TCP connections should be safe because with
TCP_REPAIR they're already quiesced, but we don't close listening
sockets (yet).
Perhaps a reasonable approach for the moment would be to declare a
single migrate_source_post handler logging a info() message and
exiting.
Seems sensible for now.
+}
+
+/**
+ * vu_migrate_target() - Migration as target, receive state from hypervisor
+ * @fd: File descriptor for state transfer
+ *
+ * Return: 0 on success, positive error code on failure
+ */
+static int vu_migrate_target(int fd)
+{
+ struct migrate_meta m;
+ int rc;
+
+ rc = migrate_target_read_header(fd, &m);
+ if (rc) {
+ err("Migration header check failed: %s, abort", strerror_(rc));
+ return rc;
+ }
+
+ if ((rc = migrate_target_pre(&m))) {
+ err("Target pre-migration failed: %s, abort", strerror_(rc));
+ return rc;
+ }
+
+ debug("Loading backend state");
+
+ rc = migrate_target(fd, &m);
+ if (rc)
+ err("Target migration failed: %s", strerror_(rc));
+ else
+ migrate_target_post(&m);
+
+ return rc;
+}
+
+/**
+ * vu_migrate() - Send/receive passt internal state to/from QEMU
+ * @c: Execution context
* @events: epoll events
*/
-void vu_migrate(struct vu_dev *vdev, uint32_t events)
+void vu_migrate(struct ctx *c, uint32_t events)
{
- int ret;
+ struct vu_dev *vdev = c->vdev;
+ int rc = EIO;
- /* TODO: collect/set passt internal state
- * and use vdev->device_state_fd to send/receive it
- */
debug("vu_migrate fd %d events %x", vdev->device_state_fd, events);
- if (events & EPOLLOUT) {
- debug("Saving backend state");
-
- /* send some stuff */
- ret = write(vdev->device_state_fd, "PASST", 6);
- /* value to be returned by VHOST_USER_CHECK_DEVICE_STATE */
- vdev->device_state_result = ret == -1 ? -1 : 0;
- /* Closing the file descriptor signals the end of transfer */
- epoll_ctl(vdev->context->epollfd, EPOLL_CTL_DEL,
- vdev->device_state_fd, NULL);
- close(vdev->device_state_fd);
- vdev->device_state_fd = -1;
- } else if (events & EPOLLIN) {
- char buf[6];
-
- debug("Loading backend state");
- /* read some stuff */
- ret = read(vdev->device_state_fd, buf, sizeof(buf));
- /* value to be returned by VHOST_USER_CHECK_DEVICE_STATE */
- if (ret != sizeof(buf)) {
- vdev->device_state_result = -1;
- } else {
- ret = strncmp(buf, "PASST", sizeof(buf));
- vdev->device_state_result = ret == 0 ? 0 : -1;
- }
- } else if (events & EPOLLHUP) {
- debug("Closing migration channel");
-
- /* The end of file signals the end of the transfer. */
- epoll_ctl(vdev->context->epollfd, EPOLL_CTL_DEL,
- vdev->device_state_fd, NULL);
- close(vdev->device_state_fd);
- vdev->device_state_fd = -1;
- }
+
+ if (events & EPOLLOUT)
+ rc = vu_migrate_source(vdev->device_state_fd);
+ else if (events & EPOLLIN)
+ rc = vu_migrate_target(vdev->device_state_fd);
+
+ /* EPOLLHUP without EPOLLIN/EPOLLOUT, or EPOLLERR? Migration failed */
+
+ vdev->device_state_result = rc;
+
+ epoll_ctl(c->epollfd, EPOLL_CTL_DEL, vdev->device_state_fd, NULL);
+ debug("Closing migration channel");
+ close(vdev->device_state_fd);
+ vdev->device_state_fd = -1;
}
diff --git a/vu_common.h b/vu_common.h
index d56c021..69c4006 100644
--- a/vu_common.h
+++ b/vu_common.h
@@ -57,5 +57,5 @@ void vu_flush(const struct vu_dev *vdev, struct vu_virtq *vq,
void vu_kick_cb(struct vu_dev *vdev, union epoll_ref ref,
const struct timespec *now);
int vu_send_single(const struct ctx *c, const void *buf, size_t size);
-void vu_migrate(struct vu_dev *vdev, uint32_t events);
+void vu_migrate(struct ctx *c, uint32_t events);
#endif /* VU_COMMON_H */
--
David Gibson (he or they) | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you, not the other way
| around.
http://www.ozlabs.org/~dgibson