On Wed, Feb 05, 2025 at 01:38:59AM +0100, Stefano Brivio wrote: Add migration facilities based on top of the current vhost-user infrastructure, moving vu_migrate() to migrate.c. Versioned migration stages define function pointers to be called on source or target, or data sections that need to be transferred. The migration header consists of a magic number and a version identifier. Co-authored-by: David Gibson <david(a)gibson.dropbear.id.au> Given this, it should also have my S-o-b, Signed-off-by: David Gibson <david(a)gibson.dropbear.id.au> And, given that we already have an awkward co-authorship situation, it probably makes sense to fold patches 2 & 3 into this one. Signed-off-by: Stefano Brivio <sbrivio(a)redhat.com> --- Makefile | 12 +-- migrate.c | 210 ++++++++++++++++++++++++++++++++++++++++++++++++++++ migrate.h | 51 +++++++++++++ passt.c | 2 +- util.h | 26 +++++++ vu_common.c | 58 +++++---------- vu_common.h | 2 +- 7 files changed, 315 insertions(+), 46 deletions(-) create mode 100644 migrate.c create mode 100644 migrate.h diff --git a/Makefile b/Makefile index d3d4b78..be89b07 100644 --- a/Makefile +++ b/Makefile @@ -38,8 +38,8 @@ FLAGS += -DDUAL_STACK_SOCKETS=$(DUAL_STACK_SOCKETS) PASST_SRCS = arch.c arp.c checksum.c conf.c dhcp.c dhcpv6.c flow.c fwd.c \ icmp.c igmp.c inany.c iov.c ip.c isolation.c lineread.c log.c mld.c \ - ndp.c netlink.c packet.c passt.c pasta.c pcap.c pif.c tap.c tcp.c \ - tcp_buf.c tcp_splice.c tcp_vu.c udp.c udp_flow.c udp_vu.c util.c \ + ndp.c netlink.c migrate.c packet.c passt.c pasta.c pcap.c pif.c tap.c \ + tcp.c tcp_buf.c tcp_splice.c tcp_vu.c udp.c udp_flow.c udp_vu.c util.c \ vhost_user.c virtio.c vu_common.c QRAP_SRCS = qrap.c PASST_REPAIR_SRCS = passt-repair.c @@ -49,10 +49,10 @@ MANPAGES = passt.1 pasta.1 qrap.1 passt-repair.1 PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h flow.h fwd.h \ flow_table.h icmp.h icmp_flow.h inany.h iov.h ip.h isolation.h \ - lineread.h log.h ndp.h netlink.h packet.h passt.h pasta.h 
pcap.h pif.h \ - siphash.h tap.h tcp.h tcp_buf.h tcp_conn.h tcp_internal.h tcp_splice.h \ - tcp_vu.h udp.h udp_flow.h udp_internal.h udp_vu.h util.h vhost_user.h \ - virtio.h vu_common.h + lineread.h log.h migrate.h ndp.h netlink.h packet.h passt.h pasta.h \ + pcap.h pif.h siphash.h tap.h tcp.h tcp_buf.h tcp_conn.h tcp_internal.h \ + tcp_splice.h tcp_vu.h udp.h udp_flow.h udp_internal.h udp_vu.h util.h \ + vhost_user.h virtio.h vu_common.h HEADERS = $(PASST_HEADERS) seccomp.h C := \#include <sys/random.h>\nint main(){int a=getrandom(0, 0, 0);} diff --git a/migrate.c b/migrate.c new file mode 100644 index 0000000..a7031f9 --- /dev/null +++ b/migrate.c @@ -0,0 +1,210 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +/* PASST - Plug A Simple Socket Transport + * for qemu/UNIX domain socket mode + * + * PASTA - Pack A Subtle Tap Abstraction + * for network namespace/tap device mode + * + * migrate.c - Migration sections, layout, and routines + * + * Copyright (c) 2025 Red Hat GmbH + * Author: Stefano Brivio <sbrivio(a)redhat.com> + */ + +#include <errno.h> +#include <sys/uio.h> + +#include "util.h" +#include "ip.h" +#include "passt.h" +#include "inany.h" +#include "flow.h" +#include "flow_table.h" + +#include "migrate.h" + +/* Current version of migration data */ +#define MIGRATE_VERSION 1 + +/* Magic identifier for migration data */ +#define MIGRATE_MAGIC 0xB1BB1D1B0BB1D1B0 + +/* Migration header to send from source */ +static struct migrate_header header = { + .magic = htonll_constant(MIGRATE_MAGIC), + .version = htonl_constant(MIGRATE_VERSION), +}; + +/** + * migrate_send_block() - Migration stage handler to send verbatim data + * @c: Execution context + * @stage: Migration stage + * @fd: Migration fd + * + * Sends the buffer in @stage->iov over the migration channel. 
+ */ +__attribute__((__unused__)) +static int migrate_send_block(struct ctx *c, + const struct migrate_stage *stage, int fd) +{ + (void)c; + + if (write_remainder(fd, &stage->iov, 1, 0) < 0) + return errno; + + return 0; +} + +/** + * migrate_recv_block() - Migration stage handler to receive verbatim data + * @c: Execution context + * @stage: Migration stage + * @fd: Migration fd + * + * Reads the buffer in @stage->iov from the migration channel. + * + * #syscalls:vu readv + */ +__attribute__((__unused__)) +static int migrate_recv_block(struct ctx *c, + const struct migrate_stage *stage, int fd) +{ + (void)c; + + if (read_remainder(fd, &stage->iov, 1, 0) < 0) + return errno; + + return 0; +} + +#define DATA_STAGE(v) \ + { \ + .name = #v, \ + .source = migrate_send_block, \ + .target = migrate_recv_block, \ + .iov = { &(v), sizeof(v) }, \ + } + +/* Stages for version 1 */ +static const struct migrate_stage stages_v1[] = { + { + .name = "flow pre", + .target = NULL, + }, + { + .name = "flow post", + .source = NULL, + }, + { 0 }, +}; + +/* Set of data versions */ +static const struct migrate_version versions[] = { + { + 1, stages_v1, + }, + { 0 }, +}; + +/** + * migrate_source() - Migration as source, send state to hypervisor + * @c: Execution context + * @fd: File descriptor for state transfer + * + * Return: 0 on success, positive error code on failure + */ +int migrate_source(struct ctx *c, int fd) +{ + const struct migrate_version *v = versions + ARRAY_SIZE(versions) - 1; + const struct migrate_stage *s; + int ret; + + ret = write_all_buf(fd, &header, sizeof(header)); + if (ret) { + err("Can't send migration header: %s, abort", strerror_(ret)); + return ret; + } + + for (s = v->s; *s->name; s++) { + if (!s->source) + continue; + + debug("Source side migration: %s", s->name); + + if ((ret = s->source(c, s, fd))) { + err("Source migration stage %s: %s, abort", s->name, + strerror_(ret)); + return ret; + } + } + + return 0; +} + +/** + * migrate_target_read_header() 
- Read header in target + * @fd: Descriptor for state transfer + * + * Return: version number on success, 0 on failure with errno set + */ +static uint32_t migrate_target_read_header(int fd) +{ + struct migrate_header h; + + if (read_all_buf(fd, &h, sizeof(h))) + return 0; + + debug("Source magic: 0x%016" PRIx64 ", version: %u", + be64toh(h.magic), ntohl_constant(h.version)); + + if (ntohll_constant(h.magic) != MIGRATE_MAGIC || !ntohl(h.version)) { + errno = EINVAL; + return 0; + } + + return ntohl(h.version); +} + +/** + * migrate_target() - Migration as target, receive state from hypervisor + * @c: Execution context + * @fd: File descriptor for state transfer + * + * Return: 0 on success, positive error code on failure + */ +int migrate_target(struct ctx *c, int fd) +{ + const struct migrate_version *v; + const struct migrate_stage *s; + uint32_t id; + int ret; + + id = migrate_target_read_header(fd); + if (!id) { + ret = errno; + err("Migration header check failed: %s, abort", strerror_(ret)); + return ret; + } + + for (v = versions; v->id && v->id == id; v++); + if (!v->id) { + err("Unsupported version: %u", id); + return -ENOTSUP; + } + + for (s = v->s; *s->name; s++) { + if (!s->target) + continue; + + debug("Target side migration: %s", s->name); + + if ((ret = s->target(c, s, fd))) { + err("Target migration stage %s: %s, abort", s->name, + strerror_(ret)); + return ret; + } + } + + return 0; +} diff --git a/migrate.h b/migrate.h new file mode 100644 index 0000000..3093b6e --- /dev/null +++ b/migrate.h @@ -0,0 +1,51 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later + * Copyright (c) 2025 Red Hat GmbH + * Author: Stefano Brivio <sbrivio(a)redhat.com> + */ + +#ifndef MIGRATE_H +#define MIGRATE_H + +/** + * struct migrate_header - Migration header from source + * @magic: 0xB1BB1D1B0BB1D1B0, network order + * @version: Highest known, target aborts if too old, network order + */ +struct migrate_header { + uint64_t magic; + uint32_t version; +} 
__attribute__((packed)); + +/** + * struct migrate_stage - Callbacks and parameters for one stage of migration + * @name: Stage name (for debugging) + * @source: Callback to implement this stage on the source + * @target: Callback to implement this stage on the target + * @iov: Optional data section to transfer + */ +struct migrate_stage { + const char *name; + int (*source)(struct ctx *c, + const struct migrate_stage *stage, int fd); + int (*target)(struct ctx *c, + const struct migrate_stage *stage, int fd); + + /* FIXME: rollback callbacks? */ + + struct iovec iov; +}; + +/** + * struct migrate_version - Stages for a particular protocol version + * @id: Version number, host order + * @s: Ordered array of stages, NULL-terminated + */ +struct migrate_version { + uint32_t id; + const struct migrate_stage *s; +}; + +int migrate_source(struct ctx *c, int fd); +int migrate_target(struct ctx *c, int fd); + +#endif /* MIGRATE_H */ diff --git a/passt.c b/passt.c index b1c8ab6..184d4e5 100644 --- a/passt.c +++ b/passt.c @@ -358,7 +358,7 @@ loop: vu_kick_cb(c.vdev, ref, &now); break; case EPOLL_TYPE_VHOST_MIGRATION: - vu_migrate(c.vdev, eventmask); + vu_migrate(&c, eventmask); break; default: /* Can't happen */ diff --git a/util.h b/util.h index 23b165c..1aed629 100644 --- a/util.h +++ b/util.h @@ -122,12 +122,38 @@ (((x) & 0x0000ff00) << 8) | (((x) & 0x000000ff) << 24)) #endif +#ifndef __bswap_constant_32 +#define __bswap_constant_32(x) \ + ((((x) & 0xff000000) >> 24) | (((x) & 0x00ff0000) >> 8) | \ + (((x) & 0x0000ff00) << 8) | (((x) & 0x000000ff) << 24)) +#endif + +#ifndef __bswap_constant_64 +#define __bswap_constant_64(x) \ + ((((x) & 0xff00000000000000ULL) >> 56) | \ + (((x) & 0x00ff000000000000ULL) >> 40) | \ + (((x) & 0x0000ff0000000000ULL) >> 24) | \ + (((x) & 0x000000ff00000000ULL) >> 8) | \ + (((x) & 0x00000000ff000000ULL) << 8) | \ + (((x) & 0x0000000000ff0000ULL) << 24) | \ + (((x) & 0x000000000000ff00ULL) << 40) | \ + (((x) & 0x00000000000000ffULL) << 56)) 
+#endif + #if __BYTE_ORDER == __BIG_ENDIAN #define htons_constant(x) (x) #define htonl_constant(x) (x) +#define htonll_constant(x) (x) +#define ntohs_constant(x) (x) +#define ntohl_constant(x) (x) +#define ntohll_constant(x) (x) #else #define htons_constant(x) (__bswap_constant_16(x)) #define htonl_constant(x) (__bswap_constant_32(x)) +#define htonll_constant(x) (__bswap_constant_64(x)) +#define ntohs_constant(x) (__bswap_constant_16(x)) +#define ntohl_constant(x) (__bswap_constant_32(x)) +#define ntohll_constant(x) (__bswap_constant_64(x)) #endif /** diff --git a/vu_common.c b/vu_common.c index ab04d31..3d41824 100644 --- a/vu_common.c +++ b/vu_common.c @@ -5,6 +5,7 @@ * common_vu.c - vhost-user common UDP and TCP functions */ +#include <errno.h> #include <unistd.h> #include <sys/uio.h> #include <sys/eventfd.h> @@ -17,6 +18,7 @@ #include "vhost_user.h" #include "pcap.h" #include "vu_common.h" +#include "migrate.h" #define VU_MAX_TX_BUFFER_NB 2 @@ -305,48 +307,28 @@ err: } /** - * vu_migrate() - Send/receive passt insternal state to/from QEMU - * @vdev: vhost-user device + * vu_migrate() - Send/receive passt internal state to/from QEMU + * @c: Execution context * @events: epoll events */ -void vu_migrate(struct vu_dev *vdev, uint32_t events) +void vu_migrate(struct ctx *c, uint32_t events) { - int ret; + struct vu_dev *vdev = c->vdev; + int rc = EIO; - /* TODO: collect/set passt internal state - * and use vdev->device_state_fd to send/receive it - */ debug("vu_migrate fd %d events %x", vdev->device_state_fd, events); - if (events & EPOLLOUT) { - debug("Saving backend state"); - - /* send some stuff */ - ret = write(vdev->device_state_fd, "PASST", 6); - /* value to be returned by VHOST_USER_CHECK_DEVICE_STATE */ - vdev->device_state_result = ret == -1 ? 
-1 : 0; - /* Closing the file descriptor signals the end of transfer */ - epoll_del(vdev->context, vdev->device_state_fd); - close(vdev->device_state_fd); - vdev->device_state_fd = -1; - } else if (events & EPOLLIN) { - char buf[6]; - - debug("Loading backend state"); - /* read some stuff */ - ret = read(vdev->device_state_fd, buf, sizeof(buf)); - /* value to be returned by VHOST_USER_CHECK_DEVICE_STATE */ - if (ret != sizeof(buf)) { - vdev->device_state_result = -1; - } else { - ret = strncmp(buf, "PASST", sizeof(buf)); - vdev->device_state_result = ret == 0 ? 0 : -1; - } - } else if (events & EPOLLHUP) { - debug("Closing migration channel"); - /* The end of file signals the end of the transfer. */ - epoll_del(vdev->context, vdev->device_state_fd); - close(vdev->device_state_fd); - vdev->device_state_fd = -1; - } + if (events & EPOLLOUT) + rc = migrate_source(c, vdev->device_state_fd); + else if (events & EPOLLIN) + rc = migrate_target(c, vdev->device_state_fd); + + /* EPOLLHUP without EPOLLIN/EPOLLOUT, or EPOLLERR? Migration failed */ + + vdev->device_state_result = rc; + + epoll_ctl(c->epollfd, EPOLL_CTL_DEL, vdev->device_state_fd, NULL); + debug("Closing migration channel"); + close(vdev->device_state_fd); + vdev->device_state_fd = -1; } diff --git a/vu_common.h b/vu_common.h index d56c021..69c4006 100644 --- a/vu_common.h +++ b/vu_common.h @@ -57,5 +57,5 @@ void vu_flush(const struct vu_dev *vdev, struct vu_virtq *vq, void vu_kick_cb(struct vu_dev *vdev, union epoll_ref ref, const struct timespec *now); int vu_send_single(const struct ctx *c, const void *buf, size_t size); -void vu_migrate(struct vu_dev *vdev, uint32_t events); +void vu_migrate(struct ctx *c, uint32_t events); #endif /* VU_COMMON_H */ -- David Gibson (he or they) | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you, not the other way | around. http://www.ozlabs.org/~dgibson