This series allows a QEMU guest to be migrated while it is connected to
Passt using the vhost-user interface. There are two parts:

- The first part enables migration of QEMU without transferring the
  internal state of Passt; all connections are lost. This is done by
  implementing the VHOST_USER_SET_LOG_FD and VHOST_USER_SET_LOG_BASE
  commands (and enabling the VHOST_USER_PROTOCOL_F_LOG_SHMFD feature):

    "vhost-user: add VHOST_USER_SET_LOG_FD command"
    "vhost-user: add VHOST_USER_SET_LOG_BASE command"
    "vhost-user: Report to front-end we support VHOST_USER_PROTOCOL_F_LOG_SHMFD"

- The second part allows the source Passt instance to send its internal
  state to the destination Passt instance using the QEMU migration
  channel. This is done by implementing VHOST_USER_SET_DEVICE_STATE_FD
  and VHOST_USER_CHECK_DEVICE_STATE (and enabling the
  VHOST_USER_PROTOCOL_F_DEVICE_STATE feature):

    "vhost-user: add VHOST_USER_CHECK_DEVICE_STATE command"
    "vhost-user: add VHOST_USER_SET_DEVICE_STATE_FD command"
    "vhost-user: Report to front-end we support VHOST_USER_PROTOCOL_F_DEVICE_STATE"

For now, only the functions needed to transfer the state are
implemented; no actual state is transferred yet.

VHOST_USER_PROTOCOL_F_DEVICE_STATE is implemented in QEMU only for
vhost-user-fs; to be able to use it with virtio-net I have proposed a
patch upstream:

https://patchew.org/QEMU/20241218143453.1573185-1-lvivier@redhat.com/

Laurent Vivier (9):
  virtio: Use const pointer for vu_dev
  vhost-user: update protocol features and commands list
  vhost-user: add VHOST_USER_SET_LOG_FD command
  vhost-user: Pass vu_dev to more virtio functions
  vhost-user: add VHOST_USER_SET_LOG_BASE command
  vhost-user: Report to front-end we support VHOST_USER_PROTOCOL_F_LOG_SHMFD
  vhost-user: add VHOST_USER_CHECK_DEVICE_STATE command
  vhost-user: add VHOST_USER_SET_DEVICE_STATE_FD command
  vhost-user: Report to front-end we support VHOST_USER_PROTOCOL_F_DEVICE_STATE

 epoll_type.h |   2 +
 passt.c      |   4 +
 util.h       |   3 +
 vhost_user.c | 251 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 vhost_user.h |  50 +++++++++-
 virtio.c     | 116 +++++++++++++++++++++---
 virtio.h     |  32 +++++--
 vu_common.c  |  59 +++++++++++-
 vu_common.h  |   3 +-
 9 files changed, 484 insertions(+), 36 deletions(-)

-- 
2.47.1
We don't modify the vu_dev structure in some virtio functions, so pass
it as a const pointer.

Signed-off-by: Laurent Vivier <lvivier(a)redhat.com>
Reviewed-by: David Gibson <david(a)gibson.dropbear.id.au>
---
 virtio.c    | 14 +++++++++-----
 virtio.h    |  2 +-
 vu_common.c |  2 +-
 vu_common.h |  2 +-
 4 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/virtio.c b/virtio.c
index a76de5e00222..625bac385f0c 100644
--- a/virtio.c
+++ b/virtio.c
@@ -92,7 +92,8 @@
  *
  * Return: virtual address in our address space of the guest physical address
  */
-static void *vu_gpa_to_va(struct vu_dev *dev, uint64_t *plen, uint64_t guest_addr)
+static void *vu_gpa_to_va(const struct vu_dev *dev, uint64_t *plen,
+			  uint64_t guest_addr)
 {
 	unsigned int i;
 
@@ -210,7 +211,8 @@ static void virtqueue_get_head(const struct vu_virtq *vq,
  *
  * Return: -1 if there is an error, 0 otherwise
  */
-static int virtqueue_read_indirect_desc(struct vu_dev *dev, struct vring_desc *desc,
+static int virtqueue_read_indirect_desc(const struct vu_dev *dev,
+					struct vring_desc *desc,
 					uint64_t addr, size_t len)
 {
 	uint64_t read_len;
@@ -390,7 +392,7 @@ static inline void vring_set_avail_event(const struct vu_virtq *vq,
  *
  * Return: false on error, true otherwise
  */
-static bool virtqueue_map_desc(struct vu_dev *dev,
+static bool virtqueue_map_desc(const struct vu_dev *dev,
 			       unsigned int *p_num_sg, struct iovec *iov,
 			       unsigned int max_num_sg,
 			       uint64_t pa, size_t sz)
@@ -426,7 +428,8 @@ static bool virtqueue_map_desc(struct vu_dev *dev,
  *
  * Return: -1 if there is an error, 0 otherwise
  */
-static int vu_queue_map_desc(struct vu_dev *dev, struct vu_virtq *vq, unsigned int idx,
+static int vu_queue_map_desc(const struct vu_dev *dev,
+			     struct vu_virtq *vq, unsigned int idx,
 			     struct vu_virtq_element *elem)
 {
 	const struct vring_desc *desc = vq->vring.desc;
@@ -504,7 +507,8 @@ static int vu_queue_map_desc(struct vu_dev *dev, struct vu_virtq *vq, unsigned i
  *
  * Return: -1 if there is an error, 0 otherwise
  */
-int vu_queue_pop(struct vu_dev *dev, struct vu_virtq *vq, struct vu_virtq_element *elem)
+int vu_queue_pop(const struct vu_dev *dev, struct vu_virtq *vq,
+		 struct vu_virtq_element *elem)
 {
 	unsigned int head;
 	int ret;

diff --git a/virtio.h b/virtio.h
index 6410d60f9b3f..0af259df7dac 100644
--- a/virtio.h
+++ b/virtio.h
@@ -170,7 +170,7 @@ static inline bool vu_has_protocol_feature(const struct vu_dev *vdev,
 
 bool vu_queue_empty(struct vu_virtq *vq);
 void vu_queue_notify(const struct vu_dev *dev, struct vu_virtq *vq);
-int vu_queue_pop(struct vu_dev *dev, struct vu_virtq *vq,
+int vu_queue_pop(const struct vu_dev *dev, struct vu_virtq *vq,
 		 struct vu_virtq_element *elem);
 void vu_queue_detach_element(struct vu_virtq *vq);
 void vu_queue_unpop(struct vu_virtq *vq);

diff --git a/vu_common.c b/vu_common.c
index 299b5a32e244..6d365bea5fe2 100644
--- a/vu_common.c
+++ b/vu_common.c
@@ -73,7 +73,7 @@ void vu_init_elem(struct vu_virtq_element *elem, struct iovec *iov, int elem_cnt
  *
  * Return: number of elements used to contain the frame
  */
-int vu_collect(struct vu_dev *vdev, struct vu_virtq *vq,
+int vu_collect(const struct vu_dev *vdev, struct vu_virtq *vq,
 	       struct vu_virtq_element *elem, int max_elem,
 	       size_t size, size_t *frame_size)
 {

diff --git a/vu_common.h b/vu_common.h
index 901d97216c67..bd70faf3e226 100644
--- a/vu_common.h
+++ b/vu_common.h
@@ -46,7 +46,7 @@ static inline void vu_set_element(struct vu_virtq_element *elem,
 void vu_init_elem(struct vu_virtq_element *elem, struct iovec *iov,
 		  int elem_cnt);
-int vu_collect(struct vu_dev *vdev, struct vu_virtq *vq,
+int vu_collect(const struct vu_dev *vdev, struct vu_virtq *vq,
 	       struct vu_virtq_element *elem, int max_elem,
 	       size_t size, size_t *frame_size);
 void vu_set_vnethdr(const struct vu_dev *vdev,

-- 
2.47.1
On Thu, Dec 19, 2024 at 12:13:52PM +0100, Laurent Vivier wrote:
> We don't modify the vu_dev structure in some virtio functions, so pass
> it as a const pointer.
>
> Signed-off-by: Laurent Vivier <lvivier(a)redhat.com>
> Reviewed-by: David Gibson <david(a)gibson.dropbear.id.au>

Fwiw, I think this could be merged immediately regardless of the rest
of the series.

[...]

-- 
David Gibson (he or they)	| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you, not the other way
				| around.
http://www.ozlabs.org/~dgibson
The vhost-user protocol specification has been updated with the feature
flags and commands we will need to implement migration.

Signed-off-by: Laurent Vivier <lvivier(a)redhat.com>
---
 vhost_user.c |  5 +++++
 vhost_user.h | 46 ++++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 47 insertions(+), 4 deletions(-)

diff --git a/vhost_user.c b/vhost_user.c
index 4b8558fa851f..48226a8b7686 100644
--- a/vhost_user.c
+++ b/vhost_user.c
@@ -110,6 +110,11 @@ static const char *vu_request_to_string(unsigned int req)
 		REQ(VHOST_USER_GET_MAX_MEM_SLOTS),
 		REQ(VHOST_USER_ADD_MEM_REG),
 		REQ(VHOST_USER_REM_MEM_REG),
+		REQ(VHOST_USER_SET_STATUS),
+		REQ(VHOST_USER_GET_STATUS),
+		REQ(VHOST_USER_GET_SHARED_OBJECT),
+		REQ(VHOST_USER_SET_DEVICE_STATE_FD),
+		REQ(VHOST_USER_CHECK_DEVICE_STATE),
 	};
 #undef REQ
 	return vu_request_str[req];

diff --git a/vhost_user.h b/vhost_user.h
index 464ba21e962f..fbacb5560755 100644
--- a/vhost_user.h
+++ b/vhost_user.h
@@ -37,6 +37,10 @@ enum vhost_user_protocol_feature {
 	VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD = 12,
 	VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS = 14,
 	VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS = 15,
+	VHOST_USER_PROTOCOL_F_STATUS = 16,
+	/* Feature 17 reserved for VHOST_USER_PROTOCOL_F_XEN_MMAP. */
+	VHOST_USER_PROTOCOL_F_SHARED_OBJECT = 18,
+	VHOST_USER_PROTOCOL_F_DEVICE_STATE = 19,
 	VHOST_USER_PROTOCOL_F_MAX
 };
 
@@ -83,6 +87,11 @@ enum vhost_user_request {
 	VHOST_USER_GET_MAX_MEM_SLOTS = 36,
 	VHOST_USER_ADD_MEM_REG = 37,
 	VHOST_USER_REM_MEM_REG = 38,
+	VHOST_USER_SET_STATUS = 39,
+	VHOST_USER_GET_STATUS = 40,
+	VHOST_USER_GET_SHARED_OBJECT = 41,
+	VHOST_USER_SET_DEVICE_STATE_FD = 42,
+	VHOST_USER_CHECK_DEVICE_STATE = 43,
 	VHOST_USER_MAX
 };
 
@@ -128,12 +137,39 @@ struct vhost_user_memory {
 	struct vhost_user_memory_region regions[VHOST_MEMORY_BASELINE_NREGIONS];
 };
 
+/**
+ * struct vhost_user_log - Address and size of the shared memory region used
+ *                         to log page update
+ * @mmap_size:		Size of the shared memory region
+ * @mmap_offset:	Offset of the shared memory region
+ */
+struct vhost_user_log {
+	uint64_t mmap_size;
+	uint64_t mmap_offset;
+};
+
+/**
+ * struct vhost_user_transfer_device_state - Set the direction and phase
+ *                                           of the backend device state fd
+ * @direction:	Device state transfer direction (save or load)
+ * @phase:	Migration phase (only stopped is supported)
+ */
+struct vhost_user_transfer_device_state {
+	uint32_t direction;
+#define VHOST_USER_TRANSFER_STATE_DIRECTION_SAVE	0
+#define VHOST_USER_TRANSFER_STATE_DIRECTION_LOAD	1
+	uint32_t phase;
+#define VHOST_USER_TRANSFER_STATE_PHASE_STOPPED		0
+};
+
 /**
  * union vhost_user_payload - vhost-user message payload
- * @u64:		64-bit payload
- * @state:		vring state payload
- * @addr:		vring addresses payload
- * vhost_user_memory:	Memory regions information payload
+ * @u64:				64-bit payload
+ * @state:				vring state payload
+ * @addr:				vring addresses payload
+ * @vhost_user_memory:			Memory regions information payload
+ * @vhost_user_log:			Memory logging payload
+ * @vhost_user_transfer_device_state:	Device state payload
  */
 union vhost_user_payload {
 #define VHOST_USER_VRING_IDX_MASK   0xff
@@ -142,6 +178,8 @@ union vhost_user_payload {
 	struct vhost_vring_state state;
 	struct vhost_vring_addr addr;
 	struct vhost_user_memory memory;
+	struct vhost_user_log log;
+	struct vhost_user_transfer_device_state transfer_state;
 };
 
-- 
2.47.1
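[Editor's note: as a quick illustration of the new payload (not part of
the series), this is how a front-end might populate transfer_state when
requesting a state save; the helper name is hypothetical and only the
constants and struct come from the patch above.]

    #include <stdint.h>
    #include "vhost_user.h"	/* definitions added by this patch */

    /* Hypothetical helper: fill the SET_DEVICE_STATE_FD payload for a
     * state save in the stopped phase, using the constants above.
     */
    static void prepare_save_payload(union vhost_user_payload *payload)
    {
    	payload->transfer_state.direction =
    		VHOST_USER_TRANSFER_STATE_DIRECTION_SAVE;
    	payload->transfer_state.phase =
    		VHOST_USER_TRANSFER_STATE_PHASE_STOPPED;
    }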
VHOST_USER_SET_LOG_FD is an optional message with an eventfd in
ancillary data. It may be used by the back-end to inform the front-end
that the log has been modified.

Signed-off-by: Laurent Vivier <lvivier(a)redhat.com>
---
 vhost_user.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 vhost_user.h |  1 +
 virtio.h     |  2 ++
 3 files changed, 59 insertions(+)

diff --git a/vhost_user.c b/vhost_user.c
index 48226a8b7686..ce4373d9eeca 100644
--- a/vhost_user.c
+++ b/vhost_user.c
@@ -504,6 +504,57 @@ static bool vu_set_mem_table_exec(struct vu_dev *vdev,
 	return false;
 }
 
+/**
+ * vu_close_log() - Close the logging file descriptor
+ * @vdev:	vhost-user device
+ */
+static void vu_close_log(struct vu_dev *vdev)
+{
+	if (vdev->log_call_fd != -1) {
+		close(vdev->log_call_fd);
+		vdev->log_call_fd = -1;
+	}
+}
+
+/**
+ * vu_log_kick() - Inform the front-end that the log has been modified
+ * @vdev:	vhost-user device
+ */
+/* cppcheck-suppress unusedFunction */
+void vu_log_kick(const struct vu_dev *vdev)
+{
+	if (vdev->log_call_fd != -1) {
+		int rc;
+
+		rc = eventfd_write(vdev->log_call_fd, 1);
+		if (rc == -1)
+			die_perror("vhost-user kick eventfd_write()");
+	}
+}
+
+/**
+ * vu_set_log_fd_exec() -- Set the eventfd used to report logging update
+ * @vdev:	vhost-user device
+ * @vmsg:	vhost-user message
+ *
+ * Return: False as no reply is requested
+ */
+static bool vu_set_log_fd_exec(struct vu_dev *vdev,
+			       struct vhost_user_msg *msg)
+{
+	if (msg->fd_num != 1)
+		die("Invalid log_fd message");
+
+	if (vdev->log_call_fd != -1)
+		close(vdev->log_call_fd);
+
+	vdev->log_call_fd = msg->fds[0];
+
+	debug("Got log_call_fd: %d", vdev->log_call_fd);
+
+	return false;
+}
+
 /**
  * vu_set_vring_num_exec() - Set the size of the queue (vring size)
  * @vdev:	vhost-user device
@@ -864,8 +915,10 @@ void vu_init(struct ctx *c)
 			.notification = true,
 		};
 	}
+	c->vdev->log_call_fd = -1;
 }
 
+
 /**
  * vu_cleanup() - Reset vhost-user device
  * @vdev:	vhost-user device
@@ -909,6 +962,8 @@ void vu_cleanup(struct vu_dev *vdev)
 		}
 	}
 	vdev->nregions = 0;
+
+	vu_close_log(vdev);
 }
 
 /**
@@ -929,6 +984,7 @@ static bool (*vu_handle[VHOST_USER_MAX])(struct vu_dev *vdev,
 	[VHOST_USER_GET_QUEUE_NUM]	= vu_get_queue_num_exec,
 	[VHOST_USER_SET_OWNER]		= vu_set_owner_exec,
 	[VHOST_USER_SET_MEM_TABLE]	= vu_set_mem_table_exec,
+	[VHOST_USER_SET_LOG_FD]		= vu_set_log_fd_exec,
 	[VHOST_USER_SET_VRING_NUM]	= vu_set_vring_num_exec,
 	[VHOST_USER_SET_VRING_ADDR]	= vu_set_vring_addr_exec,
 	[VHOST_USER_SET_VRING_BASE]	= vu_set_vring_base_exec,

diff --git a/vhost_user.h b/vhost_user.h
index fbacb5560755..2fc0342ff5ba 100644
--- a/vhost_user.h
+++ b/vhost_user.h
@@ -240,5 +240,6 @@ static inline bool vu_queue_started(const struct vu_virtq *vq)
 void vu_print_capabilities(void);
 void vu_init(struct ctx *c);
 void vu_cleanup(struct vu_dev *vdev);
+void vu_log_kick(const struct vu_dev *vdev);
 void vu_control_handler(struct vu_dev *vdev, int fd, uint32_t events);
 #endif /* VHOST_USER_H */

diff --git a/virtio.h b/virtio.h
index 0af259df7dac..3b0df343d6b6 100644
--- a/virtio.h
+++ b/virtio.h
@@ -103,6 +103,7 @@ struct vu_dev_region {
  * @regions:		Guest shared memory regions
  * @features:		Vhost-user features
  * @protocol_features:	Vhost-user protocol features
+ * @log_call_fd:	Eventfd to report logging update
  */
 struct vu_dev {
 	struct ctx *context;
@@ -111,6 +112,7 @@ struct vu_dev {
 	struct vu_virtq vq[VHOST_USER_MAX_QUEUES];
 	uint64_t features;
 	uint64_t protocol_features;
+	int log_call_fd;
 };
 
 /**
-- 
2.47.1
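[Editor's note: the series only shows the back-end side of this eventfd.
A minimal sketch of the consumer loop the front-end would typically run,
assuming nothing beyond standard eventfd(2)/poll(2) semantics; the
function name is hypothetical and this is not QEMU's actual code.]

    #include <sys/eventfd.h>
    #include <poll.h>

    /* Hypothetical front-end consumer of the log eventfd: wait for the
     * back-end's eventfd_write(), then read the counter so it resets to
     * zero and the next kick is visible again. Scanning the log table
     * itself is left out.
     */
    static int wait_log_kick(int log_call_fd)
    {
    	struct pollfd pfd = { .fd = log_call_fd, .events = POLLIN };
    	eventfd_t kicks;

    	if (poll(&pfd, 1, -1) <= 0)
    		return -1;

    	/* Returns 0 on success; the counter value lands in 'kicks' */
    	return eventfd_read(log_call_fd, &kicks);
    }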
vu_dev will be needed to log page updates. Add the parameter to:

  vring_used_write()
  vu_queue_fill_by_index()
  vu_queue_fill()
  vring_used_idx_set()
  vu_queue_flush()

The new parameter is unused for now.

Signed-off-by: Laurent Vivier <lvivier(a)redhat.com>
---
 virtio.c    | 32 ++++++++++++++++++++++----------
 virtio.h    | 10 ++++++----
 vu_common.c |  8 ++++----
 3 files changed, 32 insertions(+), 18 deletions(-)

diff --git a/virtio.c b/virtio.c
index 625bac385f0c..52d5a4d4be52 100644
--- a/virtio.c
+++ b/virtio.c
@@ -580,28 +580,34 @@ bool vu_queue_rewind(struct vu_virtq *vq, unsigned int num)
 
 /**
  * vring_used_write() - Write an entry in the used ring
+ * @dev:	Vhost-user device
  * @vq:		Virtqueue
  * @uelem:	Entry to write
  * @i:		Index of the entry in the used ring
  */
-static inline void vring_used_write(struct vu_virtq *vq,
+static inline void vring_used_write(const struct vu_dev *vdev,
+				    struct vu_virtq *vq,
 				    const struct vring_used_elem *uelem, int i)
 {
 	struct vring_used *used = vq->vring.used;
 
 	used->ring[i] = *uelem;
+	(void)vdev;
 }
 
+
 /**
  * vu_queue_fill_by_index() - Update information of a descriptor ring entry
  *			      in the used ring
+ * @dev:	Vhost-user device
  * @vq:		Virtqueue
  * @index:	Descriptor ring index
  * @len:	Size of the element
  * @idx:	Used ring entry index
  */
-void vu_queue_fill_by_index(struct vu_virtq *vq, unsigned int index,
-			    unsigned int len, unsigned int idx)
+void vu_queue_fill_by_index(const struct vu_dev *vdev, struct vu_virtq *vq,
+			    unsigned int index, unsigned int len,
+			    unsigned int idx)
 {
 	struct vring_used_elem uelem;
 
@@ -612,7 +618,7 @@ void vu_queue_fill_by_index(struct vu_virtq *vq, unsigned int index,
 	uelem.id = htole32(index);
 	uelem.len = htole32(len);
 
-	vring_used_write(vq, &uelem, idx);
+	vring_used_write(vdev, vq, &uelem, idx);
 }
 
 /**
@@ -623,30 +629,36 @@ void vu_queue_fill_by_index(struct vu_virtq *vq, unsigned int index,
  * @len:	Size of the element
  * @idx:	Used ring entry index
  */
-void vu_queue_fill(struct vu_virtq *vq, const struct vu_virtq_element *elem,
-		   unsigned int len, unsigned int idx)
+void vu_queue_fill(const struct vu_dev *vdev, struct vu_virtq *vq,
+		   const struct vu_virtq_element *elem, unsigned int len,
+		   unsigned int idx)
 {
-	vu_queue_fill_by_index(vq, elem->index, len, idx);
+	vu_queue_fill_by_index(vdev, vq, elem->index, len, idx);
 }
 
 /**
  * vring_used_idx_set() - Set the descriptor ring current index
+ * @dev:	Vhost-user device
  * @vq:		Virtqueue
  * @val:	Value to set in the index
  */
-static inline void vring_used_idx_set(struct vu_virtq *vq, uint16_t val)
+static inline void vring_used_idx_set(const struct vu_dev *vdev,
+				      struct vu_virtq *vq, uint16_t val)
 {
 	vq->vring.used->idx = htole16(val);
+	(void)vdev;
 
 	vq->used_idx = val;
 }
 
 /**
  * vu_queue_flush() - Flush the virtqueue
+ * @dev:	Vhost-user device
  * @vq:		Virtqueue
  * @count:	Number of entry to flush
  */
-void vu_queue_flush(struct vu_virtq *vq, unsigned int count)
+void vu_queue_flush(const struct vu_dev *vdev, struct vu_virtq *vq,
+		    unsigned int count)
 {
 	uint16_t old, new;
 
@@ -658,7 +670,7 @@ void vu_queue_flush(struct vu_virtq *vq, unsigned int count)
 	old = vq->used_idx;
 	new = old + count;
-	vring_used_idx_set(vq, new);
+	vring_used_idx_set(vdev, vq, new);
 
 	vq->inuse -= count;
 	if ((uint16_t)(new - vq->signalled_used) < (uint16_t)(new - old))
 		vq->signalled_used_valid = false;

diff --git a/virtio.h b/virtio.h
index 3b0df343d6b6..d95bb07bb913 100644
--- a/virtio.h
+++ b/virtio.h
@@ -177,10 +177,12 @@ int vu_queue_pop(const struct vu_dev *dev, struct vu_virtq *vq,
 void vu_queue_detach_element(struct vu_virtq *vq);
 void vu_queue_unpop(struct vu_virtq *vq);
 bool vu_queue_rewind(struct vu_virtq *vq, unsigned int num);
-void vu_queue_fill_by_index(struct vu_virtq *vq, unsigned int index,
-			    unsigned int len, unsigned int idx);
-void vu_queue_fill(struct vu_virtq *vq,
+void vu_queue_fill_by_index(const struct vu_dev *vdev, struct vu_virtq *vq,
+			    unsigned int index, unsigned int len,
+			    unsigned int idx);
+void vu_queue_fill(const struct vu_dev *vdev, struct vu_virtq *vq,
 		   const struct vu_virtq_element *elem, unsigned int len,
 		   unsigned int idx);
-void vu_queue_flush(struct vu_virtq *vq, unsigned int count);
+void vu_queue_flush(const struct vu_dev *vdev, struct vu_virtq *vq,
+		    unsigned int count);
 #endif /* VIRTIO_H */

diff --git a/vu_common.c b/vu_common.c
index 6d365bea5fe2..16e7e76a07f3 100644
--- a/vu_common.c
+++ b/vu_common.c
@@ -140,9 +140,9 @@ void vu_flush(const struct vu_dev *vdev, struct vu_virtq *vq,
 	int i;
 
 	for (i = 0; i < elem_cnt; i++)
-		vu_queue_fill(vq, &elem[i], elem[i].in_sg[0].iov_len, i);
+		vu_queue_fill(vdev, vq, &elem[i], elem[i].in_sg[0].iov_len, i);
 
-	vu_queue_flush(vq, elem_cnt);
+	vu_queue_flush(vdev, vq, elem_cnt);
 	vu_queue_notify(vdev, vq);
 }
 
@@ -194,8 +194,8 @@ static void vu_handle_tx(struct vu_dev *vdev, int index,
 		int i;
 
 		for (i = 0; i < count; i++)
-			vu_queue_fill(vq, &elem[i], 0, i);
-		vu_queue_flush(vq, count);
+			vu_queue_fill(vdev, vq, &elem[i], 0, i);
+		vu_queue_flush(vdev, vq, count);
 		vu_queue_notify(vdev, vq);
 	}
 }

-- 
2.47.1
Set the logging shared memory space.

When the back-end has the VHOST_USER_PROTOCOL_F_LOG_SHMFD protocol
feature, the log memory fd is provided in the ancillary data of the
VHOST_USER_SET_LOG_BASE message, and the size and offset of the shared
memory area are provided in the message payload.

Signed-off-by: Laurent Vivier <lvivier(a)redhat.com>
---
 util.h       |  3 ++
 vhost_user.c | 87 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 vhost_user.h |  3 ++
 virtio.c     | 74 ++++++++++++++++++++++++++++++++++++++++++--
 virtio.h     |  4 +++
 5 files changed, 168 insertions(+), 3 deletions(-)

diff --git a/util.h b/util.h
index 3fa1d12544a0..d02333d5a88d 100644
--- a/util.h
+++ b/util.h
@@ -152,6 +152,9 @@ static inline void barrier(void) { __asm__ __volatile__("" ::: "memory"); }
 #define smp_wmb()	smp_mb_release()
 #define smp_rmb()	smp_mb_acquire()
 
+#define qatomic_or(ptr, n) \
+	((void) __atomic_fetch_or(ptr, n, __ATOMIC_SEQ_CST))
+
 #define NS_FN_STACK_SIZE	(1024 * 1024) /* 1MiB */
 int do_clone(int (*fn)(void *), char *stack_area, size_t stack_size,
 	     int flags,

diff --git a/vhost_user.c b/vhost_user.c
index ce4373d9eeca..c2fac58badf1 100644
--- a/vhost_user.c
+++ b/vhost_user.c
@@ -510,6 +510,12 @@ static bool vu_set_mem_table_exec(struct vu_dev *vdev,
  */
 static void vu_close_log(struct vu_dev *vdev)
 {
+	if (vdev->log_table) {
+		if (munmap(vdev->log_table, vdev->log_size) != 0)
+			die_perror("close log munmap() error");
+		vdev->log_table = NULL;
+	}
+
 	if (vdev->log_call_fd != -1) {
 		close(vdev->log_call_fd);
 		vdev->log_call_fd = -1;
@@ -520,7 +526,6 @@ static void vu_close_log(struct vu_dev *vdev)
  * vu_log_kick() - Inform the front-end that the log has been modified
  * @vdev:	vhost-user device
  */
-/* cppcheck-suppress unusedFunction */
 void vu_log_kick(const struct vu_dev *vdev)
 {
 	if (vdev->log_call_fd != -1) {
@@ -532,6 +537,84 @@ void vu_log_kick(const struct vu_dev *vdev)
 	}
 }
 
+
+/**
+ * vu_log_page() -- Update logging table
+ * @log_table:	Base address of the logging table
+ * @page:	Page number that has been updated
+ */
+/* NOLINTNEXTLINE(readability-non-const-parameter) */
+static void vu_log_page(uint8_t *log_table, uint64_t page)
+{
+	qatomic_or(&log_table[page / 8], 1 << (page % 8));
+}
+
+/**
+ * vu_log_write() -- Log memory write
+ * @dev:	Vhost-user device
+ * @address:	Memory address
+ * @length:	Memory size
+ */
+void vu_log_write(const struct vu_dev *vdev, uint64_t address, uint64_t length)
+{
+	uint64_t page;
+
+	if (!vdev->log_table || !length ||
+	    !vu_has_feature(vdev, VHOST_F_LOG_ALL))
+		return;
+
+	page = address / VHOST_LOG_PAGE;
+	while (page * VHOST_LOG_PAGE < address + length) {
+		vu_log_page(vdev->log_table, page);
+		page++;
+	}
+	vu_log_kick(vdev);
+}
+
+/**
+ * vu_set_log_base_exec() - Set the memory log base
+ * @vdev:	vhost-user device
+ * @vmsg:	vhost-user message
+ *
+ * Return: False as no reply is requested
+ *
+ * #syscalls:vu mmap munmap
+ */
+static bool vu_set_log_base_exec(struct vu_dev *vdev,
+				 struct vhost_user_msg *msg)
+{
+	uint64_t log_mmap_size, log_mmap_offset;
+	void *base;
+	int fd;
+
+	if (msg->fd_num != 1 || msg->hdr.size != sizeof(msg->payload.log))
+		die("Invalid log_base message");
+
+	fd = msg->fds[0];
+	log_mmap_offset = msg->payload.log.mmap_offset;
+	log_mmap_size = msg->payload.log.mmap_size;
+
+	debug("Log mmap_offset: %"PRId64, log_mmap_offset);
+	debug("Log mmap_size: %"PRId64, log_mmap_size);
+
+	base = mmap(0, log_mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd,
+		    log_mmap_offset);
+	close(fd);
+	if (base == MAP_FAILED)
+		die("log mmap error");
+
+	if (vdev->log_table)
+		munmap(vdev->log_table, vdev->log_size);
+
+	vdev->log_table = base;
+	vdev->log_size = log_mmap_size;
+
+	msg->hdr.size = sizeof(msg->payload.u64);
+	msg->fd_num = 0;
+
+	return true;
+}
+
 /**
  * vu_set_log_fd_exec() -- Set the eventfd used to report logging update
  * @vdev:	vhost-user device
@@ -915,6 +998,7 @@ void vu_init(struct ctx *c)
 			.notification = true,
 		};
 	}
+	c->vdev->log_table = NULL;
 	c->vdev->log_call_fd = -1;
 }
 
@@ -984,6 +1068,7 @@ static bool (*vu_handle[VHOST_USER_MAX])(struct vu_dev *vdev,
 	[VHOST_USER_GET_QUEUE_NUM]	= vu_get_queue_num_exec,
 	[VHOST_USER_SET_OWNER]		= vu_set_owner_exec,
 	[VHOST_USER_SET_MEM_TABLE]	= vu_set_mem_table_exec,
+	[VHOST_USER_SET_LOG_BASE]	= vu_set_log_base_exec,
 	[VHOST_USER_SET_LOG_FD]		= vu_set_log_fd_exec,
 	[VHOST_USER_SET_VRING_NUM]	= vu_set_vring_num_exec,
 	[VHOST_USER_SET_VRING_ADDR]	= vu_set_vring_addr_exec,

diff --git a/vhost_user.h b/vhost_user.h
index 2fc0342ff5ba..22a5d059073f 100644
--- a/vhost_user.h
+++ b/vhost_user.h
@@ -15,6 +15,7 @@
 #include "iov.h"
 
 #define VHOST_USER_F_PROTOCOL_FEATURES 30
+#define VHOST_LOG_PAGE 4096
 
 #define VHOST_MEMORY_BASELINE_NREGIONS 8
 
@@ -241,5 +242,7 @@ void vu_print_capabilities(void);
 void vu_init(struct ctx *c);
 void vu_cleanup(struct vu_dev *vdev);
 void vu_log_kick(const struct vu_dev *vdev);
+void vu_log_write(const struct vu_dev *vdev, uint64_t address,
+		  uint64_t length);
 void vu_control_handler(struct vu_dev *vdev, int fd, uint32_t events);
 #endif /* VHOST_USER_H */

diff --git a/virtio.c b/virtio.c
index 52d5a4d4be52..13838586ad1a 100644
--- a/virtio.c
+++ b/virtio.c
@@ -81,6 +81,7 @@
 
 #include "util.h"
 #include "virtio.h"
+#include "vhost_user.h"
 
 #define VIRTQUEUE_MAX_SIZE 1024
 
@@ -592,7 +593,72 @@ static inline void vring_used_write(const struct vu_dev *vdev,
 	struct vring_used *used = vq->vring.used;
 
 	used->ring[i] = *uelem;
-	(void)vdev;
+	vu_log_write(vdev, vq->vring.log_guest_addr +
+		     offsetof(struct vring_used, ring[i]),
+		     sizeof(used->ring[i]));
+}
+
+/**
+ * vu_log_queue_fill() -- Log virtqueue memory update
+ * @dev:	Vhost-user device
+ * @vq:		Virtqueue
+ * @index:	Descriptor ring index
+ * @len:	Size of the element
+ */
+static void vu_log_queue_fill(const struct vu_dev *vdev, struct vu_virtq *vq,
+			      unsigned int index, unsigned int len)
+{
+	struct vring_desc desc_buf[VIRTQUEUE_MAX_SIZE];
+	struct vring_desc *desc = vq->vring.desc;
+	unsigned int max, min;
+	unsigned num_bufs = 0;
+	uint64_t read_len;
+
+	if (!vdev->log_table || !len || !vu_has_feature(vdev, VHOST_F_LOG_ALL))
+		return;
+
+	max = vq->vring.num;
+
+	if (le16toh(desc[index].flags) & VRING_DESC_F_INDIRECT) {
+		unsigned int desc_len;
+		uint64_t desc_addr;
+
+		if (le32toh(desc[index].len) % sizeof(struct vring_desc))
+			die("Invalid size for indirect buffer table");
+
+		/* loop over the indirect descriptor table */
+		desc_addr = le64toh(desc[index].addr);
+		desc_len = le32toh(desc[index].len);
+		max = desc_len / sizeof(struct vring_desc);
+		read_len = desc_len;
+		desc = vu_gpa_to_va(vdev, &read_len, desc_addr);
+		if (desc && read_len != desc_len) {
+			/* Failed to use zero copy */
+			desc = NULL;
+			if (!virtqueue_read_indirect_desc(vdev, desc_buf,
+							  desc_addr,
+							  desc_len))
+				desc = desc_buf;
+		}
+
+		if (!desc)
+			die("Invalid indirect buffer table");
+
+		index = 0;
+	}
+
+	do {
+		if (++num_bufs > max)
+			die("Looped descriptor");
+
+		if (le16toh(desc[index].flags) & VRING_DESC_F_WRITE) {
+			min = MIN(le32toh(desc[index].len), len);
+			vu_log_write(vdev, le64toh(desc[index].addr), min);
+			len -= min;
+		}
+	} while (len > 0 &&
+		 (virtqueue_read_next_desc(desc, index, max, &index) ==
+		  VIRTQUEUE_READ_DESC_MORE));
 }
 
@@ -614,6 +680,8 @@ void vu_queue_fill_by_index(const struct vu_dev *vdev, struct vu_virtq *vq,
 	if (!vq->vring.avail)
 		return;
 
+	vu_log_queue_fill(vdev, vq, index, len);
+
 	idx = (idx + vq->used_idx) % vq->vring.num;
 
 	uelem.id = htole32(index);
@@ -646,7 +714,9 @@ static inline void vring_used_idx_set(const struct vu_dev *vdev,
 				      struct vu_virtq *vq, uint16_t val)
 {
 	vq->vring.used->idx = htole16(val);
-	(void)vdev;
+	vu_log_write(vdev, vq->vring.log_guest_addr +
+		     offsetof(struct vring_used, idx),
+		     sizeof(vq->vring.used->idx));
 
 	vq->used_idx = val;
 }

diff --git a/virtio.h b/virtio.h
index d95bb07bb913..f572341a0034 100644
--- a/virtio.h
+++ b/virtio.h
@@ -104,6 +104,8 @@ struct vu_dev_region {
  * @features:		Vhost-user features
  * @protocol_features:	Vhost-user protocol features
  * @log_call_fd:	Eventfd to report logging update
+ * @log_size:		Size of the logging memory region
+ * @log_table:		Base of the logging memory region
  */
 struct vu_dev {
 	struct ctx *context;
@@ -113,6 +115,8 @@ struct vu_dev {
 	uint64_t features;
 	uint64_t protocol_features;
 	int log_call_fd;
+	uint64_t log_size;
+	uint8_t *log_table;
 };
 
 /**
-- 
2.47.1
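[Editor's note: the log area set up above is a plain bitmap, one bit per
VHOST_LOG_PAGE (4 KiB) of guest memory, so bit page % 8 of byte page / 8
covers guest physical addresses [page * 4096, (page + 1) * 4096). A
minimal sketch of how a reader could walk it; this is hypothetical
illustration, not code from the series (passt only ever writes this
table, the front-end reads it).]

    #include <stdint.h>
    #include <stdio.h>

    #define VHOST_LOG_PAGE 4096

    /* Hypothetical reader of the shared dirty log: report each guest
     * physical page whose bit is set, mirroring what vu_log_page()
     * sets on the back-end side.
     */
    static void scan_dirty_log(const uint8_t *log_table, uint64_t log_size)
    {
    	uint64_t page;

    	for (page = 0; page < log_size * 8; page++) {
    		if (log_table[page / 8] & (1 << (page % 8)))
    			printf("dirty gpa [%#llx, %#llx)\n",
    			       (unsigned long long)(page * VHOST_LOG_PAGE),
    			       (unsigned long long)((page + 1) * VHOST_LOG_PAGE));
    	}
    }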
This feature allows QEMU to be migrated. We also need to report
VHOST_F_LOG_ALL.

This protocol feature reports that we can log page updates and that we
implement VHOST_USER_SET_LOG_BASE and VHOST_USER_SET_LOG_FD.

Signed-off-by: Laurent Vivier <lvivier(a)redhat.com>
---
 vhost_user.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/vhost_user.c b/vhost_user.c
index c2fac58badf1..3264949e349e 100644
--- a/vhost_user.c
+++ b/vhost_user.c
@@ -334,6 +334,7 @@ static bool vu_get_features_exec(struct vu_dev *vdev,
 	uint64_t features =
 		1ULL << VIRTIO_F_VERSION_1 |
 		1ULL << VIRTIO_NET_F_MRG_RXBUF |
+		1ULL << VHOST_F_LOG_ALL |
 		1ULL << VHOST_USER_F_PROTOCOL_FEATURES;
 
 	(void)vdev;
@@ -912,7 +913,8 @@ static bool vu_set_vring_err_exec(struct vu_dev *vdev,
 static bool vu_get_protocol_features_exec(struct vu_dev *vdev,
 					  struct vhost_user_msg *msg)
 {
-	uint64_t features = 1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK;
+	uint64_t features = 1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK |
+			    1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD;
 
 	(void)vdev;
 	vmsg_set_reply_u64(msg, features);
-- 
2.47.1
After transferring the back-end's internal state during migration,
check whether the back-end was able to fully process the state.

The returned value indicates success or error; 0 is success, any
non-zero value is an error.

Signed-off-by: Laurent Vivier <lvivier(a)redhat.com>
---
 vhost_user.c | 21 +++++++++++++++++++++
 virtio.h     | 18 ++++++++++--------
 2 files changed, 31 insertions(+), 8 deletions(-)

diff --git a/vhost_user.c b/vhost_user.c
index 3264949e349e..90c46d5b89fd 100644
--- a/vhost_user.c
+++ b/vhost_user.c
@@ -981,6 +981,23 @@ static bool vu_set_vring_enable_exec(struct vu_dev *vdev,
 	return false;
 }
 
+/**
+ * vu_check_device_state_exec() -- Return device state migration result
+ * @vdev:	vhost-user device
+ * @vmsg:	vhost-user message
+ *
+ * Return: True as the reply contains the migration result
+ */
+static bool vu_check_device_state_exec(struct vu_dev *vdev,
+				       struct vhost_user_msg *msg)
+{
+	(void)vdev;
+
+	vmsg_set_reply_u64(msg, vdev->device_state_result);
+
+	return true;
+}
+
 /**
  * vu_init() - Initialize vhost-user device structure
  * @c:		execution context
@@ -1002,6 +1019,7 @@ void vu_init(struct ctx *c)
 	}
 	c->vdev->log_table = NULL;
 	c->vdev->log_call_fd = -1;
+	c->vdev->device_state_result = -1;
 }
 
 
@@ -1050,6 +1068,8 @@ void vu_cleanup(struct vu_dev *vdev)
 	vdev->nregions = 0;
 
 	vu_close_log(vdev);
+
+	vdev->device_state_result = -1;
 }
 
 /**
@@ -1080,6 +1100,7 @@ static bool (*vu_handle[VHOST_USER_MAX])(struct vu_dev *vdev,
 	[VHOST_USER_SET_VRING_CALL]	= vu_set_vring_call_exec,
 	[VHOST_USER_SET_VRING_ERR]	= vu_set_vring_err_exec,
 	[VHOST_USER_SET_VRING_ENABLE]	= vu_set_vring_enable_exec,
+	[VHOST_USER_CHECK_DEVICE_STATE] = vu_check_device_state_exec,
 };
 
 /**

diff --git a/virtio.h b/virtio.h
index f572341a0034..512ec1bedcd3 100644
--- a/virtio.h
+++ b/virtio.h
@@ -98,14 +98,15 @@ struct vu_dev_region {
 
 /**
  * struct vu_dev - vhost-user device information
- * @context:		Execution context
- * @nregions:		Number of shared memory regions
- * @regions:		Guest shared memory regions
- * @features:		Vhost-user features
- * @protocol_features:	Vhost-user protocol features
- * @log_call_fd:	Eventfd to report logging update
- * @log_size:		Size of the logging memory region
- * @log_table:		Base of the logging memory region
+ * @context:			Execution context
+ * @nregions:			Number of shared memory regions
+ * @regions:			Guest shared memory regions
+ * @features:			Vhost-user features
+ * @protocol_features:		Vhost-user protocol features
+ * @log_call_fd:		Eventfd to report logging update
+ * @log_size:			Size of the logging memory region
+ * @log_table:			Base of the logging memory region
+ * @device_state_result:	Device state migration result
  */
 struct vu_dev {
 	struct ctx *context;
@@ -117,6 +118,7 @@ struct vu_dev {
 	int log_call_fd;
 	uint64_t log_size;
 	uint8_t *log_table;
+	int device_state_result;
 };
 
 /**
-- 
2.47.1
Set the file descriptor to use to transfer the backend device state
during migration.

Signed-off-by: Laurent Vivier <lvivier(a)redhat.com>
---
 epoll_type.h |  2 ++
 passt.c      |  4 +++
 vhost_user.c | 81 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 virtio.h     |  2 ++
 vu_common.c  | 49 +++++++++++++++++++++++++++++++
 vu_common.h  |  1 +
 6 files changed, 138 insertions(+), 1 deletion(-)

diff --git a/epoll_type.h b/epoll_type.h
index f3ef41584757..fd9eac392f77 100644
--- a/epoll_type.h
+++ b/epoll_type.h
@@ -40,6 +40,8 @@ enum epoll_type {
 	EPOLL_TYPE_VHOST_CMD,
 	/* vhost-user kick event socket */
 	EPOLL_TYPE_VHOST_KICK,
+	/* vhost-user migration socket */
+	EPOLL_TYPE_VHOST_MIGRATION,
 
 	EPOLL_NUM_TYPES,
 };

diff --git a/passt.c b/passt.c
index 957f3d0f4ddc..25d9823739cf 100644
--- a/passt.c
+++ b/passt.c
@@ -75,6 +75,7 @@ char *epoll_type_str[] = {
 	[EPOLL_TYPE_TAP_LISTEN]		= "listening qemu socket",
 	[EPOLL_TYPE_VHOST_CMD]		= "vhost-user command socket",
 	[EPOLL_TYPE_VHOST_KICK]		= "vhost-user kick socket",
+	[EPOLL_TYPE_VHOST_MIGRATION]	= "vhost-user migration socket",
 };
 static_assert(ARRAY_SIZE(epoll_type_str) == EPOLL_NUM_TYPES,
 	      "epoll_type_str[] doesn't match enum epoll_type");
@@ -356,6 +357,9 @@ loop:
 	case EPOLL_TYPE_VHOST_KICK:
 		vu_kick_cb(c.vdev, ref, &now);
 		break;
+	case EPOLL_TYPE_VHOST_MIGRATION:
+		vu_migrate(c.vdev, eventmask);
+		break;
 	default:
 		/* Can't happen */
 		ASSERT(0);

diff --git a/vhost_user.c b/vhost_user.c
index 90c46d5b89fd..11b0b447850d 100644
--- a/vhost_user.c
+++ b/vhost_user.c
@@ -981,6 +981,78 @@ static bool vu_set_vring_enable_exec(struct vu_dev *vdev,
 	return false;
 }
 
+/**
+ * vu_set_migration_watch() -- Add the migration file descriptor to
+ *			       the passt epoll file descriptor
+ * @vdev:	vhost-user device
+ * @fd:		File descriptor to add
+ * @direction:	Direction of the migration (save or load backend state)
+ */
+static void vu_set_migration_watch(const struct vu_dev *vdev, int fd,
+				   int direction)
+{
+	union epoll_ref ref = {
+		.type = EPOLL_TYPE_VHOST_MIGRATION,
+		.fd = fd,
+	};
+	struct epoll_event ev = { 0 };
+
+	ev.data.u64 = ref.u64;
+	switch (direction) {
+	case VHOST_USER_TRANSFER_STATE_DIRECTION_SAVE:
+		ev.events = EPOLLOUT;
+		break;
+	case VHOST_USER_TRANSFER_STATE_DIRECTION_LOAD:
+		ev.events = EPOLLIN;
+		break;
+	default:
+		ASSERT(0);
+	}
+
+	epoll_ctl(vdev->context->epollfd, EPOLL_CTL_ADD, ref.fd, &ev);
+}
+
+/**
+ * vu_set_device_state_fd_exec() -- Set the device state migration channel
+ * @vdev:	vhost-user device
+ * @vmsg:	vhost-user message
+ *
+ * Return: True as the reply contains 0 to indicate success
+ *         and set bit 8 as we don't provide our own fd
+ */
+static bool vu_set_device_state_fd_exec(struct vu_dev *vdev,
+					struct vhost_user_msg *msg)
+{
+	unsigned int direction = msg->payload.transfer_state.direction;
+	unsigned int phase = msg->payload.transfer_state.phase;
+
+	if (msg->fd_num != 1)
+		die("Invalid device_state_fd message");
+
+	if (phase != VHOST_USER_TRANSFER_STATE_PHASE_STOPPED)
+		die("Invalid device_state_fd phase: %d", phase);
+
+	if (direction != VHOST_USER_TRANSFER_STATE_DIRECTION_SAVE &&
+	    direction != VHOST_USER_TRANSFER_STATE_DIRECTION_LOAD)
+		die("Invalid device_state_fd direction: %d", direction);
+
+	if (vdev->device_state_fd != -1) {
+		vu_remove_watch(vdev, vdev->device_state_fd);
+		close(vdev->device_state_fd);
+	}
+
+	vdev->device_state_fd = msg->fds[0];
+	vdev->device_state_result = -1;
+	vu_set_migration_watch(vdev, vdev->device_state_fd, direction);
+
+	debug("Got device_state_fd: %d", vdev->device_state_fd);
+
+	/* We don't provide a new fd for the data transfer */
+	vmsg_set_reply_u64(msg, VHOST_USER_VRING_NOFD_MASK);
+
+	return true;
+}
+
 /**
  * vu_check_device_state_exec() -- Return device state migration result
  * @vdev:	vhost-user device
@@ -1019,6 +1091,7 @@ void vu_init(struct ctx *c)
 	}
 	c->vdev->log_table = NULL;
 	c->vdev->log_call_fd = -1;
+	c->vdev->device_state_fd = -1;
 	c->vdev->device_state_result = -1;
 }
 
@@ -1069,7 +1142,12 @@ void vu_cleanup(struct vu_dev *vdev)
 
 	vu_close_log(vdev);
 
-	vdev->device_state_result = -1;
+	if (vdev->device_state_fd != -1) {
+		vu_remove_watch(vdev, vdev->device_state_fd);
+		close(vdev->device_state_fd);
+		vdev->device_state_fd = -1;
+		vdev->device_state_result = -1;
+	}
 }
 
 /**
@@ -1100,6 +1178,7 @@ static bool (*vu_handle[VHOST_USER_MAX])(struct vu_dev *vdev,
 	[VHOST_USER_SET_VRING_CALL]	 = vu_set_vring_call_exec,
 	[VHOST_USER_SET_VRING_ERR]	 = vu_set_vring_err_exec,
 	[VHOST_USER_SET_VRING_ENABLE]	 = vu_set_vring_enable_exec,
+	[VHOST_USER_SET_DEVICE_STATE_FD] = vu_set_device_state_fd_exec,
 	[VHOST_USER_CHECK_DEVICE_STATE]	 = vu_check_device_state_exec,
 };

diff --git a/virtio.h b/virtio.h
index 512ec1bedcd3..7bef2d274acd 100644
--- a/virtio.h
+++ b/virtio.h
@@ -106,6 +106,7 @@ struct vu_dev_region {
  * @log_call_fd:		Eventfd to report logging update
  * @log_size:			Size of the logging memory region
  * @log_table:			Base of the logging memory region
+ * @device_state_fd:		Device state migration channel
  * @device_state_result:	Device state migration result
  */
 struct vu_dev {
@@ -118,6 +119,7 @@ struct vu_dev {
 	int log_call_fd;
 	uint64_t log_size;
 	uint8_t *log_table;
+	int device_state_fd;
 	int device_state_result;
 };

diff --git a/vu_common.c b/vu_common.c
index 16e7e76a07f3..3142b585c29f 100644
--- a/vu_common.c
+++ b/vu_common.c
@@ -281,3 +281,52 @@ err:
 
 	return -1;
 }
+
+/**
+ * vu_migrate() -- Send/receive passt insternal state to/from QEMU
+ * @vdev:	vhost-user device
+ * @events:	epoll events
+ */
+void vu_migrate(struct vu_dev *vdev, uint32_t events)
+{
+	int ret;
+
+	/* TODO: collect/set passt internal state
+	 * and use vdev->device_state_fd to send/receive it
+	 */
+	debug("vu_migrate fd %d events %x", vdev->device_state_fd, events);
+	if (events & EPOLLOUT) {
+		debug("Saving backend state");
+
+		/* send some stuff */
+		ret = write(vdev->device_state_fd, "PASST", 6);
+		/* value to be returned by VHOST_USER_CHECK_DEVICE_STATE */
+		vdev->device_state_result = ret == -1 ? -1 : 0;
+		/* Closing the file descriptor signals the end of transfer */
+		epoll_ctl(vdev->context->epollfd, EPOLL_CTL_DEL,
+			  vdev->device_state_fd, NULL);
+		close(vdev->device_state_fd);
+		vdev->device_state_fd = -1;
+	} else if (events & EPOLLIN) {
+		char buf[6];
+
+		debug("Loading backend state");
+		/* read some stuff */
+		ret = read(vdev->device_state_fd, buf, sizeof(buf));
+		/* value to be returned by VHOST_USER_CHECK_DEVICE_STATE */
+		if (ret != sizeof(buf)) {
+			vdev->device_state_result = -1;
+		} else {
+			ret = strncmp(buf, "PASST", sizeof(buf));
+			vdev->device_state_result = ret == 0 ? 0 : -1;
+		}
+	} else if (events & EPOLLHUP) {
+		debug("Closing migration channel");
+
+		/* The end of file signals the end of the transfer. */
+		epoll_ctl(vdev->context->epollfd, EPOLL_CTL_DEL,
+			  vdev->device_state_fd, NULL);
+		close(vdev->device_state_fd);
+		vdev->device_state_fd = -1;
+	}
+}

diff --git a/vu_common.h b/vu_common.h
index bd70faf3e226..d56c021ab0f9 100644
--- a/vu_common.h
+++ b/vu_common.h
@@ -57,4 +57,5 @@ void vu_flush(const struct vu_dev *vdev, struct vu_virtq *vq,
 void vu_kick_cb(struct vu_dev *vdev, union epoll_ref ref,
 		const struct timespec *now);
 int vu_send_single(const struct ctx *c, const void *buf, size_t size);
+void vu_migrate(struct vu_dev *vdev, uint32_t events);
 #endif /* VU_COMMON_H */
-- 
2.47.1
On Thu, 19 Dec 2024 12:13:59 +0100
Laurent Vivier <lvivier(a)redhat.com> wrote:

> +/**
> + * vu_migrate() -- Send/receive passt insternal state to/from QEMU

Magic!

> + * @vdev:	vhost-user device
> + * @events:	epoll events
> + */
> +void vu_migrate(struct vu_dev *vdev, uint32_t events)
> +{
> +	int ret;
> +
> +	/* TODO: collect/set passt internal state
> +	 * and use vdev->device_state_fd to send/receive it
> +	 */
> +	debug("vu_migrate fd %d events %x", vdev->device_state_fd, events);
> +	if (events & EPOLLOUT) {

I haven't really reviewed the series yet, but I have a preliminary
question: how does the hypervisor tell us that we're writing too much?

I guess we'll do a short write and we'll need to go back to EPOLLOUT?
There's no minimum chunk size we can write, correct?

> +		debug("Saving backend state");
> +
> +		/* send some stuff */
> +		ret = write(vdev->device_state_fd, "PASST", 6);
> +		/* value to be returned by VHOST_USER_CHECK_DEVICE_STATE */
> +		vdev->device_state_result = ret == -1 ? -1 : 0;
> +		/* Closing the file descriptor signals the end of transfer */
> +		epoll_ctl(vdev->context->epollfd, EPOLL_CTL_DEL,
> +			  vdev->device_state_fd, NULL);
> +		close(vdev->device_state_fd);
> +		vdev->device_state_fd = -1;
> +	} else if (events & EPOLLIN) {

...and similarly here, I guess we'll get a short read?

> +		char buf[6];
> +
> +		debug("Loading backend state");
> +		/* read some stuff */
> +		ret = read(vdev->device_state_fd, buf, sizeof(buf));
> +		/* value to be returned by VHOST_USER_CHECK_DEVICE_STATE */
> +		if (ret != sizeof(buf)) {
> +			vdev->device_state_result = -1;
> +		} else {
> +			ret = strncmp(buf, "PASST", sizeof(buf));
> +			vdev->device_state_result = ret == 0 ? 0 : -1;
> +		}
> +	} else if (events & EPOLLHUP) {
> +		debug("Closing migration channel");
> +
> +		/* The end of file signals the end of the transfer. */
> +		epoll_ctl(vdev->context->epollfd, EPOLL_CTL_DEL,
> +			  vdev->device_state_fd, NULL);
> +		close(vdev->device_state_fd);
> +		vdev->device_state_fd = -1;
> +	}
> +}

-- 
Stefano
On 19/12/2024 20:47, Stefano Brivio wrote:
> On Thu, 19 Dec 2024 12:13:59 +0100
> Laurent Vivier <lvivier(a)redhat.com> wrote:
>
>> +/**
>> + * vu_migrate() -- Send/receive passt insternal state to/from QEMU
>
> Magic!
>
>> + * @vdev:	vhost-user device
>> + * @events:	epoll events
>> + */
>> +void vu_migrate(struct vu_dev *vdev, uint32_t events)
>> +{
>> +	int ret;
>> +
>> +	/* TODO: collect/set passt internal state
>> +	 * and use vdev->device_state_fd to send/receive it
>> +	 */
>> +	debug("vu_migrate fd %d events %x", vdev->device_state_fd, events);
>> +	if (events & EPOLLOUT) {
>
> I haven't really reviewed the series yet, but I have a preliminary
> question: how does the hypervisor tell us that we're writing too much?

The hypervisor is not aware of anything. It's only a bridge between the
two passt instances.

But normally the destination side can report an error, this aborts the
migration on both sides.

In QEMU, the code is:

    while (true) {
        ssize_t read_ret;

        /* read the data from the backend */
        read_ret = RETRY_ON_EINTR(read(read_fd, transfer_buf, chunk_size));
        if (read_ret < 0) {
            ret = -errno;
            error_setg_errno(errp, -ret, "Failed to receive state");
            goto fail;
        }

        assert(read_ret <= chunk_size);
        qemu_put_be32(f, read_ret);

        if (read_ret == 0) {
            /* EOF */
            break;
        }

        /* send to destination QEMU */
        qemu_put_buffer(f, transfer_buf, read_ret);
    }

    /*
     * Back-end will not really care, but be clean and close our end of the
     * pipe before inquiring the back-end about whether transfer was
     * successful
     */
    close(read_fd);

On the other side:

    while (true) {
        size_t this_chunk_size = qemu_get_be32(f);
        ssize_t write_ret;
        const uint8_t *transfer_pointer;

        if (this_chunk_size == 0) {
            /* End of state */
            break;
        }

        if (transfer_buf_size < this_chunk_size) {
            transfer_buf = g_realloc(transfer_buf, this_chunk_size);
            transfer_buf_size = this_chunk_size;
        }

        if (qemu_get_buffer(f, transfer_buf, this_chunk_size) <
            this_chunk_size) {
            error_setg(errp, "Failed to read state");
            ret = -EINVAL;
            goto fail;
        }

        transfer_pointer = transfer_buf;
        while (this_chunk_size > 0) {
            write_ret = RETRY_ON_EINTR(
                write(write_fd, transfer_pointer, this_chunk_size)
            );
            if (write_ret < 0) {
                ret = -errno;
                error_setg_errno(errp, -ret, "Failed to send state");
                goto fail;
            } else if (write_ret == 0) {
                error_setg(errp, "Failed to send state: Connection is closed");
                ret = -ECONNRESET;
                goto fail;
            }

            assert(write_ret <= this_chunk_size);
            this_chunk_size -= write_ret;
            transfer_pointer += write_ret;
        }
    }

    /*
     * Close our end, thus ending transfer, before inquiring the back-end
     * about whether transfer was successful
     */
    close(write_fd);

Moreover, I think it's important to know, the source side is stopped at
the end of migration but it is in a state it can be restarted.

> I guess we'll do a short write and we'll need to go back to EPOLLOUT?
> There's no minimum chunk size we can write, correct?

Correct. It works even if there is no write() in this code. The
hypervisor reads until we close the file descriptor (it's a pipe in
fact).

>> +	} else if (events & EPOLLIN) {
>
> ...and similarly here, I guess we'll get a short read?

We must read until we get a close().

[...]

Thanks,
Laurent
On Fri, 20 Dec 2024 08:56:27 +0100
Laurent Vivier <lvivier(a)redhat.com> wrote:

> On 19/12/2024 20:47, Stefano Brivio wrote:
>> On Thu, 19 Dec 2024 12:13:59 +0100
>> Laurent Vivier <lvivier(a)redhat.com> wrote:
>>>
>>> [patch quote trimmed]
>>
>> I haven't really reviewed the series yet, but I have a preliminary
>> question: how does the hypervisor tell us that we're writing too much?
>
> The hypervisor is not aware of anything. It's only a bridge between
> the two passt instances.

Well, conceptually, yes, but:

> But normally the destination side can report an error, this aborts
> the migration on both sides.
>
> In QEMU, the code is:

[Thanks, this makes everything much easier]

>     while (true) {
>         ssize_t read_ret;
>
>         /* read the data from the backend */
>         read_ret = RETRY_ON_EINTR(read(read_fd, transfer_buf, chunk_size));

...there are two buffers involved here: the file buffer, and QEMU's
'transfer_buf'. While 'transfer_buf' is handled transparently (read and
write right away... except that if writing fails, with the current
implementation, we'll lose data, I guess), the (kernel) buffer we write
to has a size limit. If we write faster than QEMU can call read() on
it, we'll fill it up.

Of course, this won't happen if we write "PASST", but if we transfer a
full flow table, or, perhaps one day, socket queues, that's
megabytes... so we'll have to handle partial writes I suppose.

Anyway, it's surely not troublesome right now, I was just curious to
see how this is handled.

> [QEMU code trimmed]
>
> Moreover, I think it's important to know, the source side is stopped
> at the end of migration but it is in a state it can be restarted.

You mean we'll get EPOLLHUP and we know we should terminate at that
point, right? Or is there another mechanism?

>> I guess we'll do a short write and we'll need to go back to EPOLLOUT?
>> There's no minimum chunk size we can write, correct?
>
> Correct. It works even if there is no write() in this code. The
> hypervisor reads until we close the file descriptor (it's a pipe in
> fact).

Ah, sorry, I meant: there's no minimum chunk size _we are guaranteed to
be able to write_. That is, if we fail after "P", we'll need to note
down the progress and go back to wait for EPOLLOUT.

Not necessarily at the moment, as the context we transfer is small, but
I guess we'll need to handle short writes eventually.

>>> +	} else if (events & EPOLLIN) {
>>
>> ...and similarly here, I guess we'll get a short read?
>
> We must read until we get a close().

Yes, I get that, but symmetrically with the case above, if we get
EAGAIN, I guess we'll need to go back to waiting on EPOLLIN.

-- 
Stefano
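[Editor's note: to make the point about short writes concrete, here is
one way the save path could track progress and re-arm EPOLLOUT instead
of losing data on a partial write. The function name and state layout
are hypothetical; this is a sketch of what the discussion suggests, not
what the series implements.]

    #include <errno.h>
    #include <unistd.h>

    /* Hypothetical save loop: write as much state as the pipe accepts,
     * remember the offset, and return 1 if the caller must keep
     * EPOLLOUT armed and retry later; 0 when everything has been
     * written; -1 on a real error.
     */
    static int save_state_chunk(int fd, const char *state, size_t len,
    			    size_t *offset)
    {
    	while (*offset < len) {
    		ssize_t n = write(fd, state + *offset, len - *offset);

    		if (n < 0) {
    			if (errno == EAGAIN || errno == EINTR)
    				return 1;	/* wait for next EPOLLOUT */
    			return -1;
    		}
    		*offset += (size_t)n;
    	}
    	return 0;	/* done: caller can close fd to signal EOF */
    }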
Report to the front-end that we support the device state commands:

  VHOST_USER_SET_DEVICE_STATE_FD
  VHOST_USER_CHECK_DEVICE_STATE

This feature is needed to transfer the backend state using the
frontend channel.

Signed-off-by: Laurent Vivier <lvivier(a)redhat.com>
---
 vhost_user.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vhost_user.c b/vhost_user.c
index 11b0b447850d..41148c4babd3 100644
--- a/vhost_user.c
+++ b/vhost_user.c
@@ -914,7 +914,8 @@ static bool vu_get_protocol_features_exec(struct vu_dev *vdev,
 					  struct vhost_user_msg *msg)
 {
 	uint64_t features = 1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK |
-			    1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD;
+			    1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD |
+			    1ULL << VHOST_USER_PROTOCOL_F_DEVICE_STATE;
 
 	(void)vdev;
 	vmsg_set_reply_u64(msg, features);
-- 
2.47.1