...and time out after that. This will be needed because of an upcoming
change to passt-repair enabling it to start before passt is started,
on both source and target, by means of an inotify watch.

Once the inotify watch triggers, passt-repair will connect right away,
but we have no guarantees that the connection completes before we
start the migration process, so wait for it (for a reasonable amount
of time).

Signed-off-by: Stefano Brivio <sbrivio(a)redhat.com>
---
v2:
  - Use 10 ms as timeout instead of 100 ms. Given that I'm unable to
    migrate a simple guest with 256 MiB of memory and no storage other
    than an initramfs in less than 4 milliseconds, at least on my test
    system (rather fast CPU threads and memory interface), I think that
    10 ms shouldn't make a big difference in case passt-repair is not
    available for whatever reason

  - Move the static assert next to the initialisation of 'tv'

 flow.c   | 20 ++++++++++++++++++++
 repair.c | 31 +++++++++++++++++++++++++++++++
 repair.h |  1 +
 3 files changed, 52 insertions(+)

diff --git a/flow.c b/flow.c
index 749c498..5e64b79 100644
--- a/flow.c
+++ b/flow.c
@@ -911,6 +911,21 @@ static int flow_migrate_source_rollback(struct ctx *c, unsigned bound, int ret)
 	return ret;
 }
 
+/**
+ * flow_migrate_need_repair() - Do we need to set repair mode for any flow?
+ *
+ * Return: true if repair mode is needed, false otherwise
+ */
+static bool flow_migrate_need_repair(void)
+{
+	union flow *flow;
+
+	foreach_established_tcp_flow(flow)
+		return true;
+
+	return false;
+}
+
 /**
  * flow_migrate_repair_all() - Turn repair mode on or off for all flows
  * @c:		Execution context
@@ -966,6 +981,9 @@ int flow_migrate_source_pre(struct ctx *c, const struct migrate_stage *stage,
 	(void)stage;
 	(void)fd;
 
+	if (flow_migrate_need_repair())
+		repair_wait(c);
+
 	if ((rc = flow_migrate_repair_all(c, true)))
 		return -rc;
 
@@ -1083,6 +1101,8 @@ int flow_migrate_target(struct ctx *c, const struct migrate_stage *stage,
 	if (!count)
 		return 0;
 
+	repair_wait(c);
+
 	if ((rc = flow_migrate_repair_all(c, true)))
 		return -rc;
 
diff --git a/repair.c b/repair.c
index 3ee089f..ebeb248 100644
--- a/repair.c
+++ b/repair.c
@@ -27,6 +27,10 @@
 
 #define SCM_MAX_FD 253 /* From Linux kernel (include/net/scm.h), not in UAPI */
 
+/* Wait for a while for TCP_REPAIR helper to connect if it's not there yet */
+#define REPAIR_ACCEPT_TIMEOUT_MS	10
+#define REPAIR_ACCEPT_TIMEOUT_US	(REPAIR_ACCEPT_TIMEOUT_MS * 1000)
+
 /* Pending file descriptors for next repair_flush() call, or command change */
 static int repair_fds[SCM_MAX_FD];
 
@@ -138,6 +142,33 @@ void repair_handler(struct ctx *c, uint32_t events)
 	repair_close(c);
 }
 
+/**
+ * repair_wait() - Wait (with timeout) for TCP_REPAIR helper to connect
+ * @c:		Execution context
+ */
+void repair_wait(struct ctx *c)
+{
+	struct timeval tv = { .tv_sec = 0,
+			      .tv_usec = (long)(REPAIR_ACCEPT_TIMEOUT_US) };
+	static_assert(REPAIR_ACCEPT_TIMEOUT_US < 1000 * 1000, "timeout >= 1s");
+
+	if (c->fd_repair >= 0 || c->fd_repair_listen == -1)
+		return;
+
+	if (setsockopt(c->fd_repair_listen, SOL_SOCKET, SO_RCVTIMEO,
+		       &tv, sizeof(tv))) {
+		err_perror("Set timeout on TCP_REPAIR listening socket");
+		return;
+	}
+
+	repair_listen_handler(c, EPOLLIN);
+
+	tv.tv_usec = 0;
+	if (setsockopt(c->fd_repair_listen, SOL_SOCKET, SO_RCVTIMEO,
+		       &tv, sizeof(tv)))
+		err_perror("Clear timeout on TCP_REPAIR listening socket");
+}
+
 /**
  * repair_flush() - Flush current set of sockets to helper, with current command
  * @c:		Execution context
diff --git a/repair.h b/repair.h
index de279d6..1d37922 100644
--- a/repair.h
+++ b/repair.h
@@ -10,6 +10,7 @@ void repair_sock_init(const struct ctx *c);
 void repair_listen_handler(struct ctx *c, uint32_t events);
 void repair_handler(struct ctx *c, uint32_t events);
 void repair_close(struct ctx *c);
+void repair_wait(struct ctx *c);
 int repair_flush(struct ctx *c);
 int repair_set(struct ctx *c, int s, int cmd);
 
-- 
2.43.0