}
/**
diff --git a/passt.1 b/passt.1
index 29cc3ed..c81d539 100644
--- a/passt.1
+++ b/passt.1
@@ -418,6 +418,17 @@ Enable vhost-user. The vhost-user command socket is provided by
\fB--socket\fR.
.BR \-\-print-capabilities
Print back-end capabilities in JSON format, only meaningful for vhost-user mode.
+.TP
+.BR \-\-repair-path " " \fIpath
+Path for UNIX domain socket used by the \fBpasst-repair\fR(1) helper to connect
+to \fBpasst\fR in order to set or clear the TCP_REPAIR option on sockets, during
+migration. \fB--repair-path none\fR disables this interface (if you need to
+specify a socket path called "none", you can prefix the path with \fI./\fR).
+
+Default, for \-\-vhost-user mode only, is to append \fI.repair\fR to the path
+chosen for the hypervisor UNIX domain socket. No socket is created if not in
+\-\-vhost-user mode.
+
.TP
.BR \-F ", " \-\-fd " " \fIFD
Pass a pre-opened, connected socket to \fBpasst\fR. Usually the socket is opened
diff --git a/passt.c b/passt.c
index 935a69f..6f9fb4d 100644
--- a/passt.c
+++ b/passt.c
@@ -52,6 +52,7 @@
#include "ndp.h"
#include "vu_common.h"
#include "migrate.h"
+#include "repair.h"
#define EPOLL_EVENTS 8
@@ -76,6 +77,8 @@ char *epoll_type_str[] = {
[EPOLL_TYPE_TAP_LISTEN] = "listening qemu socket",
[EPOLL_TYPE_VHOST_CMD] = "vhost-user command socket",
[EPOLL_TYPE_VHOST_KICK] = "vhost-user kick socket",
+ [EPOLL_TYPE_REPAIR_LISTEN] = "TCP_REPAIR helper listening socket",
+ [EPOLL_TYPE_REPAIR] = "TCP_REPAIR helper socket",
};
static_assert(ARRAY_SIZE(epoll_type_str) == EPOLL_NUM_TYPES,
"epoll_type_str[] doesn't match enum epoll_type");
@@ -358,6 +361,12 @@ loop:
case EPOLL_TYPE_VHOST_KICK:
vu_kick_cb(c.vdev, ref, &now);
break;
+ case EPOLL_TYPE_REPAIR_LISTEN:
+ repair_listen_handler(&c, eventmask);
+ break;
+ case EPOLL_TYPE_REPAIR:
+ repair_handler(&c, eventmask);
+ break;
default:
/* Can't happen */
ASSERT(0);
diff --git a/passt.h b/passt.h
index e73a5ac..c392be0 100644
--- a/passt.h
+++ b/passt.h
@@ -20,6 +20,7 @@ union epoll_ref;
#include "siphash.h"
#include "ip.h"
#include "inany.h"
+#include "migrate.h"
#include "flow.h"
#include "icmp.h"
#include "fwd.h"
@@ -193,6 +194,7 @@ struct ip6_ctx {
* @foreground: Run in foreground, don't log to stderr by default
* @nofile: Maximum number of open files (ulimit -n)
* @sock_path: Path for UNIX domain socket
+ * @repair_path: TCP_REPAIR helper path, can be "none", empty for default
* @pcap: Path for packet capture file
* @pidfile: Path to PID file, empty string if not configured
* @pidfile_fd: File descriptor for PID file, -1 if none
@@ -203,6 +205,8 @@ struct ip6_ctx {
* @epollfd: File descriptor for epoll instance
* @fd_tap_listen: File descriptor for listening AF_UNIX socket, if any
* @fd_tap: AF_UNIX socket, tuntap device, or pre-opened socket
+ * @fd_repair_listen: File descriptor for listening TCP_REPAIR socket, if any
+ * @fd_repair: Connected AF_UNIX socket for TCP_REPAIR helper
* @our_tap_mac: Pasta/passt's MAC on the tap link
* @guest_mac: MAC address of guest or namespace, seen or configured
* @hash_secret: 128-bit secret for siphash functions
@@ -247,6 +251,7 @@ struct ctx {
int foreground;
int nofile;
char sock_path[UNIX_PATH_MAX];
+ char repair_path[UNIX_PATH_MAX];
char pcap[PATH_MAX];
char pidfile[PATH_MAX];
@@ -263,6 +268,8 @@ struct ctx {
int epollfd;
int fd_tap_listen;
int fd_tap;
+ int fd_repair_listen;
+ int fd_repair;
unsigned char our_tap_mac[ETH_ALEN];
unsigned char guest_mac[ETH_ALEN];
uint64_t hash_secret[2];
diff --git a/repair.c b/repair.c
new file mode 100644
index 0000000..784b994
--- /dev/null
+++ b/repair.c
@@ -0,0 +1,212 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+/* PASST - Plug A Simple Socket Transport
+ * for qemu/UNIX domain socket mode
+ *
+ * PASTA - Pack A Subtle Tap Abstraction
+ * for network namespace/tap device mode
+ *
+ * repair.c - Interface (server) for passt-repair, set/clear TCP_REPAIR
+ *
+ * Copyright (c) 2025 Red Hat GmbH
+ * Author: Stefano Brivio <sbrivio(a)redhat.com>
+ */
+
+#include <errno.h>
+#include <sys/uio.h>
+
+#include "util.h"
+#include "ip.h"
+#include "passt.h"
+#include "inany.h"
+#include "flow.h"
+#include "flow_table.h"
+
+#include "repair.h"
+
+#define SCM_MAX_FD 253 /* From Linux kernel (include/net/scm.h), not in UAPI */
+
+/* Pending file descriptors for next repair_flush() call, or command change */
+static int repair_fds[SCM_MAX_FD];
+
+/* Pending command: flush pending file descriptors if it changes */
+static int repair_cmd;
This should be typed as int8_t (see below for more details).
+
+/* Number of pending file descriptors set in @repair_fds */
+static int repair_nfds;
+
+/**
+ * repair_sock_init() - Start listening for connections on helper socket
+ * @c: Execution context
+ */
+void repair_sock_init(const struct ctx *c)
+{
+ union epoll_ref ref = { .type = EPOLL_TYPE_REPAIR_LISTEN };
+ struct epoll_event ev = { 0 };
+
+ if (c->fd_repair_listen == -1)
+ return;
+
+ if (listen(c->fd_repair_listen, 0)) {
+ err_perror("listen() on repair helper socket, won't migrate");
+ return;
+ }
+
+ ref.fd = c->fd_repair_listen;
+ ev.events = EPOLLIN | EPOLLHUP | EPOLLET;
+ ev.data.u64 = ref.u64;
+ if (epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_repair_listen, &ev))
+ err_perror("repair helper socket epoll_ctl(), won't migrate");
+}
+
+/**
+ * repair_listen_handler() - Handle events on TCP_REPAIR helper listening socket
+ * @c: Execution context
+ * @events: epoll events
+ */
+void repair_listen_handler(struct ctx *c, uint32_t events)
+{
+ union epoll_ref ref = { .type = EPOLL_TYPE_REPAIR };
+ struct epoll_event ev = { 0 };
+ struct ucred ucred;
+ socklen_t len;
+
+ if (events != EPOLLIN) {
+ debug("Spurious event 0x%04x on TCP_REPAIR helper socket",
+ events);
+ return;
+ }
+
+ len = sizeof(ucred);
+
+ /* Another client is already connected: accept and close right away. */
+ if (c->fd_repair != -1) {
+ int discard = accept4(c->fd_repair_listen, NULL, NULL,
+ SOCK_NONBLOCK);
+
+ if (discard == -1)
+ return;
+
+ if (!getsockopt(discard, SOL_SOCKET, SO_PEERCRED, &ucred, &len))
+ info("Discarding TCP_REPAIR helper, PID %i", ucred.pid);
+
+ close(discard);
+ return;
+ }
+
+ if ((c->fd_repair = accept4(c->fd_repair_listen, NULL, NULL, 0)) < 0) {
+ debug_perror("accept4() on TCP_REPAIR helper listening socket");
+ return;
+ }
+
+ if (!getsockopt(c->fd_repair, SOL_SOCKET, SO_PEERCRED, &ucred, &len))
+ info("Accepted TCP_REPAIR helper, PID %i", ucred.pid);
+
+ ref.fd = c->fd_repair;
+ ev.events = EPOLLHUP | EPOLLET;
+ ev.data.u64 = ref.u64;
+ if (epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_repair, &ev)) {
+ debug_perror("epoll_ctl() on TCP_REPAIR helper socket");
+ close(c->fd_repair);
+ c->fd_repair = -1;
+ }
+}
+
+/**
+ * repair_close() - Close connection to TCP_REPAIR helper
+ * @c: Execution context
+ */
+void repair_close(struct ctx *c)
+{
+ debug("Closing TCP_REPAIR helper socket");
+
+ epoll_ctl(c->epollfd, EPOLL_CTL_DEL, c->fd_repair, NULL);
+ close(c->fd_repair);
+ c->fd_repair = -1;
+}
+
+/**
+ * repair_handler() - Handle EPOLLHUP and EPOLLERR on TCP_REPAIR helper socket
+ * @c: Execution context
+ * @events: epoll events
+ */
+void repair_handler(struct ctx *c, uint32_t events)
+{
+ (void)events;
+
+ repair_close(c);
+}
+
+/**
+ * repair_flush() - Flush current set of sockets to helper, with current command
+ * @c: Execution context
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+int repair_flush(struct ctx *c)
+{
+ struct iovec iov = { &((int8_t){ repair_cmd }), sizeof(int8_t) };
This will only be correct for little-endian machines. Better to
correctly type the repair_cmd variable.
+ char buf[CMSG_SPACE(sizeof(int) * SCM_MAX_FD)]
+ __attribute__ ((aligned(__alignof__(struct cmsghdr))));
+ struct cmsghdr *cmsg;
+ struct msghdr msg;
+
+ if (!repair_nfds)
+ return 0;
+
+ msg = (struct msghdr){ NULL, 0, &iov, 1,
+ buf, CMSG_SPACE(sizeof(int) * repair_nfds), 0 };
+ cmsg = CMSG_FIRSTHDR(&msg);
+
+ cmsg->cmsg_level = SOL_SOCKET;
+ cmsg->cmsg_type = SCM_RIGHTS;
+ cmsg->cmsg_len = CMSG_LEN(sizeof(int) * repair_nfds);
+ memcpy(CMSG_DATA(cmsg), repair_fds, sizeof(int) * repair_nfds);
+
+ repair_nfds = 0;
+
+ if (sendmsg(c->fd_repair, &msg, 0) < 0) {
+ int ret = -errno;
+ err_perror("Failed to send sockets to TCP_REPAIR helper");
+ repair_close(c);
+ return ret;
+ }
+
+ if (recv(c->fd_repair, &((int8_t){ 0 }), 1, 0) < 0) {
I guess it works, but passing the address of an implicitly constructed
object (a compound literal) to recv() makes me nervous. Besides, we
could check for errors a bit better here; I'll try to send another fixup.
+ int ret = -errno;
+ err_perror("Failed to receive reply from TCP_REPAIR helper");
+ repair_close(c);
+ return ret;
+ }
+
+ return 0;
+}
+
+/**
+ * repair_set() - Add socket to TCP_REPAIR set with given command
+ * @c: Execution context
+ * @s: Socket to add
+ * @cmd: TCP_REPAIR_ON, TCP_REPAIR_OFF, or TCP_REPAIR_OFF_NO_WP
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+/* cppcheck-suppress unusedFunction */
+int repair_set(struct ctx *c, int s, int cmd)
+{
+ int rc;
+
+ if (repair_nfds && repair_cmd != cmd) {
+ if ((rc = repair_flush(c)))
+ return rc;
+ }
+
+ repair_cmd = cmd;
+ repair_fds[repair_nfds++] = s;
+
+ if (repair_nfds >= SCM_MAX_FD) {
+ if ((rc = repair_flush(c)))
+ return rc;
+ }
+
+ return 0;
+}
diff --git a/repair.h b/repair.h
new file mode 100644
index 0000000..de279d6
--- /dev/null
+++ b/repair.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later
+ * Copyright (c) 2025 Red Hat GmbH
+ * Author: Stefano Brivio <sbrivio(a)redhat.com>
+ */
+
+#ifndef REPAIR_H
+#define REPAIR_H
+
+void repair_sock_init(const struct ctx *c);
+void repair_listen_handler(struct ctx *c, uint32_t events);
+void repair_handler(struct ctx *c, uint32_t events);
+void repair_close(struct ctx *c);
+int repair_flush(struct ctx *c);
+int repair_set(struct ctx *c, int s, int cmd);
+
+#endif /* REPAIR_H */
diff --git a/tap.c b/tap.c
index 8c92d23..d0673e5 100644
--- a/tap.c
+++ b/tap.c
@@ -56,6 +56,7 @@
#include "netlink.h"
#include "pasta.h"
#include "packet.h"
+#include "repair.h"
#include "tap.h"
#include "log.h"
#include "vhost_user.h"
@@ -1151,68 +1152,6 @@ void tap_handler_pasta(struct ctx *c, uint32_t events,
tap_pasta_input(c, now);
}
-/**
- * tap_sock_unix_open() - Create and bind AF_UNIX socket
- * @sock_path: Socket path. If empty, set on return (UNIX_SOCK_PATH as prefix)
- *
- * Return: socket descriptor on success, won't return on failure
- */
-int tap_sock_unix_open(char *sock_path)
-{
- int fd = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0);
- struct sockaddr_un addr = {
- .sun_family = AF_UNIX,
- };
- int i;
-
- if (fd < 0)
- die_perror("Failed to open UNIX domain socket");
-
- for (i = 1; i < UNIX_SOCK_MAX; i++) {
- char *path = addr.sun_path;
- int ex, ret;
-
- if (*sock_path)
- memcpy(path, sock_path, UNIX_PATH_MAX);
- else if (snprintf_check(path, UNIX_PATH_MAX - 1,
- UNIX_SOCK_PATH, i))
- die_perror("Can't build UNIX domain socket path");
-
- ex = socket(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK | SOCK_CLOEXEC,
- 0);
- if (ex < 0)
- die_perror("Failed to check for UNIX domain conflicts");
-
- ret = connect(ex, (const struct sockaddr *)&addr, sizeof(addr));
- if (!ret || (errno != ENOENT && errno != ECONNREFUSED &&
- errno != EACCES)) {
- if (*sock_path)
- die("Socket path %s already in use", path);
-
- close(ex);
- continue;
- }
- close(ex);
-
- unlink(path);
- ret = bind(fd, (const struct sockaddr *)&addr, sizeof(addr));
- if (*sock_path && ret)
- die_perror("Failed to bind UNIX domain socket");
-
- if (!ret)
- break;
- }
-
- if (i == UNIX_SOCK_MAX)
- die_perror("Failed to bind UNIX domain socket");
-
- info("UNIX domain socket bound at %s", addr.sun_path);
- if (!*sock_path)
- memcpy(sock_path, addr.sun_path, UNIX_PATH_MAX);
-
- return fd;
-}
-
/**
* tap_backend_show_hints() - Give help information to start QEMU
* @c: Execution context
@@ -1423,6 +1362,8 @@ void tap_backend_init(struct ctx *c)
tap_sock_tun_init(c);
break;
case MODE_VU:
+ repair_sock_init(c);
+ /* fall through */
case MODE_PASST:
tap_sock_unix_init(c);
diff --git a/util.c b/util.c
index 4d51e04..c3c5480 100644
--- a/util.c
+++ b/util.c
@@ -178,6 +178,68 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type,
return fd;
}
+/**
+ * sock_unix() - Create and bind AF_UNIX socket
+ * @sock_path: Socket path. If empty, set on return (UNIX_SOCK_PATH as prefix)
+ *
+ * Return: socket descriptor on success, won't return on failure
+ */
+int sock_unix(char *sock_path)
+{
+ int fd = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0);
+ struct sockaddr_un addr = {
+ .sun_family = AF_UNIX,
+ };
+ int i;
+
+ if (fd < 0)
+ die_perror("Failed to open UNIX domain socket");
+
+ for (i = 1; i < UNIX_SOCK_MAX; i++) {
+ char *path = addr.sun_path;
+ int ex, ret;
+
+ if (*sock_path)
+ memcpy(path, sock_path, UNIX_PATH_MAX);
+ else if (snprintf_check(path, UNIX_PATH_MAX - 1,
+ UNIX_SOCK_PATH, i))
+ die_perror("Can't build UNIX domain socket path");
+
+ ex = socket(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK | SOCK_CLOEXEC,
+ 0);
+ if (ex < 0)
+ die_perror("Failed to check for UNIX domain conflicts");
+
+ ret = connect(ex, (const struct sockaddr *)&addr, sizeof(addr));
+ if (!ret || (errno != ENOENT && errno != ECONNREFUSED &&
+ errno != EACCES)) {
+ if (*sock_path)
+ die("Socket path %s already in use", path);
+
+ close(ex);
+ continue;
+ }
+ close(ex);
+
+ unlink(path);
+ ret = bind(fd, (const struct sockaddr *)&addr, sizeof(addr));
+ if (*sock_path && ret)
+ die_perror("Failed to bind UNIX domain socket");
+
+ if (!ret)
+ break;
+ }
+
+ if (i == UNIX_SOCK_MAX)
+ die_perror("Failed to bind UNIX domain socket");
+
+ info("UNIX domain socket bound at %s", addr.sun_path);
+ if (!*sock_path)
+ memcpy(sock_path, addr.sun_path, UNIX_PATH_MAX);
+
+ return fd;
+}
+
/**
* sock_probe_mem() - Check if setting high SO_SNDBUF and SO_RCVBUF is allowed
* @c: Execution context
diff --git a/util.h b/util.h
index 255eb26..3dacb4d 100644
--- a/util.h
+++ b/util.h
@@ -214,6 +214,7 @@ struct ctx;
int sock_l4_sa(const struct ctx *c, enum epoll_type type,
const void *sa, socklen_t sl,
const char *ifname, bool v6only, uint32_t data);
+int sock_unix(char *sock_path);
void sock_probe_mem(struct ctx *c);
long timespec_diff_ms(const struct timespec *a, const struct timespec *b);
int64_t timespec_diff_us(const struct timespec *a, const struct timespec *b);