When a user spawns a command with pasta they expect the network to be
ready. Currently this does not work because pasta will fork/exec
before it will setup the network config.
This patch fixes it by using a pipe to sync parent and child. The child
will now block reading from this pipe before the exec call. The parent
will then unblock the child only after the netns was configured.
A command like `pasta --config-net -- ping -c1 1.1.1.1` can now
actually work as expected.
Signed-off-by: Paul Holzinger <pholzing(a)redhat.com>
---
passt.c | 9 ++++++++-
passt.h | 3 +++
pasta.c | 31 ++++++++++++++++++++++++++++---
3 files changed, 39 insertions(+), 4 deletions(-)
diff --git a/passt.c b/passt.c
index 8b2c50d..4ef5797 100644
--- a/passt.c
+++ b/passt.c
@@ -187,7 +187,8 @@ int main(int argc, char **argv)
isolate_initial();
- c.pasta_netns_fd = c.fd_tap = c.fd_tap_listen = -1;
+ c.pasta_netns_fd = c.pasta_command_ready_fd =
+ c.fd_tap = c.fd_tap_listen = -1;
sigemptyset(&sa.sa_mask);
sa.sa_flags = 0;
@@ -296,6 +297,12 @@ int main(int argc, char **argv)
exit(EXIT_FAILURE);
}
+ /* start pasta child process now after the netns is setup */
+ if (c.pasta_command_ready_fd > -1) {
+ /* close causes EOF for the read in the child so no need to write() */
+ close(c.pasta_command_ready_fd);
+ }
+
if (!c.foreground)
__daemon(pidfile_fd, devnull_fd);
else
diff --git a/passt.h b/passt.h
index 3d7e567..a78cd81 100644
--- a/passt.h
+++ b/passt.h
@@ -154,6 +154,8 @@ struct ip6_ctx {
* @pcap: Path for packet capture file
* @pid_file: Path to PID file, empty string if not configured
* @pasta_netns_fd: File descriptor for network namespace in pasta mode
+ * @pasta_command_ready_fd: File descriptor for the ready pipe to
+ * start child cmd, -1 if not used
* @no_netns_quit: In pasta mode, don't exit if fs-bound namespace is gone
* @netns_base: Base name for fs-bound namespace, if any, in pasta mode
* @netns_dir: Directory of fs-bound namespace, if any, in pasta mode
@@ -205,6 +207,7 @@ struct ctx {
int one_off;
int pasta_netns_fd;
+ int pasta_command_ready_fd;
int no_netns_quit;
char netns_base[PATH_MAX];
diff --git a/pasta.c b/pasta.c
index 528f02a..56ac326 100644
--- a/pasta.c
+++ b/pasta.c
@@ -149,16 +149,19 @@ void pasta_open_ns(struct ctx *c, const char *netns)
/**
* struct pasta_spawn_cmd_arg - Argument for pasta_spawn_cmd()
- * @exe: Executable to run
- * @argv: Command and arguments to run
+ * @exe: Executable to run
+ * @argv: Command and arguments to run
+ * @ready_pipe: Ready pipe pair from parent.
*/
struct pasta_spawn_cmd_arg {
const char *exe;
char *const *argv;
+ int ready_pipe[2];
};
/**
- * pasta_spawn_cmd() - Prepare new netns, start command or shell
+ * pasta_spawn_cmd() - Prepare new netns, spawn child, wait for parent,
+ * then exec command or shell
* @arg: See @pasta_spawn_cmd_arg
*
* Return: this function never returns
@@ -166,11 +169,24 @@ struct pasta_spawn_cmd_arg {
static int pasta_spawn_cmd(void *arg)
{
const struct pasta_spawn_cmd_arg *a;
+ char buf[1];
if (write_file("/proc/sys/net/ipv4/ping_group_range", "0 0"))
warn("Cannot set ping_group_range, ICMP requests might fail");
a = (const struct pasta_spawn_cmd_arg *)arg;
+
+ /* close write side, we want read to return EOF when parent closes the fd */
+ close(a->ready_pipe[1]);
+
+ /* wait here for parent setup to finish before we exec */
+ if (TEMP_FAILURE_RETRY(read(a->ready_pipe[0], buf, sizeof(buf))) < 0) {
+ err("Failed to read ready pipe from parent: %s",
+ strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+ close(a->ready_pipe[0]);
+
execvp(a->exe, a->argv);
perror("execvp");
@@ -226,6 +242,13 @@ void pasta_start_ns(struct ctx *c, uid_t uid, gid_t gid,
arg.argv = sh_argv;
}
+ if (pipe(arg.ready_pipe) < 0) {
+ err("Create ready pipe: %s", strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+
+ c->pasta_command_ready_fd = arg.ready_pipe[1];
+
pasta_child_pid = do_clone(pasta_spawn_cmd, ns_fn_stack,
sizeof(ns_fn_stack),
CLONE_NEWIPC | CLONE_NEWPID | CLONE_NEWNET |
@@ -237,6 +260,8 @@ void pasta_start_ns(struct ctx *c, uid_t uid, gid_t gid,
exit(EXIT_FAILURE);
}
+ close(arg.ready_pipe[0]);
+
NS_CALL(pasta_wait_for_ns, c);
}
--
2.39.1
Hi folks,
libslirp and Passt have different approaches to sharing DNS resolvers with
the guest system, each with their own benefits & drawbacks. On the libslirp
project, we're discussing [1] how to support DNS failover. Passt already has
support for this, but there is a drawback to its solution which prevents us
from taking a similar approach: the resolvers are read exactly once, so if the
host changes networks at runtime, the guest will not receive the updated
resolvers and thus its connectivity will break.
libslirp's current approach is to DNAT a single address exposed to the guest
to one of the resolvers configured on the host. The problem here is that if that
one resolver goes down, the guest can't resolve DNS names. We're
considering changing so that instead of a single address, we expose a set of
MAXNS addresses, and DNAT those 1:1 to the DNS resolvers registered with
the host. Because the DNAT table lives on the host side, we can refresh the
guest's resolvers whenever the host's resolvers change, but without the need to
expire a DHCP lease (even with short leases, the guest will still lose
connectivity
for a time).
Does this sound like an approach Passt would be open to adopting as well?
Thanks.
- Noah
[1] https://gitlab.freedesktop.org/slirp/libslirp/-/issues/26
Hi all,
while debugging some things I used `./pasta --config-net -- nslookup
google.com 1.1.1.1` to test dns.
The problem is that does not work because the nslookup process will be
executed before pasta is
ready with the netns setup, i.e. compare `./pasta --config-net -- ip a`.
So a workaround is to spawn a shell and sleep: `sh -c "sleep 1; nslookup
google.com 1.1.1.1"`
However this is ugly and does not ensure that the netns is ready after
one second. As a user
I would expect pasta to wait until the setup is finished before it calls
exec().
I can send a patch if you agree and I find some time.
---
Paul
This is a handful of simple cleanups which I made while investigating
https://bugs.passt.top/show_bug.cgi?id=41. Note that these don't
themselves actually address that bug, they're just unrelated cleanups
that happen to be in adjacent code.
These are barely tested at all. I've had some crises right before
going away that mean I haven't had a chance to plish these. I'm
posting them so they're out there rather than waiting until I'm back
in two weeks.
These are based on my tap send unification series and the TCP socket
pool cleanup series.
David Gibson (4):
tap: Don't pcap frames that didn't get sent
tap: Eliminate goto from tap_handler()
tcp: Remove 'recvmsg' goto from tcp_data_from_sock
tcp: Remove 'zero_len' goto from tcp_data_from_sock
tap.c | 49 +++++++++++++++++++++++++++----------------------
tcp.c | 37 +++++++++++++++++--------------------
2 files changed, 44 insertions(+), 42 deletions(-)
--
2.39.1
Although we have an abstraction for the "slow path" (DHCP, NDP) guest
bound packets, the TCP and UDP forwarding paths write directly to the
tap fd. However, it turns out how they send frames to the tap device
is more similar than it originally appears.
This series unifies the low-level tap send functions for TCP and UDP,
and makes some clean ups along the way.
This is based on my earlier outstanding series.
Changes since v2:
* Rebase on the latest version of the UDP tap/splice socket
unification
* Rework pcap_frame() to return an error rather than printing itself,
restoring less verbose behaviour for one error in a group of
frames.
* Assorted typo fixes in comments and commit messages
Changes since v1:
* Abstract tap header construction as well as frame send (a number of
new patches)
* Remove unneeded flags buf_bytes globals as well
* Fix bug where we weren't correctly setting iov_len after the move
to giving variable sized iovecs to send_frames().
David Gibson (18):
pcap: Introduce pcap_frame() helper
pcap: Replace pcapm() with pcap_multiple()
tcp: Combine two parts of passt tap send path together
tcp: Don't compute total bytes in a message until we need it
tcp: Improve interface to tcp_l2_buf_flush()
tcp: Combine two parts of pasta tap send path together
tap, tcp: Move tap send path to tap.c
util: Introduce hton*_constant() in place of #ifdefs
tcp, udp: Use named field initializers in iov_init functions
util: Parameterize ethernet header initializer macro
tcp: Remove redundant and incorrect initialization from *_iov_init()
tcp: Consolidate calculation of total frame size
tap: Add "tap headers" abstraction
tcp: Use abstracted tap header
tap: Use different io vector bases depending on tap type
udp: Use abstracted tap header
tap: Improve handling of partial frame sends
udp: Use tap_send_frames()
dhcpv6.c | 50 +++--------
pcap.c | 86 ++++++-------------
pcap.h | 3 +-
tap.c | 121 ++++++++++++++++++++++++++
tap.h | 54 ++++++++++++
tcp.c | 254 +++++++++++++------------------------------------------
udp.c | 213 ++++++----------------------------------------
udp.h | 2 +-
util.h | 47 ++--------
9 files changed, 310 insertions(+), 520 deletions(-)
--
2.39.0
This is a bit of a diversion from what I'm notionally working on at
the moment. While thinking about what we'd need to use the
IP_TRANSPARENT socket option to broaden the cases where we can
"splice", I noticed some inelegancies in how we handle the pool of
pre-opened sockets in the TCP code, and well, here we are.
This makes a number of cleanups to the handling of these pools. Most
notably, tcp_splice_connect() now has simpler semantics: it now always
runs in the init namespace, and is always given a pre-opened socket
(which could be in the guest ns).
David Gibson (5):
tcp: Make a helper to refill each socket pool
tcp: Split init and ns cases for tcp_sock_refill()
tcp: Move socket pool declarations around
tcp: Split pool lookup from creating new sockets in
tcp_conn_new_sock()
tcp: Improve handling of fallback if socket pool is empty on new
splice
tcp.c | 138 ++++++++++++++++++-------------------------------
tcp.h | 2 -
tcp_conn.h | 12 ++++-
tcp_splice.c | 141 ++++++++++++++++++++++++++-------------------------
4 files changed, 132 insertions(+), 161 deletions(-)
--
2.39.0
At present, the UDP "splice" and "tap" paths are quite separate. We
have separate sockets to receive packets bound for the tap and splice
paths. This leads to some code duplication, and extra open sockets.
This series partially unifies the two paths, allowing us to use a
single (host side) socket, bound to 0.0.0.0 or :: to receive packets
for both cases.
Changes since v3:
* Fixed really dumb compile error, and actually ran through the tests.
Oops.
Changes since v2:
* Don't receive multiple packets at once for pasta mode - seems to
hurt throughput on balance.
* Add some comments clarifying reasoning here.
Changes since v1:
* Renamed udp_localname[46] to udp[46]_localname
* Allow handling of UDP port 0
* Fix a bug which could misidentify certain v6 packets as v4-spliceable
* Some minor cosmetic fixes to code and commit messages
David Gibson (8):
udp: Move sending pasta tap frames to the end of udp_sock_handler()
udp: Split sending to passt tap interface into separate function
udp: Split receive from preparation and send in udp_sock_handler()
udp: Don't handle tap receive batch size calculation within a #define
udp: Pre-populate msg_names with local address
udp: Unify udp_sock_handler_splice() with udp_sock_handler()
udp: Decide whether to "splice" per datagram rather than per socket
udp: Don't use separate sockets to listen for spliced packets
udp.c | 388 ++++++++++++++++++++++++++++++---------------------------
udp.h | 2 +-
util.h | 7 ++
3 files changed, 213 insertions(+), 184 deletions(-)
--
2.39.0
passt (https://passt.top) provides a method for connecting a guest to
the larger network without requiring any elevated privileges. This set
of patches allows libvirt/QEMU users to easily configure a QEMU domain to
use passt for the backend of any emulated network interface.
More details are in the individual patches, but the short explanation is that
you will use:
<interface type='user'>
<backend type='passt'>
...
to select the passt backend. (We decided to do it this way since the
concept is so similar to slirp, which was the original "type='user'")
The following caveats currently apply:
1) passt support requires "-netdev stream" in QEMU, which is only
available starting with qemu-7.2.0. So if you want to test these
patches out, you need the latest upstream release of QEMU.
2) SELinux must be set to "permissive". This is of course
temporary. As I understand it, the remedy to this is a new SELinux
profile for the passt binary, which is outside the control of
libvirt and so not something that can be addressed in this patchset
(or any other patch to libvirt).
3) Although there is a a new option for QEMU's -netdev that will tell
QEMU to attempt to reconnect to a new incarnation of the same
socket if passt unexpectedly exits, and a new QEMU event that will
be put into QEMU to inform libvirt that the passt process has
exited (so that it can start a new, identical passt process), I
think this hasn't been pushed upstream yet (??), and I haven't
implemented any support for it here. So, if the passt process
unexpectedly exits, the guest will be without networking. However,
Stefano (passt author) is emphatic that passt will never
unexpectedly exit :-)
passt has *many* other options that libvirt could support, but the
small subset here are the things that seem most useful (and/or were
specifically requested by prospective users of passt). It is always
easier to add more stuff in the future than to remove "mistakes", so I
tried to no go overboard.
Laine Stump (9):
conf: rename virDomainNetBackend* to virDomainNetDriver*
conf: move anonymous backend struct from virDomainNetDef into its own
struct
conf: put interface <backend> parsing/formatting separate functions
conf: add passt XML additions to schema
conf: parse/format passt-related XML additions
qemu: new capability QEMU_CAPS_NETDEV_STREAM
qemu: add passtStateDir to qemu driver config
qemu: hook up passt config to qemu domains
specfile: require passt for the build if fedora >= 36 or rhel >= 9
docs/formatdomain.rst | 95 +++++-
libvirt.spec.in | 7 +
meson.build | 1 +
po/POTFILES | 1 +
src/conf/domain_conf.c | 303 ++++++++++++++++--
src/conf/domain_conf.h | 64 +++-
src/conf/domain_validate.c | 32 +-
src/conf/schemas/domaincommon.rng | 65 ++++
src/conf/virconftypes.h | 6 +
src/libvirt_private.syms | 1 +
src/qemu/meson.build | 2 +
src/qemu/qemu_capabilities.c | 4 +
src/qemu/qemu_capabilities.h | 3 +
src/qemu/qemu_command.c | 11 +-
src/qemu/qemu_command.h | 3 +-
src/qemu/qemu_conf.c | 2 +
src/qemu/qemu_conf.h | 1 +
src/qemu/qemu_domain.c | 5 +-
src/qemu/qemu_domain.h | 3 +-
src/qemu/qemu_driver.c | 12 +
src/qemu/qemu_extdevice.c | 25 +-
src/qemu/qemu_hotplug.c | 26 +-
src/qemu/qemu_interface.c | 8 +-
src/qemu/qemu_passt.c | 284 ++++++++++++++++
src/qemu/qemu_passt.h | 38 +++
src/qemu/qemu_process.c | 1 +
src/qemu/qemu_validate.c | 9 +-
src/security/virt-aa-helper.c | 2 +-
.../caps_7.2.0.x86_64.xml | 1 +
tests/qemuxml2argvdata/net-user-passt.args | 34 ++
.../net-user-passt.x86_64-latest.args | 37 +++
tests/qemuxml2argvdata/net-user-passt.xml | 57 ++++
tests/qemuxml2argvtest.c | 2 +
tests/qemuxml2xmloutdata/net-user-passt.xml | 1 +
tests/qemuxml2xmltest.c | 1 +
35 files changed, 1087 insertions(+), 60 deletions(-)
create mode 100644 src/qemu/qemu_passt.c
create mode 100644 src/qemu/qemu_passt.h
create mode 100644 tests/qemuxml2argvdata/net-user-passt.args
create mode 100644 tests/qemuxml2argvdata/net-user-passt.x86_64-latest.args
create mode 100644 tests/qemuxml2argvdata/net-user-passt.xml
create mode 120000 tests/qemuxml2xmloutdata/net-user-passt.xml
--
2.38.1
The proper syntax is:
${start}-${end}:${dststart}-${dstend}
not
${start}-${end}-${dststart}:${dstend}
Signed-off-by: Laine Stump <laine(a)redhat.com>
---
passt.1 | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/passt.1 b/passt.1
index 528763b..0dad878 100644
--- a/passt.1
+++ b/passt.1
@@ -349,7 +349,7 @@ Forward local ports 22 and 25 to ports 22 and 25 on the guest
-t 22-80
Forward local ports 22 to 80 to corresponding ports on the guest
.TP
--t 22-80-32:90
+-t 22-80:32-90
Forward local ports 22 to 80 to corresponding ports on the guest plus 10
.TP
-t 192.0.2.1/22
@@ -361,7 +361,7 @@ Forward local port 22, bound to 192.0.2.1 and interface eth0, to port 22
-t 2000-5000,~3000-3010
Forward local ports 2000 to 5000, but not 3000 to 3010
.TP
--t 192.0.2.1/20-30,~25
+-t 192.0.2.1/20-26:24-30,~25-29
Forward local ports 20 to 24, and 26 to 30, bound to 192.0.2.1
.TP
-t ~20000-20010
@@ -427,7 +427,7 @@ Forward local ports 22 and 25 to ports 22 and 25 in the target namespace
-t 22-80
Forward local ports 22 to 80 to corresponding ports in the target namespace
.TP
--t 22-80-32:90
+-t 22-80:32-90
Forward local ports 22 to 80 to corresponding ports plus 10 in the target
namespace
.TP
@@ -440,7 +440,7 @@ Forward local port 22, bound to 192.0.2.1 and interface eth0, to port 22
-t 2000-5000,~3000-3010
Forward local ports 2000 to 5000, but not 3000 to 3010
.TP
--t 192.0.2.1/20-30,~25
+-t 192.0.2.1/20-26:24-30,~25-29
Forward local ports 20 to 24, and 26 to 30, bound to 192.0.2.1
.TP
-t ~20000-20010
--
2.38.1