The kernel may support recvmsg(MSG_PEEK), starting from a given offset. This makes it possible to avoid repeated reading of already read initial bytes of a received message, hence saving us read cycles when forwarding TCP messages in the host->name space direction. When this feature is supported, iov_sock[0].iov_base can be set to NULL. The kernel code will interpret this as an instruction to skip reading of the first iov_sock[0].iov_len bytes of the message. Since iov_sock[0].iov_base is set to point to tcp_buf_discard, we do this by making this into a pointer, setting it to NULL if we find that the feature is supported by the kernel, an letting it point to a static buffer if not. There is no macro or function indicating kernel support for this feature. We therefore have to probe for it by creating a temporary tcp connection and read from it as if the feature is present. If that fails, we fall back to the original design. The traffic connection cannot be used for this purpose, because it will be broken if the probe reading fails. Signed-off-by: Jon Maloy <jmaloy(a)redhat.com> --- v2: Changes as suggested by Stefano Brivio: - Moved probe process/function into a separate, temporary name space, to avoid occupying port numbers in the current name space. - Put discard buffer back to static memory. Signed-off-by: Jon Maloy <jmaloy(a)redhat.com> --- netlink.c | 2 +- netlink.h | 1 + tcp.c | 103 ++++++++++++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 103 insertions(+), 3 deletions(-) diff --git a/netlink.c b/netlink.c index 379d46e..d5f10de 100644 --- a/netlink.c +++ b/netlink.c @@ -55,7 +55,7 @@ static int nl_seq = 1; * * Return: 0 */ -static int nl_sock_init_do(void *arg) +int nl_sock_init_do(void *arg) { struct sockaddr_nl addr = { .nl_family = AF_NETLINK, }; int *s = arg ? &nl_sock_ns : &nl_sock; diff --git a/netlink.h b/netlink.h index 3a1f0de..cadd3b7 100644 --- a/netlink.h +++ b/netlink.h @@ -24,5 +24,6 @@ int nl_addr_dup(int s_src, unsigned int ifi_src, int nl_link_get_mac(int s, unsigned int ifi, void *mac); int nl_link_set_mac(int s, unsigned int ifi, const void *mac); int nl_link_up(int s, unsigned int ifi, int mtu); +int nl_sock_init_do(void *arg); #endif /* NETLINK_H */ diff --git a/tcp.c b/tcp.c index f506cfd..4410460 100644 --- a/tcp.c +++ b/tcp.c @@ -283,6 +283,7 @@ #include <sys/timerfd.h> #include <sys/types.h> #include <sys/uio.h> +#include <sys/wait.h> #include <time.h> #include <linux/tcp.h> /* For struct tcp_info */ @@ -297,6 +298,7 @@ #include "log.h" #include "inany.h" #include "flow.h" +#include "netlink.h" #include "tcp_conn.h" #include "flow_table.h" @@ -402,6 +404,8 @@ struct tcp6_l2_head { /* For MSS6 macro: keep in sync with tcp6_l2_buf_t */ (conn->events & (SOCK_FIN_RCVD | TAP_FIN_RCVD))) #define CONN_HAS(conn, set) ((conn->events & (set)) == (set)) +static bool tcp_probe_msg_peek_offset_cap(); + static const char *tcp_event_str[] __attribute((__unused__)) = { "SOCK_ACCEPTED", "TAP_SYN_RCVD", "ESTABLISHED", "TAP_SYN_ACK_SENT", @@ -506,7 +510,8 @@ static struct tcp_buf_seq_update tcp6_l2_buf_seq_update[TCP_FRAMES_MEM]; static unsigned int tcp6_l2_buf_used; /* recvmsg()/sendmsg() data for tap */ -static char tcp_buf_discard [MAX_WINDOW]; +static char tcp_discard_buf[MAX_WINDOW]; +static char* tcp_buf_discard = tcp_discard_buf; static struct iovec iov_sock [TCP_FRAMES_MEM + 1]; static struct iovec tcp4_l2_iov [TCP_FRAMES_MEM]; @@ -573,6 +578,15 @@ static unsigned int tcp6_l2_flags_buf_used; #define CONN(idx) (&(FLOW(idx)->tcp)) + +/** msg_peek_offset_cap() - Does the kernel support recvmsg(MSG_PEEK) with offset? + */ +static inline bool msg_peek_offset_cap() +{ + return !tcp_buf_discard; +} + + /** conn_at_idx() - Find a connection by index, if present * @idx: Index of connection to lookup * @@ -2224,7 +2238,9 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) return 0; } - sendlen = len - already_sent; + sendlen = len; + if (!msg_peek_offset_cap()) + sendlen -= already_sent; if (sendlen <= 0) { conn_flag(c, conn, STALLED); return 0; @@ -3107,6 +3123,9 @@ int tcp_init(struct ctx *c) NS_CALL(tcp_ns_socks_init, c); } + /* Ignore discard buffer if not needed */ + if (tcp_probe_msg_peek_offset_cap()) + tcp_buf_discard = NULL; return 0; } @@ -3213,3 +3232,83 @@ void tcp_timer(struct ctx *c, const struct timespec *ts) if (c->mode == MODE_PASTA) tcp_splice_refill(c); } + +static int tcp_probe_sockets(void *arg) +{ + char b; + struct iovec iov[2] = { { NULL, 1 }, { &b, 1 }, }; + struct msghdr msg = { NULL, 0, iov, 2, NULL, 0, 0}; + struct sockaddr a = { AF_INET, {0, }}; + int err = EXIT_FAILURE; + int s[2] = {0, }; + int s_recv = 0; + int *rc = arg; + ssize_t len; + + /* Make sure loopback interface is enabled */ + nl_sock_init_do(NULL); + nl_link_up(nl_sock, 1 /* lo */, 0); + + s[0] = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); + s[1] = socket(AF_INET, SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP); + if (s[0] < 0 || s[1] < 0) { + perror("Temporary probe socket creation failed\n"); + goto out; + } + if (0 > bind(s[0], &a, sizeof(a))) { + perror("Temporary probe socket bind() failed\n"); + goto out; + } + if (0 > getsockname(s[0], &a, &((socklen_t) { sizeof(a) }))) { + perror("Temporary probe socket getsockname() failed\n"); + goto out; + } + if (0 > listen(s[0], 0)) { + perror("Temporary probe socket listen() failed\n"); + goto out; + } + if (0 <= connect(s[1], &a, sizeof(a)) || errno != EINPROGRESS) { + perror("Temporary probe socket connect() failed\n"); + goto out; + } + s_recv = accept(s[0], NULL, NULL); + if (s_recv <= 0) { + perror("Temporary probe socket accept() failed\n"); + goto out; + } + if (0 >= send(s[1], (char *)("ab"), 2, 0) || errno != EINPROGRESS) { + perror("Temporary probe socket send() failed\n"); + goto out; + } + len = recvmsg(s_recv, &msg, MSG_PEEK); + if (len <= 0 && errno != EFAULT) { + perror("Temporary probe socket recvmsg() failed\n"); + goto out; + } + printf("MSG_PEEK with offset %ssupported\n", len == 1 ? "" : "not "); + *rc = len == 1; + err = EXIT_SUCCESS; +out: + close(s_recv); + close(s[1]); + close(s[0]); + return err; +} + +/** tcp_probe_msg_peek_offset_cap() - Probe kernel for MSG_PEEK with offset support + */ +static bool tcp_probe_msg_peek_offset_cap() +{ + char ns_fn_stack[NS_FN_STACK_SIZE]; + int child_pid, child_ret, rc = 0; + + child_pid = do_clone(tcp_probe_sockets, ns_fn_stack, sizeof(ns_fn_stack), + CLONE_NEWNET | CLONE_NEWUSER | CLONE_VM | CLONE_VFORK | CLONE_FILES | SIGCHLD, + (void *)&rc); + if (child_pid <= 0) { + perror("Failed to clone tcp probe process\n"); + exit(EXIT_FAILURE); + } + waitpid(child_pid, &child_ret, 0); + return rc; +} -- 2.39.2
On Sun, Jan 14, 2024 at 01:07:55PM -0500, Jon Maloy wrote:The kernel may support recvmsg(MSG_PEEK), starting from a given offset. This makes it possible to avoid repeated reading of already read initial bytes of a received message, hence saving us read cycles when forwarding TCP messages in the host->name space direction. When this feature is supported, iov_sock[0].iov_base can be set to NULL. The kernel code will interpret this as an instruction to skip reading of the first iov_sock[0].iov_len bytes of the message. Since iov_sock[0].iov_base is set to point to tcp_buf_discard, we do this by making this into a pointer, setting it to NULL if we find that the feature is supported by the kernel, an letting it point to a static buffer if not. There is no macro or function indicating kernel support for this feature. We therefore have to probe for it by creating a temporary tcp connection and read from it as if the feature is present. If that fails, we fall back to the original design. The traffic connection cannot be used for this purpose, because it will be broken if the probe reading fails. Signed-off-by: Jon Maloy <jmaloy(a)redhat.com> --- v2: Changes as suggested by Stefano Brivio: - Moved probe process/function into a separate, temporary name space, to avoid occupying port numbers in the current name space. - Put discard buffer back to static memory. Signed-off-by: Jon Maloy <jmaloy(a)redhat.com> --- netlink.c | 2 +- netlink.h | 1 + tcp.c | 103 ++++++++++++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 103 insertions(+), 3 deletions(-) diff --git a/netlink.c b/netlink.c index 379d46e..d5f10de 100644 --- a/netlink.c +++ b/netlink.c @@ -55,7 +55,7 @@ static int nl_seq = 1; * * Return: 0 */ -static int nl_sock_init_do(void *arg) +int nl_sock_init_do(void *arg) { struct sockaddr_nl addr = { .nl_family = AF_NETLINK, }; int *s = arg ? &nl_sock_ns : &nl_sock; diff --git a/netlink.h b/netlink.h index 3a1f0de..cadd3b7 100644 --- a/netlink.h +++ b/netlink.h @@ -24,5 +24,6 @@ int nl_addr_dup(int s_src, unsigned int ifi_src, int nl_link_get_mac(int s, unsigned int ifi, void *mac); int nl_link_set_mac(int s, unsigned int ifi, const void *mac); int nl_link_up(int s, unsigned int ifi, int mtu); +int nl_sock_init_do(void *arg); #endif /* NETLINK_H */ diff --git a/tcp.c b/tcp.c index f506cfd..4410460 100644 --- a/tcp.c +++ b/tcp.c @@ -283,6 +283,7 @@ #include <sys/timerfd.h> #include <sys/types.h> #include <sys/uio.h> +#include <sys/wait.h> #include <time.h> #include <linux/tcp.h> /* For struct tcp_info */ @@ -297,6 +298,7 @@ #include "log.h" #include "inany.h" #include "flow.h" +#include "netlink.h" #include "tcp_conn.h" #include "flow_table.h" @@ -402,6 +404,8 @@ struct tcp6_l2_head { /* For MSS6 macro: keep in sync with tcp6_l2_buf_t */ (conn->events & (SOCK_FIN_RCVD | TAP_FIN_RCVD))) #define CONN_HAS(conn, set) ((conn->events & (set)) == (set)) +static bool tcp_probe_msg_peek_offset_cap();We usually prefer to re-order functions in the file, rather than use forward declarations. That said, I'm not sure I'd put the probing logic in tcp.c, but I'm not entirely sure where I would put it.+ static const char *tcp_event_str[] __attribute((__unused__)) = { "SOCK_ACCEPTED", "TAP_SYN_RCVD", "ESTABLISHED", "TAP_SYN_ACK_SENT", @@ -506,7 +510,8 @@ static struct tcp_buf_seq_update tcp6_l2_buf_seq_update[TCP_FRAMES_MEM]; static unsigned int tcp6_l2_buf_used; /* recvmsg()/sendmsg() data for tap */ -static char tcp_buf_discard [MAX_WINDOW]; +static char tcp_discard_buf[MAX_WINDOW]; +static char* tcp_buf_discard = tcp_discard_buf;tcp_discard_buf vs. tcp_buf_discard seems very easy to confuse. Maybe tcp_discard_buf and tcp_discard_ptr? Or, since you need explicit tests on the capability boolean anyway, just open code putting either tcp_buf_discard or NULL into the iovec.static struct iovec iov_sock [TCP_FRAMES_MEM + 1]; static struct iovec tcp4_l2_iov [TCP_FRAMES_MEM]; @@ -573,6 +578,15 @@ static unsigned int tcp6_l2_flags_buf_used; #define CONN(idx) (&(FLOW(idx)->tcp)) + +/** msg_peek_offset_cap() - Does the kernel support recvmsg(MSG_PEEK) with offset?Needs a "Return:" comment.+ */ +static inline bool msg_peek_offset_cap()msg_peek_offset_cap() vs. tcp_probe_msg_peek_offset_cap() also seems very easy to confuse.+{ + return !tcp_buf_discard; +} + + /** conn_at_idx() - Find a connection by index, if present * @idx: Index of connection to lookup * @@ -2224,7 +2238,9 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) return 0; } - sendlen = len - already_sent; + sendlen = len; + if (!msg_peek_offset_cap()) + sendlen -= already_sent;Hmmm.. I think this is implying that when using this feature the recvmsg() call won't count the discarded first segment in the total received length returned. That doesn't seem quite correct to me. It would make sense if we did have an explicit offset sent, but at least the way I think of this NULL buffer feature, we're logically doing the full recvmsg(), but choosing to send some of the data to a black hole. That suggests to me it should return the same value whether or not one of the buffers is NULL. Perhaps more to the point, I think keeping the same return value is strictly more useful: we don't need it here, but I can imagine users of this feature where they want to discard the next n bytes of data, but they don't yet know if that data has already been received at the kernel level. So, even if there isn't enough data to make it into the second buffer - so it's all discarded - they want to know how many bytes it was that were discarded.if (sendlen <= 0) { conn_flag(c, conn, STALLED); return 0; @@ -3107,6 +3123,9 @@ int tcp_init(struct ctx *c) NS_CALL(tcp_ns_socks_init, c); } + /* Ignore discard buffer if not needed */ + if (tcp_probe_msg_peek_offset_cap()) + tcp_buf_discard = NULL; return 0; } @@ -3213,3 +3232,83 @@ void tcp_timer(struct ctx *c, const struct timespec *ts) if (c->mode == MODE_PASTA) tcp_splice_refill(c); } +Function header comment would be good.+static int tcp_probe_sockets(void *arg) +{ + char b;passt uses tab, not space indents.+ struct iovec iov[2] = { { NULL, 1 }, { &b, 1 }, }; + struct msghdr msg = { NULL, 0, iov, 2, NULL, 0, 0}; + struct sockaddr a = { AF_INET, {0, }};I think you want sockaddr_in here. I don't believe plain sockaddr is guaranteed long enough to hold an IP address, though it probably is in practice.+ int err = EXIT_FAILURE; + int s[2] = {0, };No need to initialise this, you overwrite both elements anyway. Also, you do entirely different things with the two elements, so it might as well be different variables (as you have with s_recv).+ int s_recv = 0;Again, no need for an initializer, you unconditionally overwrite this.+ int *rc = arg; + ssize_t len; + + /* Make sure loopback interface is enabled */ + nl_sock_init_do(NULL);Ugh.. this will overwrite the nl_sock global with the netlink socket in your temporary namespace. We might get away with that if you do this early enough, but it's pretty counter-intuitive. I think we want to refactor no_sock_init_do() as one function that just returns the new netlink socket, with another helper that does the ns entering we need for the pasta namespace. Here you could just use that base function and put the temporary socket for the temporary ns into a local.+ nl_link_up(nl_sock, 1 /* lo */, 0); + + s[0] = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); + s[1] = socket(AF_INET, SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP); + if (s[0] < 0 || s[1] < 0) { + perror("Temporary probe socket creation failed\n"); + goto out; + } + if (0 > bind(s[0], &a, sizeof(a))) {Since the socket address is unspecified, why do you need to bind at all? It might be clearer to explicitly set a to localhost + a specific port - because you're in a temporary namespace, you can rely on every port being available.+ perror("Temporary probe socket bind() failed\n"); + goto out; + } + if (0 > getsockname(s[0], &a, &((socklen_t) { sizeof(a) }))) { + perror("Temporary probe socket getsockname() failed\n"); + goto out; + } + if (0 > listen(s[0], 0)) { + perror("Temporary probe socket listen() failed\n"); + goto out; + } + if (0 <= connect(s[1], &a, sizeof(a)) || errno != EINPROGRESS) { + perror("Temporary probe socket connect() failed\n"); + goto out; + }This is assuming that a will now contain the correct address to connect to. Although it will have the right port, I think the address may still be unspecified for the listening socket.+ s_recv = accept(s[0], NULL, NULL); + if (s_recv <= 0) {Should be strictly < 0 (although it's very unlikely to occur in practice).+ perror("Temporary probe socket accept() failed\n"); + goto out; + } + if (0 >= send(s[1], (char *)("ab"), 2, 0) || errno != EINPROGRESS) {IIUC this is only checking errno in the case where send() did *not* return <= 0, and therefore doesn't contain any relevant value.+ perror("Temporary probe socket send() failed\n"); + goto out; + } + len = recvmsg(s_recv, &msg, MSG_PEEK); + if (len <= 0 && errno != EFAULT) { + perror("Temporary probe socket recvmsg() failed\n"); + goto out; + } + printf("MSG_PEEK with offset %ssupported\n", len == 1 ? "" : "not ");Better to use the info() or debug() helper here.+ *rc = len == 1;Better to explicitly check if you got an EFAULT or not here, rather than relying on the subtly different return semantics which as noted above I don't think are a good idea.+ err = EXIT_SUCCESS;+out: + close(s_recv); + close(s[1]); + close(s[0]); + return err; +} + +/** tcp_probe_msg_peek_offset_cap() - Probe kernel for MSG_PEEK with offset support + */ +static bool tcp_probe_msg_peek_offset_cap()I believe we prefer the explicit foo(void) for declarations of functions with no parameters, rather than just foo().+{ + char ns_fn_stack[NS_FN_STACK_SIZE]; + int child_pid, child_ret, rc = 0; + + child_pid = do_clone(tcp_probe_sockets, ns_fn_stack, sizeof(ns_fn_stack), + CLONE_NEWNET | CLONE_NEWUSER | CLONE_VM | CLONE_VFORK | CLONE_FILES | SIGCHLD, + (void *)&rc); + if (child_pid <= 0) { + perror("Failed to clone tcp probe process\n"); + exit(EXIT_FAILURE); + } + waitpid(child_pid, &child_ret, 0); + return rc; +}-- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson
Not a full review, but a couple of comments, mostly about stuff I also had in pkt_selfie.c (review of v1): On Thu, 18 Jan 2024 14:05:38 +1100 David Gibson <david(a)gibson.dropbear.id.au> wrote:On Sun, Jan 14, 2024 at 01:07:55PM -0500, Jon Maloy wrote:There are two advantages of bind() without port, and then getsockname(): first, ip_unprivileged_port_start might have whatever value in our new namespace (we don't touch it), and I wouldn't take for granted we'll have CAP_SYS_ADMIN in it for all the possible start-up combinations. Second, there's no need for a magic value.[...] + + s[0] = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); + s[1] = socket(AF_INET, SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP); + if (s[0] < 0 || s[1] < 0) { + perror("Temporary probe socket creation failed\n"); + goto out; + } + if (0 > bind(s[0], &a, sizeof(a))) {Since the socket address is unspecified, why do you need to bind at all? It might be clearer to explicitly set a to localhost + a specific port - because you're in a temporary namespace, you can rely on every port being available.Hmm, why? From getsockname(2): getsockname() returns the current address to which the socket sockfd is bound [...]+ perror("Temporary probe socket bind() failed\n"); + goto out; + } + if (0 > getsockname(s[0], &a, &((socklen_t) { sizeof(a) }))) { + perror("Temporary probe socket getsockname() failed\n"); + goto out; + } + if (0 > listen(s[0], 0)) { + perror("Temporary probe socket listen() failed\n"); + goto out; + } + if (0 <= connect(s[1], &a, sizeof(a)) || errno != EINPROGRESS) { + perror("Temporary probe socket connect() failed\n"); + goto out; + }This is assuming that a will now contain the correct address to connect to. Although it will have the right port, I think the address may still be unspecified for the listening socket.Right, because foo() isn't a prototype, while foo(void) is. Perhaps at some point we should enable -Wstrict-prototypes in CFLAGS (it's not in -Wextra, I just realised). Look: $ cat prototypes.c int a(void) { ; } int b() { ; } int main(char **argv) { a(); a(1); b(); b(1); } $ gcc prototypes.c prototypes.c: In function ‘main’: prototypes.c:3:30: error: too many arguments to function ‘a’ 3 | int main(char **argv) { a(); a(1); b(); b(1); } | ^ prototypes.c:1:5: note: declared here 1 | int a(void) { ; } | ^ note that calling b() with any number and type of arguments is fine. And: $ gcc -Wstrict-prototypes -Werror -Wfatal-errors prototypes.c prototypes.c:2:5: error: function declaration isn’t a prototype [-Werror=strict-prototypes] 2 | int b() { ; } | ^ compilation terminated due to -Wfatal-errors. cc1: all warnings being treated as errors[...] +/** tcp_probe_msg_peek_offset_cap() - Probe kernel for MSG_PEEK with offset support + */ +static bool tcp_probe_msg_peek_offset_cap()I believe we prefer the explicit foo(void) for declarations of functions with no parameters, rather than just foo().> [...]-- Stefano
On Thu, Jan 18, 2024 at 05:23:26PM +0100, Stefano Brivio wrote:Not a full review, but a couple of comments, mostly about stuff I also had in pkt_selfie.c (review of v1): On Thu, 18 Jan 2024 14:05:38 +1100 David Gibson <david(a)gibson.dropbear.id.au> wrote:Good point. Note that at present we're not bind()ing to an address either.On Sun, Jan 14, 2024 at 01:07:55PM -0500, Jon Maloy wrote:There are two advantages of bind() without port, and then getsockname(): first, ip_unprivileged_port_start might have whatever value in our new namespace (we don't touch it), and I wouldn't take for granted we'll have CAP_SYS_ADMIN in it for all the possible start-up combinations. Second, there's no need for a magic value.[...] + + s[0] = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); + s[1] = socket(AF_INET, SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP); + if (s[0] < 0 || s[1] < 0) { + perror("Temporary probe socket creation failed\n"); + goto out; + } + if (0 > bind(s[0], &a, sizeof(a))) {Since the socket address is unspecified, why do you need to bind at all? It might be clearer to explicitly set a to localhost + a specific port - because you're in a temporary namespace, you can rely on every port being available.But we've only bound ourselves to 0.0.0.0, which while perfectly cromulent for a listening socket, is no good for connect(). If we accept(), then the accepted socket will have a specific bound address, but the listening socket won't (I'm pretty sure I've actually tested this).Hmm, why? From getsockname(2): getsockname() returns the current address to which the socket sockfd is bound [...]+ perror("Temporary probe socket bind() failed\n"); + goto out; + } + if (0 > getsockname(s[0], &a, &((socklen_t) { sizeof(a) }))) { + perror("Temporary probe socket getsockname() failed\n"); + goto out; + } + if (0 > listen(s[0], 0)) { + perror("Temporary probe socket listen() failed\n"); + goto out; + } + if (0 <= connect(s[1], &a, sizeof(a)) || errno != EINPROGRESS) { + perror("Temporary probe socket connect() failed\n"); + goto out; + }This is assuming that a will now contain the correct address to connect to. Although it will have the right port, I think the address may still be unspecified for the listening socket.-- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibsonRight, because foo() isn't a prototype, while foo(void) is. Perhaps at some point we should enable -Wstrict-prototypes in CFLAGS (it's not in -Wextra, I just realised). Look: $ cat prototypes.c int a(void) { ; } int b() { ; } int main(char **argv) { a(); a(1); b(); b(1); } $ gcc prototypes.c prototypes.c: In function ‘main’: prototypes.c:3:30: error: too many arguments to function ‘a’ 3 | int main(char **argv) { a(); a(1); b(); b(1); } | ^ prototypes.c:1:5: note: declared here 1 | int a(void) { ; } | ^ note that calling b() with any number and type of arguments is fine. And: $ gcc -Wstrict-prototypes -Werror -Wfatal-errors prototypes.c prototypes.c:2:5: error: function declaration isn’t a prototype [-Werror=strict-prototypes] 2 | int b() { ; } | ^ compilation terminated due to -Wfatal-errors. cc1: all warnings being treated as errors[...] +/** tcp_probe_msg_peek_offset_cap() - Probe kernel for MSG_PEEK with offset support + */ +static bool tcp_probe_msg_peek_offset_cap()I believe we prefer the explicit foo(void) for declarations of functions with no parameters, rather than just foo().> [...]
On Fri, 19 Jan 2024 11:05:02 +1100 David Gibson <david(a)gibson.dropbear.id.au> wrote:On Thu, Jan 18, 2024 at 05:23:26PM +0100, Stefano Brivio wrote:Hah, "cromulent" just embiggened my dictionary! Why not, though? From RFC 6890, 2.2.2: +----------------------+----------------------------+ | Attribute | Value | +----------------------+----------------------------+ | Address Block | 0.0.0.0/8 | | Name | "This host on this network"| and: $ strace -e connect ./pkt_selfie --- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_EXITED, si_pid=891325, si_uid=1000, si_status=0, si_utime=0, si_stime=0} --- connect(5, {sa_family=AF_INET, sin_port=htons(51155), sin_addr=inet_addr("0.0.0.0")}, 16) = -1 EINPROGRESS (Operation now in progress) MSG_PEEK with offset not supported +++ exited with 0 +++ with pkt_selfie.c from review of v1: https://archives.passt.top/passt-dev/20231206160808.3d312733@elisabeth/ -- StefanoNot a full review, but a couple of comments, mostly about stuff I also had in pkt_selfie.c (review of v1): On Thu, 18 Jan 2024 14:05:38 +1100 David Gibson <david(a)gibson.dropbear.id.au> wrote:Good point. Note that at present we're not bind()ing to an address either.On Sun, Jan 14, 2024 at 01:07:55PM -0500, Jon Maloy wrote:There are two advantages of bind() without port, and then getsockname(): first, ip_unprivileged_port_start might have whatever value in our new namespace (we don't touch it), and I wouldn't take for granted we'll have CAP_SYS_ADMIN in it for all the possible start-up combinations. Second, there's no need for a magic value.[...] + + s[0] = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); + s[1] = socket(AF_INET, SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP); + if (s[0] < 0 || s[1] < 0) { + perror("Temporary probe socket creation failed\n"); + goto out; + } + if (0 > bind(s[0], &a, sizeof(a))) {Since the socket address is unspecified, why do you need to bind at all? It might be clearer to explicitly set a to localhost + a specific port - because you're in a temporary namespace, you can rely on every port being available.But we've only bound ourselves to 0.0.0.0, which while perfectly cromulent for a listening socket, is no good for connect().Hmm, why? From getsockname(2): getsockname() returns the current address to which the socket sockfd is bound [...]+ perror("Temporary probe socket bind() failed\n"); + goto out; + } + if (0 > getsockname(s[0], &a, &((socklen_t) { sizeof(a) }))) { + perror("Temporary probe socket getsockname() failed\n"); + goto out; + } + if (0 > listen(s[0], 0)) { + perror("Temporary probe socket listen() failed\n"); + goto out; + } + if (0 <= connect(s[1], &a, sizeof(a)) || errno != EINPROGRESS) { + perror("Temporary probe socket connect() failed\n"); + goto out; + }This is assuming that a will now contain the correct address to connect to. Although it will have the right port, I think the address may still be unspecified for the listening socket.
On Fri, Jan 19, 2024 at 11:45:05AM +0100, Stefano Brivio wrote:On Fri, 19 Jan 2024 11:05:02 +1100 David Gibson <david(a)gibson.dropbear.id.au> wrote:Huh. I never realised you could use 0.0.0.0 like that. I guess that makes it work, though I still feel like explicitly using localhost would be clearer.On Thu, Jan 18, 2024 at 05:23:26PM +0100, Stefano Brivio wrote:Hah, "cromulent" just embiggened my dictionary! Why not, though? From RFC 6890, 2.2.2: +----------------------+----------------------------+ | Attribute | Value | +----------------------+----------------------------+ | Address Block | 0.0.0.0/8 | | Name | "This host on this network"|Not a full review, but a couple of comments, mostly about stuff I also had in pkt_selfie.c (review of v1): On Thu, 18 Jan 2024 14:05:38 +1100 David Gibson <david(a)gibson.dropbear.id.au> wrote:Good point. Note that at present we're not bind()ing to an address either.On Sun, Jan 14, 2024 at 01:07:55PM -0500, Jon Maloy wrote: > > [...] > > + > + s[0] = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); > + s[1] = socket(AF_INET, SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP); > + if (s[0] < 0 || s[1] < 0) { > + perror("Temporary probe socket creation failed\n"); > + goto out; > + } > + if (0 > bind(s[0], &a, sizeof(a))) { Since the socket address is unspecified, why do you need to bind at all? It might be clearer to explicitly set a to localhost + a specific port - because you're in a temporary namespace, you can rely on every port being available.There are two advantages of bind() without port, and then getsockname(): first, ip_unprivileged_port_start might have whatever value in our new namespace (we don't touch it), and I wouldn't take for granted we'll have CAP_SYS_ADMIN in it for all the possible start-up combinations. Second, there's no need for a magic value.But we've only bound ourselves to 0.0.0.0, which while perfectly cromulent for a listening socket, is no good for connect().> + perror("Temporary probe socket bind() failed\n"); > + goto out; > + } > + if (0 > getsockname(s[0], &a, &((socklen_t) { sizeof(a) }))) { > + perror("Temporary probe socket getsockname() failed\n"); > + goto out; > + } > + if (0 > listen(s[0], 0)) { > + perror("Temporary probe socket listen() failed\n"); > + goto out; > + } > + if (0 <= connect(s[1], &a, sizeof(a)) || errno != EINPROGRESS) { > + perror("Temporary probe socket connect() failed\n"); > + goto out; > + } This is assuming that a will now contain the correct address to connect to. Although it will have the right port, I think the address may still be unspecified for the listening socket.Hmm, why? From getsockname(2): getsockname() returns the current address to which the socket sockfd is bound [...]and: $ strace -e connect ./pkt_selfie --- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_EXITED, si_pid=891325, si_uid=1000, si_status=0, si_utime=0, si_stime=0} --- connect(5, {sa_family=AF_INET, sin_port=htons(51155), sin_addr=inet_addr("0.0.0.0")}, 16) = -1 EINPROGRESS (Operation now in progress) MSG_PEEK with offset not supported +++ exited with 0 +++ with pkt_selfie.c from review of v1: https://archives.passt.top/passt-dev/20231206160808.3d312733@elisabeth/-- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson