Stefano Brivio (24): conf, util, tap: Implement --trace option for extra verbose logging pcap: Fix mistake in printed string util: Drop CHECK_SET_MIN_MAX{,_PROTO_FD} macros util: Use standard int types tcp: Refactor to use events instead of states, split out spliced implementation test/lib/video: Fill in href attributes of video shortcuts udp: Drop _splice from recv, send, sendto static buffer names udp: Split buffer queueing/writing parts of udp_sock_handler() dhcpv6, tap, tcp: Use IN6_ARE_ADDR_EQUAL instead of open-coded memcmp() udp: Use flags for local, loopback, and configured unicast binds Makefile: Enable a few hardening flags test: Add asciinema(1) as requirement for CI in README test, seccomp, Makefile: Switch to valgrind runs for passt functional tests tcp, udp, util: Enforce 24-bit limit on socket numbers tcp: Rework timers to use timerfd instead of periodic bitmap scan tcp_splice: Close sockets right away on high number of open files test/perf: Work-around for virtio_net hang before long streams from guest README: Avoid "here" links README: Update Interfaces and Availability sections tcp: Fit struct tcp_conn into a single 64-byte cacheline dhcp: Minimum option length implied by RFC 951 is 60 bytes, not 62 tcp, tcp_splice: Use less awkward syntax to swap in/out sockets from pools util: Fix function declaration style of write_pidfile() treewide: Packet abstraction with mandatory boundary checks Makefile | 18 +- README.md | 58 +- arp.c | 51 +- arp.h | 2 +- conf.c | 15 + dhcp.c | 62 +- dhcp.h | 2 +- dhcpv6.c | 151 +- dhcpv6.h | 3 +- icmp.c | 28 +- icmp.h | 4 +- ndp.c | 59 +- ndp.h | 3 +- packet.c | 134 ++ packet.h | 77 + passt.1 | 5 + passt.c | 13 +- passt.h | 9 +- pcap.c | 2 +- seccomp.sh | 1 + tap.c | 331 ++-- tcp.c | 3023 +++++++++++++++---------------------- tcp.h | 27 +- tcp_splice.c | 888 +++++++++++ tcp_splice.h | 15 + test/README.md | 10 +- test/lib/setup | 52 +- test/lib/video | 2 +- test/perf/passt_tcp | 15 + test/perf/passt_udp | 15 + test/run | 
9 + test/valgrind.supp | 9 + test/valgrind/passt | 22 + test/valgrind/passt_in_ns | 22 + udp.c | 501 +++--- udp.h | 6 +- util.c | 95 +- util.h | 43 +- 38 files changed, 3313 insertions(+), 2469 deletions(-) create mode 100644 packet.c create mode 100644 packet.h create mode 100644 tcp_splice.c create mode 100644 tcp_splice.h create mode 100644 test/valgrind.supp create mode 100644 test/valgrind/passt create mode 100644 test/valgrind/passt_in_ns -- 2.35.1
--debug can be a bit too noisy, especially as single packets or socket messages are logged: implement a new option, --trace, implying --debug, that enables all debug messages. Signed-off-by: Stefano Brivio <sbrivio(a)redhat.com> --- conf.c | 15 +++++++++++++++ passt.1 | 5 +++++ passt.c | 3 ++- passt.h | 2 ++ tap.c | 4 ++-- test/README.md | 6 +++++- test/lib/setup | 10 ++++++++++ util.c | 6 ++++++ util.h | 8 ++++++++ 9 files changed, 55 insertions(+), 4 deletions(-) diff --git a/conf.c b/conf.c index 08f24be..5170163 100644 --- a/conf.c +++ b/conf.c @@ -558,6 +558,7 @@ static void usage(const char *name) info(""); info( " -d, --debug Be verbose, don't run in background"); + info( " --trace Be extra verbose, implies --debug"); info( " -q, --quiet Don't print informational messages"); info( " -f, --foreground Don't run in background"); info( " default: run in background if started from a TTY"); @@ -829,6 +830,7 @@ void conf(struct ctx *c, int argc, char **argv) {"no-dhcp-search", no_argument, NULL, 8 }, {"dns-forward", required_argument, NULL, 9 }, {"no-netns-quit", no_argument, NULL, 10 }, + {"trace", no_argument, NULL, 11 }, { 0 }, }; struct get_bound_ports_ns_arg ns_ports_arg = { .c = c }; @@ -960,6 +962,19 @@ void conf(struct ctx *c, int argc, char **argv) } c->no_netns_quit = 1; break; + case 11: + if (c->trace) { + err("Multiple --trace options given"); + usage(argv[0]); + } + + if (c->quiet) { + err("Either --trace or --quiet"); + usage(argv[0]); + } + + c->trace = c->debug = c->foreground = 1; + break; case 'd': if (c->debug) { err("Multiple --debug options given"); diff --git a/passt.1 b/passt.1 index 57cf745..0252fbb 100644 --- a/passt.1 +++ b/passt.1 @@ -74,6 +74,11 @@ for performance reasons. .BR \-d ", " \-\-debug Be verbose, don't run in background. +.TP +.BR \-\-trace +Be extra verbose, show single packets, don't run in background. Implies +\fB--debug\fR. + .TP .BR \-q ", " \-\-quiet Don't print informational messages. 
diff --git a/passt.c b/passt.c index 40d3e57..5cd8f3b 100644 --- a/passt.c +++ b/passt.c @@ -96,7 +96,7 @@ char *ip_proto_str[IPPROTO_SCTP + 1] = { static void sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events, struct timespec *now) { - debug("%s: %s packet from socket %i (events: 0x%08x)", + trace("%s: %s packet from socket %i (events: 0x%08x)", c->mode == MODE_PASST ? "passt" : "pasta", IP_PROTO_STR(ref.r.proto), ref.r.s, events); @@ -351,6 +351,7 @@ int main(int argc, char **argv) __setlogmask(LOG_MASK(LOG_EMERG)); conf(&c, argc, argv); + trace_init(c.trace); if (!c.debug && (c.stderr || isatty(fileno(stdout)))) __openlog(log_name, LOG_PERROR, LOG_DAEMON); diff --git a/passt.h b/passt.h index 042f760..8344fca 100644 --- a/passt.h +++ b/passt.h @@ -92,6 +92,7 @@ enum passt_modes { * struct ctx - Execution context * @mode: Operation mode, qemu/UNIX domain socket or namespace/tap * @debug: Enable debug mode + * @trace: Enable tracing (extra debug) mode * @quiet: Don't print informational messages * @foreground: Run in foreground, don't log to stderr by default * @stderr: Force logging to stderr @@ -153,6 +154,7 @@ enum passt_modes { struct ctx { enum passt_modes mode; int debug; + int trace; int quiet; int foreground; int stderr; diff --git a/tap.c b/tap.c index 29fcd51..e1854fb 100644 --- a/tap.c +++ b/tap.c @@ -283,14 +283,14 @@ static void tap_packet_debug(struct iphdr *iph, struct ipv6hdr *ip6h, } if (proto == IPPROTO_TCP || proto == IPPROTO_UDP) { - debug("protocol %i from tap: %s:%i -> %s:%i (%i packet%s)", + trace("protocol %i from tap: %s:%i -> %s:%i (%i packet%s)", proto, seq4 ? buf4s : buf6s, ntohs(seq4 ? seq4->source : seq6->source), seq4 ? buf4d : buf6d, ntohs(seq4 ? seq4->dest : seq6->dest), count, count == 1 ? "" : "s"); } else { - debug("protocol %i from tap: %s -> %s (%i packet%s)", + trace("protocol %i from tap: %s -> %s (%i packet%s)", proto, iph ? buf4s : buf6s, iph ? buf4d : buf6d, count, count == 1 ? 
"" : "s"); } diff --git a/test/README.md b/test/README.md index 88270e8..cdf233b 100644 --- a/test/README.md +++ b/test/README.md @@ -63,7 +63,11 @@ Just issue: ./run -from the `test` directory. Elevated privileges are not needed. +from the `test` directory. Elevated privileges are not needed. Environment +variable settings: DEBUG=1 enables debugging messages, TRACE=1 enables tracing +(further debugging messages), PCAP=1 enables packet captures. Example: + + PCAP=1 TRACE=1 ./run ## Continuous integration diff --git a/test/lib/setup b/test/lib/setup index f04949e..a39eb80 100755 --- a/test/lib/setup +++ b/test/lib/setup @@ -49,6 +49,8 @@ setup_passt() { __opts= [ ${PCAP} -eq 1 ] && __opts="${__opts} -p /tmp/passt.pcap" [ ${DEBUG} -eq 1 ] && __opts="${__opts} -d" + [ ${TRACE} -eq 1 ] && __opts="${__opts} --trace" + pane_run PASST "./passt ${__opts} -f -t 10001 -u 10001" sleep 1 @@ -90,6 +92,7 @@ setup_pasta() { __opts= [ ${PCAP} -eq 1 ] && __opts="${__opts} -p /tmp/pasta.pcap" [ ${DEBUG} -eq 1 ] && __opts="${__opts} -d" + [ ${TRACE} -eq 1 ] && __opts="${__opts} --trace" pane_run PASST "./pasta ${__opts} -f -t 10002 -T 10003 -u 10002 -U 10003 ${__target_pid}" sleep 1 @@ -118,6 +121,7 @@ setup_passt_in_ns() { __opts= [ ${PCAP} -eq 1 ] && __opts="${__opts} -p /tmp/pasta_with_passt.pcap" [ ${DEBUG} -eq 1 ] && __opts="${__opts} -d" + [ ${TRACE} -eq 1 ] && __opts="${__opts} --trace" __pid_file="$(mktemp)" pane_run PASST "./pasta ${__opts} -t 10001,10002,10011,10012 -T 10003,10013 -u 10001,10002,10011,10012 -U 10003,10013 -P ${__pid_file}" @@ -145,6 +149,7 @@ setup_passt_in_ns() { __opts= [ ${PCAP} -eq 1 ] && __opts="${__opts} -p /tmp/passt_in_pasta.pcap" [ ${DEBUG} -eq 1 ] && __opts="${__opts} -d" + [ ${TRACE} -eq 1 ] && __opts="${__opts} --trace" #pane_run PASST "valgrind --max-stackframe=3000000 ./passt -f ${__opts} -t 10001,10011,10021,10031 -u 10001,10011,10021,10031" pane_run PASST "./passt -f ${__opts} -t 10001,10011,10021,10031 -u 10001,10011,10021,10031" @@ -183,11 
+188,13 @@ setup_two_guests() { __opts= [ ${PCAP} -eq 1 ] && __opts="${__opts} -p /tmp/pasta_1.pcap" [ ${DEBUG} -eq 1 ] && __opts="${__opts} -d" + [ ${TRACE} -eq 1 ] && __opts="${__opts} --trace" pane_run PASST_1 "./pasta ${__opts} -P ${__pid1_file} -t 10001,10002 -T 10003,10004 -u 10001,10002 -U 10003,10004" __opts= [ ${PCAP} -eq 1 ] && __opts="${__opts} -p /tmp/pasta_2.pcap" [ ${DEBUG} -eq 1 ] && __opts="${__opts} -d" + [ ${TRACE} -eq 1 ] && __opts="${__opts} --trace" pane_run PASST_2 "./pasta ${__opts} -P ${__pid2_file} -t 10004,10005 -T 10003,10001 -u 10004,10005 -U 10003,10001" sleep 1 @@ -223,12 +230,15 @@ setup_two_guests() { __opts= [ ${PCAP} -eq 1 ] && __opts="${__opts} -p /tmp/passt_1.pcap" [ ${DEBUG} -eq 1 ] && __opts="${__opts} -d" + [ ${TRACE} -eq 1 ] && __opts="${__opts} --trace" + pane_run PASST_1 "./passt -f ${__opts} -t 10001 -u 10001" sleep 1 __opts= [ ${PCAP} -eq 1 ] && __opts="${__opts} -p /tmp/passt_2.pcap" [ ${DEBUG} -eq 1 ] && __opts="${__opts} -d" + [ ${TRACE} -eq 1 ] && __opts="${__opts} --trace" pane_run PASST_2 "./passt -f ${__opts} -t 10004 -u 10004" pane_run GUEST_2 'cp mbuto.img mbuto_2.img' diff --git a/util.c b/util.c index 90b5ab8..50b83db 100644 --- a/util.c +++ b/util.c @@ -45,6 +45,7 @@ static int log_sock = -1; static char log_ident[BUFSIZ]; static int log_opt; static time_t log_debug_start; +int log_trace; #define logfn(name, level) \ void name(const char *format, ...) { \ @@ -77,6 +78,11 @@ logfn(warn, LOG_WARNING) logfn(info, LOG_INFO) logfn(debug, LOG_DEBUG) +void trace_init(int enable) +{ + log_trace = enable; +} + /** * __openlog() - Non-optional openlog() wrapper, to allow custom vsyslog() * @ident: openlog() identity (program name) diff --git a/util.h b/util.h index b7852e9..bfab221 100644 --- a/util.h +++ b/util.h @@ -8,6 +8,14 @@ void warn(const char *format, ...); void info(const char *format, ...); void debug(const char *format, ...); +extern int log_trace; +void trace_init(int enable); +#define trace(format, ...) 
\ + do { \ + if (log_trace) \ + debug(format, ##__VA_ARGS__); \ + } while (0) + #ifndef SECCOMP_RET_KILL_PROCESS #define SECCOMP_RET_KILL_PROCESS SECCOMP_RET_KILL #endif -- 2.35.1
Packets are saved *to* a file, not *at* it. Signed-off-by: Stefano Brivio <sbrivio(a)redhat.com> --- pcap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pcap.c b/pcap.c index 9c617ce..a95a5ee 100644 --- a/pcap.c +++ b/pcap.c @@ -207,7 +207,7 @@ void pcap_init(struct ctx *c) return; } - info("Saving packet capture at %s", c->pcap); + info("Saving packet capture to %s", c->pcap); if (write(pcap_fd, &pcap_hdr, sizeof(pcap_hdr)) < 0) warn("Cannot write PCAP header: %s", strerror(errno)); -- 2.35.1
...those were used when epoll references used to be socket numbers, they should have gone away a long time ago. Signed-off-by: Stefano Brivio <sbrivio(a)redhat.com> --- util.h | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/util.h b/util.h index bfab221..d80157c 100644 --- a/util.h +++ b/util.h @@ -26,20 +26,6 @@ void trace_init(int enable); #define ETH_MIN_MTU 68 #endif -#define CHECK_SET_MIN_MAX(basename, fd) \ - do { \ - if ((fd) < basename##min) \ - basename##min = (fd); \ - if ((fd) > basename##max) \ - basename##max = (fd); \ - } while (0) - -#define CHECK_SET_MIN_MAX_PROTO_FD(proto, ipproto, proto_ctx, fd) \ - do { \ - if ((proto) == (ipproto)) \ - CHECK_SET_MIN_MAX(c->proto_ctx.fd_, (fd)); \ - } while (0) - #ifndef MIN #define MIN(x, y) (((x) < (y)) ? (x) : (y)) #endif -- 2.35.1
...instead of kernel-like short notations. Signed-off-by: Stefano Brivio <sbrivio(a)redhat.com> --- util.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/util.h b/util.h index d80157c..e314c71 100644 --- a/util.h +++ b/util.h @@ -164,7 +164,7 @@ struct ctx; struct ipv6hdr { #pragma GCC diagnostic ignored "-Wpedantic" #if __BYTE_ORDER == __BIG_ENDIAN - __u8 version:4, + uint8_t version:4, priority:4; #else uint8_t priority:4, @@ -173,17 +173,17 @@ struct ipv6hdr { #pragma GCC diagnostic pop uint8_t flow_lbl[3]; - __be16 payload_len; - __u8 nexthdr; - __u8 hop_limit; + uint16_t payload_len; + uint8_t nexthdr; + uint8_t hop_limit; struct in6_addr saddr; struct in6_addr daddr; }; struct ipv6_opt_hdr { - __u8 nexthdr; - __u8 hdrlen; + uint8_t nexthdr; + uint8_t hdrlen; /* * TLV encoded option data follows. */ -- 2.35.1
Using events and flags instead of states makes the implementation much more straightforward: actions are mostly centered on events that occurred on the connection rather than states. An example is given by the ESTABLISHED_SOCK_FIN_SENT and FIN_WAIT_1_SOCK_FIN abominations: we don't actually care about which side started closing the connection to handle closing of connection halves. Split out the spliced implementation, as it has very little in common with the "regular" TCP path. Refactor things here and there to improve clarity. Add helpers to trace where resets and flag settings come from. No functional changes intended. Signed-off-by: Stefano Brivio <sbrivio(a)redhat.com> --- passt.c | 4 +- tcp.c | 2384 ++++++++++++++++++-------------------------------- tcp.h | 10 +- tcp_splice.c | 859 ++++++++++++++++++ tcp_splice.h | 14 + util.c | 19 + util.h | 4 +- 7 files changed, 1773 insertions(+), 1521 deletions(-) create mode 100644 tcp_splice.c create mode 100644 tcp_splice.h diff --git a/passt.c b/passt.c index 5cd8f3b..6c04266 100644 --- a/passt.c +++ b/passt.c @@ -119,12 +119,12 @@ static void post_handler(struct ctx *c, struct timespec *now) #define CALL_PROTO_HANDLER(c, now, lc, uc) \ do { \ extern void \ - lc ## _defer_handler (struct ctx *c) \ + lc ## _defer_handler (struct ctx *, struct timespec *) \ __attribute__ ((weak)); \ \ if (!c->no_ ## lc) { \ if (lc ## _defer_handler) \ - lc ## _defer_handler(c); \ + lc ## _defer_handler(c, now); \ \ if (timespec_diff_ms((now), &c->lc.timer_run) \= uc ## _TIMER_INTERVAL) { \diff --git a/tcp.c b/tcp.c index 4dc9750..968db97 100644 --- a/tcp.c +++ b/tcp.c @@ -8,7 +8,7 @@ * * tcp.c - TCP L2-L4 translation state machine * - * Copyright (c) 2020-2021 Red Hat GmbH + * Copyright (c) 2020-2022 Red Hat GmbH * Author: Stefano Brivio <sbrivio(a)redhat.com> */ @@ -52,7 +52,7 @@ * delegated as much as possible to the TCP implementations of guest and host * kernel. 
This is achieved by: * - avoiding a complete TCP stack reimplementation, with a modified TCP state - * machine focused on the translation of observed states instead + * machine focused on the translation of observed events instead * - mirroring TCP dynamics as described above and hence avoiding the need for * segmentation, explicit queueing, and reassembly of segments * - security: @@ -98,14 +98,14 @@ * Connection tracking and storage * ------------------------------- * - * Connections are tracked by the @tt array of struct tcp_tap_conn, containing + * Connections are tracked by the @tc array of struct tcp_conn, containing * addresses, ports, TCP states and parameters. This is statically allocated and * indexed by an arbitrary connection number. The array is compacted whenever a * connection is closed, by remapping the highest connection index in use to the * one freed up. * * References used for the epoll interface report the connection index used for - * the @tt array. + * the @tc array. * * IPv4 addresses are stored as IPv4-mapped IPv6 addresses to avoid the need for * separate data structures depending on the protocol version. @@ -127,64 +127,40 @@ * added to the epoll list, with no separate storage. * * - * States and events + * Events and states * ----------------- * - * These states apply to connected sockets only, listening sockets are always - * open after initialisation, in LISTEN state. A single state is maintained for - * both sides of the connection, and some states are omitted as they are already - * handled by host kernel and guest. - * - * - CLOSED no connection - * No associated events: this is always a final state, new connections - * directly start from TAP_SYN_SENT or SOCK_SYN_SENT described below. 
- * - * - TAP_SYN_SENT connect() in progress, triggered from tap - * - connect() completes SYN,ACK to tap > TAP_SYN_RCVD - * - connect() aborts RST to tap, close socket > CLOSED - * - * - SOCK_SYN_SENT new connected socket, SYN sent to tap - * - SYN,ACK from tap ACK to tap > ESTABLISHED - * - SYN,ACK timeout RST to tap, close socket > CLOSED - * - * - TAP_SYN_RCVD connect() completed, SYN,ACK sent to tap - * - FIN from tap write shutdown > FIN_WAIT_1 - * - ACK from tap > ESTABLISHED - * - ACK timeout RST to tap, close socket > CLOSED - * - * - ESTABLISHED connection established, ready for data - * - EPOLLRDHUP read shutdown > ESTABLISHED_SOCK_FIN - * - FIN from tap write shutdown > FIN_WAIT_1 - * - EPOLLHUP RST to tap, close socket > CLOSED - * - data timeout read shutdown, FIN to tap > - * ESTABLISHED_SOCK_FIN_SENT - * - * - ESTABLISHED_SOCK_FIN socket closing connection, reading half closed - * - zero-sized socket read FIN,ACK to tap > ESTABLISHED_SOCK_FIN_SENT - * - * - ESTABLISHED_SOCK_FIN_SENT socket closing connection, FIN sent to tap - * - ACK (for FIN) from tap > CLOSE_WAIT - * - tap ACK timeout RST to tap, close socket > CLOSED - * - * - CLOSE_WAIT socket closing connection, ACK from tap - * - FIN from tap write shutdown > LAST_ACK - * - data timeout RST to tap, close socket > CLOSED - * - * - LAST_ACK socket started close, tap completed it - * - any event from socket ACK to tap, close socket > CLOSED - * - ACK timeout RST to tap, close socket > CLOSED + * Instead of tracking connection states using a state machine, connection + * events are used to determine state and actions for a given connection. This + * makes the implementation simpler as most of the relevant tasks deal with + * reactions to events, rather than state-associated actions. For user + * convenience, approximate states are mapped in logs from events by + * @tcp_state_str. 
+ * + * The events are: + * + * - SOCK_ACCEPTED connection accepted from socket, SYN sent to tap/guest + * + * - TAP_SYN_RCVD tap/guest initiated connection, SYN received + * + * - TAP_SYN_ACK_SENT SYN, ACK sent to tap/guest, valid for TAP_SYN_RCVD only + * + * - ESTABLISHED connection established, the following events are valid: + * + * - SOCK_FIN_RCVD FIN (EPOLLRDHUP) received from socket + * + * - SOCK_FIN_SENT FIN (write shutdown) sent to socket * - * - FIN_WAIT_1 tap closing connection, FIN sent to socket - * - EPOLLRDHUP FIN,ACK to tap, shutdown > FIN_WAIT_1_SOCK_FIN - * - socket timeout RST to tap, close socket > CLOSED + * - TAP_FIN_RCVD FIN received from tap/guest * - * - FIN_WAIT_1_SOCK_FIN tap closing connection, FIN received from socket - * - ACK from tap close socket > CLOSED - * - tap ACK timeout RST to tap, close socket > CLOSED + * - TAP_FIN_SENT FIN sent to tap/guest * - * - from any state - * - RST from tap close socket > CLOSED - * - socket error RST to tap, close socket > CLOSED + * - TAP_FIN_ACKED ACK to FIN seen from tap/guest + * + * Setting any event in CONN_STATE_BITS (SOCK_ACCEPTED, TAP_SYN_RCVD, + * ESTABLISHED) clears all the other events, as those represent the fundamental + * connection states. No events (events == CLOSED) means the connection is + * closed. 
* * Connection setup * ---------------- @@ -201,76 +177,75 @@ * Aging and timeout * ----------------- * - * A bitmap of TCP_MAX_CONNS bits indicate the connections subject to timed - * events based on states: - * - SOCK_SYN_SENT: after a 2MSL (240s) timeout waiting for a SYN,ACK segment - * from tap expires, connection is reset (RST to tap, socket closed) - * - TAP_SYN_RCVD: after a 2MSL (240s) timeout waiting for an ACK segment from - * tap expires, connection is reset (RST to tap, socket closed) - * - TAP_SYN_SENT: connect() is pending, timeout is handled implicitly by - * connect() timeout, connection will be reset in case - * - ESTABLISHED, ESTABLISHED_SOCK_FIN: if an ACK segment to tap is pending, - * bytes acknowledged by socket endpoint are checked every 50ms (one quarter - * of current TCP_DELACK_MAX on Linux) - * - ESTABLISHED, ESTABLISHED_SOCK_FIN: after a timeout of 3s (TODO: implement - * requirements from RFC 6298) waiting for an ACK segment from tap expires, - * data from socket queue is retransmitted starting from the last ACK sequence - * - ESTABLISHED, ESTABLISHED_SOCK_FIN: after a two hours (current - * TCP_KEEPALIVE_TIME on Linux) timeout waiting for any activity expires, - * connection is reset (RST to tap, socket closed) - * - ESTABLISHED_SOCK_FIN: after a 2MSL (240s) timeout waiting for an ACK - * segment from tap expires, connection is reset (RST to tap, socket closed) - * - CLOSE_WAIT: after a 2MSL (240s) timeout waiting for a FIN segment from tap - * expires, connection is reset (RST to tap, socket closed) - * - FIN_WAIT_1: after a 2MSL (240s) timeout waiting for an ACK segment from - * socet expires, connection is reset (RST to tap, socket closed) - * - FIN_WAIT_1_SOCK_FIN: after a 2MSL (240s) timeout waiting for an ACK segment - * from tap expires, connection is reset (RST to tap, socket closed) - * - LAST_ACK: after a 2MSL (240s) timeout waiting for an ACK segment from - * socket expires, connection is reset (RST to tap, socket closed) - 
* - * - * Data flows (from ESTABLISHED, ESTABLISHED_SOCK_FIN states) - * ---------------------------------------------------------- - * - * @seq_to_tap: next sequence for packets to tap - * @seq_ack_from_tap: last ACK number received from tap - * @seq_from_tap: next sequence for packets from tap (not actually sent) - * @seq_ack_to_tap: last ACK number sent to tap - * - * @seq_init_from_tap: initial sequence number from tap + * Open connections are checked periodically against a number of timeouts. Those + * are: + * + * - SYN_TIMEOUT: if no ACK is received from tap/guest during handshake within + * this time, reset the connection + * + * - ACT_TIMEOUT, in the presence of any event: if no activity is detected on + * either side, the connection is reset + * + * - ACK_INTERVAL, or zero-sized window advertised to tap/guest: forcibly check + * if an ACK segment can be sent + * + * - ACK_TIMEOUT: if no ACK segment was received from tap/guest, after sending + * data, re-send data from the socket and reset sequence to what was + * acknowledged. 
If this persists for longer than LAST_ACK_TIMEOUT, reset the + connection + * + * - FIN_TIMEOUT, on TAP_FIN_SENT: if no ACK is received for the FIN segment + within this time, the connection is reset + * + * - FIN_TIMEOUT, on SOCK_FIN_SENT: if no activity is detected on the socket + after sending a FIN segment (write shutdown), reset the connection + * + * - LAST_ACK_TIMEOUT on SOCK_FIN_SENT *and* SOCK_FIN_RCVD: reset the connection + if no activity was detected on any of the two sides after sending a FIN + segment + * + * + * Summary of data flows (with ESTABLISHED event) + * ---------------------------------------------- + * + * @seq_to_tap: next sequence for packets to tap/guest + * @seq_ack_from_tap: last ACK number received from tap/guest + * @seq_from_tap: next sequence for packets from tap/guest (expected) + * @seq_ack_to_tap: last ACK number sent to tap/guest + * + * @seq_init_from_tap: initial sequence number from tap/guest + * @seq_init_to_tap: initial sequence number to tap/guest * * @wnd_from_tap: last window size received from tap, scaled - * - * - from socket to tap: + * @wnd_to_tap: last window size advertised to tap, scaled + * + * - from socket to tap/guest: * - on new data from socket: * - peek into buffer - * - send data to tap: + * - send data to tap/guest: * - starting at offset (@seq_to_tap - @seq_ack_from_tap) * - in MSS-sized segments * - increasing @seq_to_tap at each segment * - up to window (until @seq_to_tap - @seq_ack_from_tap <= @wnd_from_tap) - * - mark socket in bitmap for periodic ACK check, set @last_ts_to_tap - * - on read error, send RST to tap, close socket - * - on zero read, send FIN to tap, enter ESTABLISHED_SOCK_FIN - * - on ACK from tap: - * - set @ts_ack_tap + * - on read error, send RST to tap/guest, close socket + * - on zero read, send FIN to tap/guest, set TAP_FIN_SENT + * - on ACK from tap/guest: + * - set @ts_ack_from_tap * - check if it's the second duplicated ACK * - consume buffer by difference
between new ack_seq and @seq_ack_from_tap * - update @seq_ack_from_tap from ack_seq in header * - on two duplicated ACKs, reset @seq_to_tap to @seq_ack_from_tap, and * resend with steps listed above * - set TCP_WINDOW_CLAMP from TCP header from tap - * - on @seq_ack_from_tap == @seq_to_tap, mark in bitmap, umark otherwise * - periodically: * - if @seq_ack_from_tap < @seq_to_tap and the retransmission timer * (TODO: implement requirements from RFC 6298, currently 3s fixed) from - * @ts_tap_from_ack elapsed, reset @seq_to_tap to @seq_ack_from_tap, and + * @ts_ack_from_tap elapsed, reset @seq_to_tap to @seq_ack_from_tap, and * resend data with the steps listed above * - * - from tap to socket: - * - on packet from tap: - * - set @ts_tap_ack + * - from tap/guest to socket: + * - on packet from tap/guest: + * - set @ts_tap_act * - set TCP_WINDOW_CLAMP from TCP header from tap * - check seq from header against @seq_from_tap, if data is missing, send * two ACKs with number @seq_ack_to_tap, discard packet @@ -279,7 +254,7 @@ * - in ESTABLISHED state, send ACK to tap as soon as we queue to the * socket. In other states, query socket for TCP_INFO, set * @seq_ack_to_tap to (tcpi_bytes_acked + @seq_init_from_tap) % 2^32 and - * send ACK to tap + * send ACK to tap/guest * * * PASTA mode @@ -291,20 +266,7 @@ * section. * * For local traffic directed to TCP ports configured for direct mapping between - * namespaces, the implementation is substantially simpler: packets are directly - * translated between L4 sockets using a pair of splice() syscalls. 
These - * connections are tracked in the @ts array of struct tcp_splice_conn, using - * these states: - * - * - CLOSED: no connection - * - SPLICE_ACCEPTED: accept() on the listening socket succeeded - * - SPLICE_CONNECT: connect() issued in the destination namespace - * - SPLICE_ESTABLISHED: connect() succeeded, packets are transferred - * - SPLICE_FIN_FROM: FIN (EPOLLRDHUP) seen from originating socket - * - SPLICE_FIN_TO: FIN (EPOLLRDHUP) seen from connected socket - * - SPLICE_FIN_BOTH: FIN (EPOLLRDHUP) seen from both sides - * - * #syscalls:pasta pipe2|pipe fcntl armv6l:fcntl64 armv7l:fcntl64 ppc64:fcntl64 + * namespaces, see the implementation in tcp_splice.c. */ #include <sched.h> @@ -339,15 +301,13 @@ #include "siphash.h" #include "pcap.h" #include "conf.h" +#include "tcp_splice.h" #define MAX_TAP_CONNS (128 * 1024) -#define MAX_SPLICE_CONNS (128 * 1024) -#define TCP_TAP_FRAMES_MEM 256 -#define TCP_TAP_FRAMES \ - (c->mode == MODE_PASST ? TCP_TAP_FRAMES_MEM : 1) - -#define MAX_PIPE_SIZE (2UL * 1024 * 1024) +#define TCP_FRAMES_MEM 256 +#define TCP_FRAMES \ + (c->mode == MODE_PASST ? 
TCP_FRAMES_MEM : 1) #define TCP_HASH_TABLE_LOAD 70 /* % */ #define TCP_HASH_TABLE_SIZE (MAX_TAP_CONNS * 100 / \ @@ -375,9 +335,7 @@ #define FIN_TIMEOUT 240000 #define LAST_ACK_TIMEOUT 240000 -#define TCP_SOCK_POOL_SIZE 32 #define TCP_SOCK_POOL_TSH 16 /* Refill in ns if > x used */ -#define TCP_SPLICE_PIPE_POOL_SIZE 16 #define REFILL_INTERVAL 1000 #define PORT_DETECT_INTERVAL 1000 @@ -395,45 +353,13 @@ #define SEQ_GE(a, b) ((a) - (b) < MAX_WINDOW) #define SEQ_GT(a, b) ((a) - (b) - 1 < MAX_WINDOW) -#define CONN_V4(conn) (IN6_IS_ADDR_V4MAPPED(&conn->a.a6)) -#define CONN_V6(conn) (!CONN_V4(conn)) - -enum tcp_state { - CLOSED = 0, - TAP_SYN_SENT, - SOCK_SYN_SENT, - TAP_SYN_RCVD, - ESTABLISHED, - ESTABLISHED_SOCK_FIN, - ESTABLISHED_SOCK_FIN_SENT, - CLOSE_WAIT, - LAST_ACK, - FIN_WAIT_1, - FIN_WAIT_1_SOCK_FIN, - SPLICE_ACCEPTED, - SPLICE_CONNECT, - SPLICE_ESTABLISHED, - SPLICE_FIN_FROM, - SPLICE_FIN_TO, - SPLICE_FIN_BOTH, -}; -#define TCP_STATE_STR_SIZE (SPLICE_FIN_BOTH + 1) - -static char *tcp_state_str[TCP_STATE_STR_SIZE] __attribute((__unused__)) = { - "CLOSED", "TAP_SYN_SENT", "SOCK_SYN_SENT", "TAP_SYN_RCVD", - "ESTABLISHED", "ESTABLISHED_SOCK_FIN", "ESTABLISHED_SOCK_FIN_SENT", - "CLOSE_WAIT", "LAST_ACK", "FIN_WAIT_1", "FIN_WAIT_1_SOCK_FIN", - "SPLICE_ACCEPTED", "SPLICE_CONNECT", "SPLICE_ESTABLISHED", - "SPLICE_FIN_FROM", "SPLICE_FIN_TO", "SPLICE_FIN_BOTH", -}; - #define FIN (1 << 0) #define SYN (1 << 1) #define RST (1 << 2) #define ACK (1 << 4) /* Flags for internal usage */ #define DUP_ACK (1 << 5) -#define FORCE_ACK (1 << 6) +#define ACK_IF_NEEDED 0 /* See tcp_send_flag() */ #define OPT_EOL 0 #define OPT_NOP 1 @@ -445,10 +371,10 @@ static char *tcp_state_str[TCP_STATE_STR_SIZE] __attribute((__unused__)) = { #define OPT_SACK 5 #define OPT_TS 8 -struct tcp_tap_conn; +struct tcp_conn; /** - * struct tcp_tap_conn - Descriptor for a TCP connection via tap (not spliced) + * struct tcp_conn - Descriptor for a TCP connection (not spliced) * @next: Pointer to next item in 
hash chain, if any * @sock: Socket descriptor number * @hash_bucket: Bucket index in connection lookup hash table @@ -458,8 +384,9 @@ struct tcp_tap_conn; * @a.a4.a: IPv4 address * @tap_port: Guest-facing tap port * @sock_port: Remote, socket-facing port - * @local: Destination is local - * @state: TCP connection state + * @events: Connection events, implying connection states + * @flags: Connection flags representing internal attributes + * @tap_mss: Maximum segment size advertised by guest * @seq_to_tap: Next sequence for packets to tap * @seq_ack_from_tap: Last ACK number received from tap * @seq_from_tap: Next sequence for packets from tap (not actually sent) @@ -471,17 +398,15 @@ struct tcp_tap_conn; * @ws: Window scaling factor * @wnd_from_tap: Last window size received from tap, scaled * @wnd_to_tap: Socket-side sending window, advertised to tap - * @window_clamped: Window was clamped on socket at least once + * @snd_buf: Socket sending buffer reported by kernel, in bytes * @ts_sock_act: Last activity timestamp from socket for timeout purposes * @ts_tap_act: Last activity timestamp from tap for timeout purposes * @ts_ack_from_tap: Last ACK segment timestamp from tap * @ts_ack_to_tap: Last ACK segment timestamp to tap * @tap_data_noack: Last unacked data to tap, set to { 0, 0 } on ACK - * @mss_guest: Maximum segment size advertised by guest - * @events: epoll events currently enabled for socket */ -struct tcp_tap_conn { - struct tcp_tap_conn *next; +struct tcp_conn { + struct tcp_conn *next; int sock; int hash_bucket; @@ -493,10 +418,35 @@ struct tcp_tap_conn { struct in_addr a; } a4; } a; +#define CONN_V4(conn) IN6_IS_ADDR_V4MAPPED(&conn->a.a6) +#define CONN_V6(conn) (!CONN_V4(conn)) + in_port_t tap_port; in_port_t sock_port; - int local; - enum tcp_state state; + + uint8_t events; +#define CLOSED 0 +#define SOCK_ACCEPTED BIT(0) /* implies SYN sent to tap */ +#define TAP_SYN_RCVD BIT(1) /* implies socket connecting */ +#define TAP_SYN_ACK_SENT BIT( 3) /* 
implies socket connected */ +#define ESTABLISHED BIT(2) +#define SOCK_FIN_RCVD BIT( 3) +#define SOCK_FIN_SENT BIT( 4) +#define TAP_FIN_RCVD BIT( 5) +#define TAP_FIN_SENT BIT( 6) +#define TAP_FIN_ACKED BIT( 7) + +#define CONN_STATE_BITS /* Setting these clears other flags */ \ + (SOCK_ACCEPTED | TAP_SYN_RCVD | ESTABLISHED) + + uint8_t flags; +#define CONN_STALLED BIT(0) +#define CONN_LOCAL BIT(1) +#define CONN_WND_CLAMPED BIT(2) +#define CONN_IN_EPOLL BIT(3) +#define CONN_ACTIVE_CLOSE BIT(4) + + uint16_t tap_mss; uint32_t seq_to_tap; uint32_t seq_ack_from_tap; @@ -508,9 +458,10 @@ struct tcp_tap_conn { uint16_t ws_tap; uint16_t ws; + uint32_t wnd_from_tap; uint32_t wnd_to_tap; - int window_clamped; + int snd_buf; struct timespec ts_sock_act; @@ -518,33 +469,35 @@ struct tcp_tap_conn { struct timespec ts_ack_from_tap; struct timespec ts_ack_to_tap; struct timespec tap_data_noack; +}; + +#define CONN_IS_CLOSED(conn) (conn->events == CLOSED) +#define CONN_IS_CLOSING(conn) \ + ((conn->events & ESTABLISHED) && \ + (conn->events & (SOCK_FIN_RCVD | TAP_FIN_RCVD))) +#define CONN_HAS(conn, set) ((conn->events & (set)) == (set)) - unsigned int mss_guest; +#define CONN(index) (tc + (index)) - uint32_t events; +static const char *tcp_event_str[] __attribute((__unused__)) = { + "SOCK_ACCEPTED", "TAP_SYN_RCVD", "ESTABLISHED", "TAP_SYN_ACK_SENT", + + "SOCK_FIN_RCVD", "SOCK_FIN_SENT", "TAP_FIN_RCVD", "TAP_FIN_SENT", + "TAP_FIN_ACKED", }; -/** - * struct tcp_splice_conn - Descriptor for a spliced TCP connection - * @from: File descriptor number of socket for accepted connection - * @pipe_from_to: Pipe ends for splice() from @from to @to - * @to: File descriptor number of peer connected socket - * @pipe_to_from: Pipe ends for splice() from @to to @from - * @state: TCP connection state -*/ -struct tcp_splice_conn { - int from; - int pipe_from_to[2]; - int to; - int pipe_to_from[2]; - enum tcp_state state; - int from_fin_sent; - int to_fin_sent; - int v6; - uint64_t from_read; - 
uint64_t from_written; - uint64_t to_read; - uint64_t to_written; +static const char *tcp_state_str[] __attribute((__unused__)) = { + "SYN_RCVD", "SYN_SENT", "ESTABLISHED", + "SYN_RCVD", /* approximately maps to TAP_SYN_ACK_SENT */ + + /* Passive close: */ + "CLOSE_WAIT", "CLOSE_WAIT", "LAST_ACK", "LAST_ACK", "LAST_ACK", + /* Active close (+5): */ + "CLOSING", "FIN_WAIT_1", "FIN_WAIT_1", "FIN_WAIT_2", "TIME_WAIT", +}; + +static const char *tcp_flag_str[] __attribute((__unused__)) = { + "STALLED", "LOCAL", "WND_CLAMPED", "IN_EPOLL", "ACTIVE_CLOSE", }; /* Port re-mappings as delta, indexed by original destination port */ @@ -559,26 +512,6 @@ static int tcp_sock_ns [USHRT_MAX][IP_VERSIONS]; /* Table of destinations with very low RTT (assumed to be local), LRU */ static struct in6_addr low_rtt_dst[LOW_RTT_TABLE_SIZE]; -/** - * tcp_remap_to_tap() - Set delta for port translation toward guest/tap - * @port: Original destination port, host order - * @delta: Delta to be added to original destination port - */ -void tcp_remap_to_tap(in_port_t port, in_port_t delta) -{ - tcp_port_delta_to_tap[port] = delta; -} - -/** - * tcp_remap_to_tap() - Set delta for port translation toward init namespace - * @port: Original destination port, host order - * @delta: Delta to be added to original destination port - */ -void tcp_remap_to_init(in_port_t port, in_port_t delta) -{ - tcp_port_delta_to_init[port] = delta; -} - /* Static buffers */ /** @@ -611,7 +544,7 @@ static struct tcp4_l2_buf_t { #else } __attribute__ ((packed, aligned(__alignof__(unsigned int)))) #endif -tcp4_l2_buf[TCP_TAP_FRAMES_MEM]; +tcp4_l2_buf[TCP_FRAMES_MEM]; static unsigned int tcp4_l2_buf_used; static size_t tcp4_l2_buf_bytes; @@ -642,24 +575,24 @@ struct tcp6_l2_buf_t { #else } __attribute__ ((packed, aligned(__alignof__(unsigned int)))) #endif -tcp6_l2_buf[TCP_TAP_FRAMES_MEM]; +tcp6_l2_buf[TCP_FRAMES_MEM]; static unsigned int tcp6_l2_buf_used; static size_t tcp6_l2_buf_bytes; /* recvmsg()/sendmsg() data for tap 
*/ static char tcp_buf_discard [MAX_WINDOW]; -static struct iovec iov_sock [TCP_TAP_FRAMES_MEM + 1]; +static struct iovec iov_sock [TCP_FRAMES_MEM + 1]; -static struct iovec tcp4_l2_iov_tap [TCP_TAP_FRAMES_MEM]; -static struct iovec tcp6_l2_iov_tap [TCP_TAP_FRAMES_MEM]; -static struct iovec tcp4_l2_flags_iov_tap [TCP_TAP_FRAMES_MEM]; -static struct iovec tcp6_l2_flags_iov_tap [TCP_TAP_FRAMES_MEM]; +static struct iovec tcp4_l2_iov [TCP_FRAMES_MEM]; +static struct iovec tcp6_l2_iov [TCP_FRAMES_MEM]; +static struct iovec tcp4_l2_flags_iov [TCP_FRAMES_MEM]; +static struct iovec tcp6_l2_flags_iov [TCP_FRAMES_MEM]; -static struct mmsghdr tcp_l2_mh_tap [TCP_TAP_FRAMES_MEM]; +static struct mmsghdr tcp_l2_mh [TCP_FRAMES_MEM]; /* sendmsg() to socket */ -static struct iovec tcp_tap_iov [UIO_MAXIOV]; +static struct iovec tcp_iov [UIO_MAXIOV]; /** * tcp4_l2_flags_buf_t - IPv4 packet buffers for segments without data (flags) @@ -690,9 +623,10 @@ static struct tcp4_l2_flags_buf_t { #else } __attribute__ ((packed, aligned(__alignof__(unsigned int)))) #endif -tcp4_l2_flags_buf[TCP_TAP_FRAMES_MEM]; +tcp4_l2_flags_buf[TCP_FRAMES_MEM]; -static int tcp4_l2_flags_buf_used; +static unsigned int tcp4_l2_flags_buf_used; +static size_t tcp4_l2_flags_buf_bytes; /** * tcp6_l2_flags_buf_t - IPv6 packet buffers for segments without data (flags) @@ -719,34 +653,202 @@ static struct tcp6_l2_flags_buf_t { #else } __attribute__ ((packed, aligned(__alignof__(unsigned int)))) #endif -tcp6_l2_flags_buf[TCP_TAP_FRAMES_MEM]; - -static int tcp6_l2_flags_buf_used; +tcp6_l2_flags_buf[TCP_FRAMES_MEM]; -/* SO_RCVLOWAT set on source ([0]) or destination ([1]) socket, and activity */ -static uint8_t splice_rcvlowat_set[MAX_SPLICE_CONNS / 8][2]; -static uint8_t splice_rcvlowat_act[MAX_SPLICE_CONNS / 8][2]; +static unsigned int tcp6_l2_flags_buf_used; +static size_t tcp6_l2_flags_buf_bytes; /* TCP connections */ -static struct tcp_tap_conn tt[MAX_TAP_CONNS]; -static struct tcp_splice_conn ts[MAX_SPLICE_CONNS]; 
+static struct tcp_conn tc[MAX_TAP_CONNS]; /* Table for lookup from remote address, local port, remote port */ -static struct tcp_tap_conn *tt_hash[TCP_HASH_TABLE_SIZE]; +static struct tcp_conn *tc_hash[TCP_HASH_TABLE_SIZE]; + +/* Pools for pre-opened sockets */ +int init_sock_pool4 [TCP_SOCK_POOL_SIZE]; +int init_sock_pool6 [TCP_SOCK_POOL_SIZE]; +int ns_sock_pool4 [TCP_SOCK_POOL_SIZE]; +int ns_sock_pool6 [TCP_SOCK_POOL_SIZE]; + +/** + * tcp_conn_epoll_events() - epoll events mask for given connection state + * @events: Current connection events + * @conn_flags: Connection flags + * + * Return: epoll events mask corresponding to implied connection state + */ +static uint32_t tcp_conn_epoll_events(uint8_t events, uint8_t conn_flags) +{ + if (!events) + return 0; + + if (events & ESTABLISHED) { + if (events & TAP_FIN_SENT) + return EPOLLET; + + if (conn_flags & CONN_STALLED) + return EPOLLIN | EPOLLRDHUP | EPOLLET; + + return EPOLLIN | EPOLLRDHUP; + } + + if (events == TAP_SYN_RCVD) + return EPOLLOUT | EPOLLET | EPOLLRDHUP; + + return EPOLLRDHUP; +} + +static void conn_flag_do(struct ctx *c, struct tcp_conn *conn, + unsigned long flag); +#define conn_flag(c, conn, flag) \ + do { \ + trace("TCP: flag at %s:%i", __func__, __LINE__); \ + conn_flag_do(c, conn, flag); \ + } while (0) + +/** + * tcp_epoll_ctl() - Add/modify/delete epoll state from connection events + * @c: Execution context + * @conn: Connection pointer + * + * Return: 0 on success, negative error code on failure (not on deletion) + */ +static int tcp_epoll_ctl(struct ctx *c, struct tcp_conn *conn) +{ + int m = (conn->flags & CONN_IN_EPOLL) ? 
EPOLL_CTL_MOD : EPOLL_CTL_ADD; + union epoll_ref ref = { .r.proto = IPPROTO_TCP, .r.s = conn->sock, + .r.p.tcp.tcp.index = conn - tc, + .r.p.tcp.tcp.v6 = CONN_V6(conn) }; + struct epoll_event ev = { .data.u64 = ref.u64 }; + + if (CONN_IS_CLOSED(conn)) { + epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->sock, &ev); + return 0; + } + + ev.events = tcp_conn_epoll_events(conn->events, conn->flags); + + if (epoll_ctl(c->epollfd, m, conn->sock, &ev)) + return -errno; + + conn->flags |= CONN_IN_EPOLL; /* No need to log this */ + + return 0; +} + +/** + * conn_flag_do() - Set/unset given flag, log, update epoll on CONN_STALLED + * @c: Execution context + * @conn: Connection pointer + * @flag: Flag to set, or ~flag to unset + */ +static void conn_flag_do(struct ctx *c, struct tcp_conn *conn, + unsigned long flag) +{ + if (flag & (flag - 1)) { + if (!(conn->flags & ~flag)) + return; + + conn->flags &= flag; + debug("TCP: index %i: %s dropped", (conn) - tc, + tcp_flag_str[fls(~flag)]); + } else { + if (conn->flags & flag) + return; + + conn->flags |= flag; + debug("TCP: index %i: %s", (conn) - tc, + tcp_flag_str[fls(flag)]); + } + + if (flag == CONN_STALLED || flag == ~CONN_STALLED) + tcp_epoll_ctl(c, conn); +} + +/** + * conn_event_do() - Set and log connection events, update epoll state + * @c: Execution context + * @conn: Connection pointer + * @event: Connection event + */ +static void conn_event_do(struct ctx *c, struct tcp_conn *conn, + unsigned long event) +{ + int prev, new, num = fls(event); + + if (conn->events & event) + return; + + prev = fls(conn->events); + if (conn->flags & CONN_ACTIVE_CLOSE) + prev += 5; + + if ((conn->events & ESTABLISHED) && (conn->events != ESTABLISHED)) + prev++; /* i.e. 
SOCK_FIN_RCVD, not TAP_SYN_ACK_SENT */ + + if (event == CLOSED || (event & CONN_STATE_BITS)) + conn->events = event; + else + conn->events |= event; + + if ((event == TAP_FIN_RCVD) && !(conn->events & SOCK_FIN_RCVD)) + conn_flag(c, conn, CONN_ACTIVE_CLOSE); + else + tcp_epoll_ctl(c, conn); + + new = fls(conn->events); + + if ((conn->events & ESTABLISHED) && (conn->events != ESTABLISHED)) { + num++; + new++; + } + if (conn->flags & CONN_ACTIVE_CLOSE) + new += 5; + + if (prev != new) { + debug("TCP: index %i, %s: %s -> %s", (conn) - tc, + num == -1 ? "CLOSED" : tcp_event_str[num], + prev == -1 ? "CLOSED" : tcp_state_str[prev], + (new == -1 || num == -1) ? "CLOSED" : tcp_state_str[new]); + } else { + debug("TCP: index %i, %s", (conn) - tc, + num == -1 ? "CLOSED" : tcp_event_str[num]); + } +} + +#define conn_event(c, conn, event) \ + do { \ + trace("TCP: event at %s:%i", __func__, __LINE__); \ + conn_event_do(c, conn, event); \ + } while (0) + +/** + * tcp_remap_to_tap() - Set delta for port translation toward guest/tap + * @port: Original destination port, host order + * @delta: Delta to be added to original destination port + */ +void tcp_remap_to_tap(in_port_t port, in_port_t delta) +{ + tcp_port_delta_to_tap[port] = delta; +} -/* Pools for pre-opened sockets and pipes */ -static int splice_pipe_pool [TCP_SPLICE_PIPE_POOL_SIZE][2][2]; -static int init_sock_pool4 [TCP_SOCK_POOL_SIZE]; -static int init_sock_pool6 [TCP_SOCK_POOL_SIZE]; -static int ns_sock_pool4 [TCP_SOCK_POOL_SIZE]; -static int ns_sock_pool6 [TCP_SOCK_POOL_SIZE]; +/** + * tcp_remap_to_init() - Set delta for port translation toward init namespace + * @port: Original destination port, host order + * @delta: Delta to be added to original destination port + */ +void tcp_remap_to_init(in_port_t port, in_port_t delta) +{ + tcp_port_delta_to_init[port] = delta; +} /** * tcp_rtt_dst_low() - Check if low RTT was seen for connection endpoint * @conn: Connection pointer + * * Return: 1 if destination is in low RTT 
table, 0 otherwise */ -static int tcp_rtt_dst_low(struct tcp_tap_conn *conn) +static int tcp_rtt_dst_low(struct tcp_conn *conn) { int i; @@ -762,7 +864,7 @@ static int tcp_rtt_dst_low(struct tcp_tap_conn *conn) * @conn: Connection pointer * @tinfo: Pointer to struct tcp_info for socket */ -static void tcp_rtt_dst_check(struct tcp_tap_conn *conn, struct tcp_info *tinfo) +static void tcp_rtt_dst_check(struct tcp_conn *conn, struct tcp_info *tinfo) { #ifdef HAS_MIN_RTT int i, hole = -1; @@ -788,35 +890,11 @@ static void tcp_rtt_dst_check(struct tcp_tap_conn *conn, struct tcp_info *tinfo) #endif /* HAS_MIN_RTT */ } -/** - * tcp_tap_state() - Set given TCP state for tap connection, report to stderr - * @conn: Connection pointer - * @state: New TCP state to be set - */ -static void tcp_tap_state(struct tcp_tap_conn *conn, enum tcp_state state) -{ - debug("TCP: socket %i: %s -> %s", - conn->sock, tcp_state_str[conn->state], tcp_state_str[state]); - conn->state = state; -} - -/** - * tcp_splice_state() - Set state for spliced connection, report to stderr - * @conn: Connection pointer - * @state: New TCP state to be set - */ -static void tcp_splice_state(struct tcp_splice_conn *conn, enum tcp_state state) -{ - debug("TCP: index %i: %s -> %s", - conn - ts, tcp_state_str[conn->state], tcp_state_str[state]); - conn->state = state; -} - /** * tcp_get_sndbuf() - Get, scale SO_SNDBUF between thresholds (1 to 0.5 usage) * @conn: Connection pointer */ -static void tcp_get_sndbuf(struct tcp_tap_conn *conn) +static void tcp_get_sndbuf(struct tcp_conn *conn) { int s = conn->sock, sndbuf; socklen_t sl; @@ -841,7 +919,7 @@ static void tcp_get_sndbuf(struct tcp_tap_conn *conn) * tcp_sock_set_bufsize() - Set SO_RCVBUF and SO_SNDBUF to maximum values * @s: Socket, can be -1 to avoid check in the caller */ -static void tcp_sock_set_bufsize(struct ctx *c, int s) +void tcp_sock_set_bufsize(struct ctx *c, int s) { int v = INT_MAX / 2; /* Kernel clamps and rounds, no need to check */ @@ -918,7 
+996,7 @@ void tcp_update_l2_buf(unsigned char *eth_d, unsigned char *eth_s, { int i; - for (i = 0; i < TCP_TAP_FRAMES_MEM; i++) { + for (i = 0; i < TCP_FRAMES_MEM; i++) { struct tcp4_l2_flags_buf_t *b4f = &tcp4_l2_flags_buf[i]; struct tcp6_l2_flags_buf_t *b6f = &tcp6_l2_flags_buf[i]; struct tcp4_l2_buf_t *b4 = &tcp4_l2_buf[i]; @@ -984,13 +1062,12 @@ static void tcp_sock4_iov_init(void) }; } - for (i = 0, iov = tcp4_l2_iov_tap; i < TCP_TAP_FRAMES_MEM; i++, iov++) { + for (i = 0, iov = tcp4_l2_iov; i < TCP_FRAMES_MEM; i++, iov++) { iov->iov_base = &tcp4_l2_buf[i].vnet_len; iov->iov_len = MSS_DEFAULT; } - for (i = 0, iov = tcp4_l2_flags_iov_tap; i < TCP_TAP_FRAMES_MEM; - i++, iov++) + for (i = 0, iov = tcp4_l2_flags_iov; i < TCP_FRAMES_MEM; i++, iov++) iov->iov_base = &tcp4_l2_flags_buf[i].vnet_len; } @@ -1018,13 +1095,12 @@ static void tcp_sock6_iov_init(void) }; } - for (i = 0, iov = tcp6_l2_iov_tap; i < TCP_TAP_FRAMES_MEM; i++, iov++) { + for (i = 0, iov = tcp6_l2_iov; i < TCP_FRAMES_MEM; i++, iov++) { iov->iov_base = &tcp6_l2_buf[i].vnet_len; iov->iov_len = MSS_DEFAULT; } - for (i = 0, iov = tcp6_l2_flags_iov_tap; i < TCP_TAP_FRAMES_MEM; - i++, iov++) + for (i = 0, iov = tcp6_l2_flags_iov; i < TCP_FRAMES_MEM; i++, iov++) iov->iov_base = &tcp6_l2_flags_buf[i].vnet_len; } @@ -1032,13 +1108,13 @@ static void tcp_sock6_iov_init(void) * tcp_opt_get() - Get option, and value if any, from TCP header * @th: Pointer to TCP header * @len: Length of buffer, including TCP header - * @type: Option type to look for + * @type_find: Option type to look for * @optlen_set: Optional, filled with option length if passed * @value_set: Optional, set to start of option value if passed * - * Return: Option value, meaningful for up to 4 bytes, -1 if not found + * Return: option value, meaningful for up to 4 bytes, -1 if not found */ -static int tcp_opt_get(struct tcphdr *th, size_t len, uint8_t type_search, +static int tcp_opt_get(struct tcphdr *th, size_t len, uint8_t type_find, uint8_t 
*optlen_set, char **value_set) { uint8_t type, optlen; @@ -1062,7 +1138,7 @@ static int tcp_opt_get(struct tcphdr *th, size_t len, uint8_t type_search, optlen = *(p++) - 2; len -= 2; - if (type != type_search) + if (type != type_find) break; if (optlen_set) @@ -1096,7 +1172,7 @@ static int tcp_opt_get(struct tcphdr *th, size_t len, uint8_t type_search, * * Return: 1 on match, 0 otherwise */ -static int tcp_hash_match(struct tcp_tap_conn *conn, int af, void *addr, +static int tcp_hash_match(struct tcp_conn *conn, int af, void *addr, in_port_t tap_port, in_port_t sock_port) { if (af == AF_INET && CONN_V4(conn) && @@ -1136,9 +1212,7 @@ static unsigned int tcp_hash(struct ctx *c, int af, void *addr, in_port_t tap_port; in_port_t sock_port; } __attribute__((__packed__)) in = { - .addr = *(struct in_addr *)addr, - .tap_port = tap_port, - .sock_port = sock_port, + *(struct in_addr *)addr, tap_port, sock_port, }; b = siphash_8b((uint8_t *)&in, c->tcp.hash_secret); @@ -1148,9 +1222,7 @@ static unsigned int tcp_hash(struct ctx *c, int af, void *addr, in_port_t tap_port; in_port_t sock_port; } __attribute__((__packed__)) in = { - .addr = *(struct in6_addr *)addr, - .tap_port = tap_port, - .sock_port = sock_port, + *(struct in6_addr *)addr, tap_port, sock_port, }; b = siphash_20b((uint8_t *)&in, c->tcp.hash_secret); @@ -1166,41 +1238,41 @@ static unsigned int tcp_hash(struct ctx *c, int af, void *addr, * @af: Address family, AF_INET or AF_INET6 * @addr: Remote address, pointer to sin_addr or sin6_addr */ -static void tcp_hash_insert(struct ctx *c, struct tcp_tap_conn *conn, +static void tcp_hash_insert(struct ctx *c, struct tcp_conn *conn, int af, void *addr) { int b; b = tcp_hash(c, af, addr, conn->tap_port, conn->sock_port); - conn->next = tt_hash[b]; - tt_hash[b] = conn; + conn->next = tc_hash[b]; + tc_hash[b] = conn; conn->hash_bucket = b; debug("TCP: hash table insert: index %i, sock %i, bucket: %i, next: %p", - conn - tt, conn->sock, b, conn->next); + conn - tc, 
conn->sock, b, conn->next); } /** * tcp_hash_remove() - Drop connection from hash table, chain unlink * @conn: Connection pointer */ -static void tcp_hash_remove(struct tcp_tap_conn *conn) +static void tcp_hash_remove(struct tcp_conn *conn) { - struct tcp_tap_conn *entry, *prev = NULL; + struct tcp_conn *entry, *prev = NULL; int b = conn->hash_bucket; - for (entry = tt_hash[b]; entry; prev = entry, entry = entry->next) { + for (entry = tc_hash[b]; entry; prev = entry, entry = entry->next) { if (entry == conn) { if (prev) prev->next = conn->next; else - tt_hash[b] = conn->next; + tc_hash[b] = conn->next; break; } } debug("TCP: hash table remove: index %i, sock %i, bucket: %i, new: %p", - conn - tt, conn->sock, b, prev ? prev->next : tt_hash[b]); + conn - tc, conn->sock, b, prev ? prev->next : tc_hash[b]); } /** @@ -1208,24 +1280,24 @@ static void tcp_hash_remove(struct tcp_tap_conn *conn) * @old: Old connection pointer * @new: New connection pointer */ -static void tcp_hash_update(struct tcp_tap_conn *old, struct tcp_tap_conn *new) +static void tcp_hash_update(struct tcp_conn *old, struct tcp_conn *new) { - struct tcp_tap_conn *entry, *prev = NULL; + struct tcp_conn *entry, *prev = NULL; int b = old->hash_bucket; - for (entry = tt_hash[b]; entry; prev = entry, entry = entry->next) { + for (entry = tc_hash[b]; entry; prev = entry, entry = entry->next) { if (entry == old) { if (prev) prev->next = new; else - tt_hash[b] = new; + tc_hash[b] = new; break; } } debug("TCP: hash table update: old index %i, new index %i, sock %i, " "bucket: %i, old: %p, new: %p", - old - tt, new - tt, new->sock, b, old, new); + old - tc, new - tc, new->sock, b, old, new); } /** @@ -1238,14 +1310,13 @@ static void tcp_hash_update(struct tcp_tap_conn *old, struct tcp_tap_conn *new) * * Return: connection pointer, if found, -ENOENT otherwise */ -static struct tcp_tap_conn *tcp_hash_lookup(struct ctx *c, int af, void *addr, - in_port_t tap_port, - in_port_t sock_port) +static struct tcp_conn 
*tcp_hash_lookup(struct ctx *c, int af, void *addr, + in_port_t tap_port, in_port_t sock_port) { int b = tcp_hash(c, af, addr, tap_port, sock_port); - struct tcp_tap_conn *conn; + struct tcp_conn *conn; - for (conn = tt_hash[b]; conn; conn = conn->next) { + for (conn = tc_hash[b]; conn; conn = conn->next) { if (tcp_hash_match(conn, af, addr, tap_port, sock_port)) return conn; } @@ -1254,70 +1325,46 @@ static struct tcp_tap_conn *tcp_hash_lookup(struct ctx *c, int af, void *addr, } /** - * tcp_tap_epoll_mask() - Set new epoll event mask given a connection - * @c: Execution context - * @conn: Connection pointer - * @events: New epoll event bitmap - */ -static void tcp_tap_epoll_mask(struct ctx *c, struct tcp_tap_conn *conn, - uint32_t events) -{ - union epoll_ref ref = { .r.proto = IPPROTO_TCP, .r.s = conn->sock, - .r.p.tcp.tcp.index = conn - tt, - .r.p.tcp.tcp.v6 = CONN_V6(conn) }; - struct epoll_event ev = { .data.u64 = ref.u64, .events = events }; - - if (conn->events == events) - return; - - conn->events = events; - epoll_ctl(c->epollfd, EPOLL_CTL_MOD, conn->sock, &ev); -} - -/** - * tcp_table_tap_compact() - Perform compaction on tap connection table + * tcp_table_compact() - Perform compaction on connection table * @c: Execution context * @hole: Pointer to recently closed connection */ -static void tcp_table_tap_compact(struct ctx *c, struct tcp_tap_conn *hole) +static void tcp_table_compact(struct ctx *c, struct tcp_conn *hole) { - struct tcp_tap_conn *from, *to; - uint32_t events; + struct tcp_conn *from, *to; - if ((hole - tt) == --c->tcp.tap_conn_count) { + if ((hole - tc) == --c->tcp.conn_count) { debug("TCP: hash table compaction: index %i (%p) was max index", - hole - tt, hole); + hole - tc, hole); return; } - from = &tt[c->tcp.tap_conn_count]; + from = CONN(c->tcp.conn_count); memcpy(hole, from, sizeof(*hole)); - from->state = CLOSED; + from->flags = from->events = 0; to = hole; tcp_hash_update(from, to); - events = hole->events; - hole->events = 
UINT_MAX; - tcp_tap_epoll_mask(c, hole, events); + tcp_epoll_ctl(c, to); debug("TCP: hash table compaction: old index %i, new index %i, " "sock %i, from: %p, to: %p", - from - tt, to - tt, from->sock, from, to); + from - tc, to - tc, from->sock, from, to); } /** - * tcp_tap_destroy() - Close tap connection, drop from hash table and epoll + * tcp_conn_destroy() - Close connection, drop from epoll file descriptor * @c: Execution context * @conn: Connection pointer */ -static void tcp_tap_destroy(struct ctx *c, struct tcp_tap_conn *conn) +static void tcp_conn_destroy(struct ctx *c, struct tcp_conn *conn) { - if (conn->state == CLOSED) + if (CONN_IS_CLOSED(conn)) return; - epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->sock, NULL); - tcp_tap_state(conn, CLOSED); + conn_event(c, conn, CLOSED); + conn->flags = 0; close(conn->sock); /* Removal from hash table and connection table compaction deferred to @@ -1325,50 +1372,33 @@ static void tcp_tap_destroy(struct ctx *c, struct tcp_tap_conn *conn) */ } -static void tcp_rst(struct ctx *c, struct tcp_tap_conn *conn); +static void tcp_rst_do(struct ctx *c, struct tcp_conn *conn); +#define tcp_rst(c, conn) \ + do { \ + debug("TCP: index %i, reset at %s:%i", conn - tc, \ + __func__, __LINE__); \ + tcp_rst_do(c, conn); \ + } while (0) /** - * tcp_l2_flags_buf_flush() - Send out buffers for segments with no data (flags) + * tcp_l2_buf_write_one() - Write a single buffer to tap file descriptor * @c: Execution context + * @iov: struct iovec item pointing to buffer + * @ts: Current timestamp + * + * Return: 0 on success, negative error code on failure (tap reset possible) */ -static void tcp_l2_flags_buf_flush(struct ctx *c) +static int tcp_l2_buf_write_one(struct ctx *c, struct iovec *iov, + struct timespec *ts) { - struct msghdr mh = { 0 }; - size_t i; - - mh.msg_iov = tcp6_l2_flags_iov_tap; - if ((mh.msg_iovlen = tcp6_l2_flags_buf_used)) { - if (c->mode == MODE_PASST) { - sendmsg(c->fd_tap, &mh, MSG_NOSIGNAL | MSG_DONTWAIT); - } else { 
- for (i = 0; i < mh.msg_iovlen; i++) { - struct iovec *iov = &mh.msg_iov[i]; - - if (write(c->fd_tap, (char *)iov->iov_base + 4, - iov->iov_len - 4) < 0) - debug("tap write: %s", strerror(errno)); - } - } - tcp6_l2_flags_buf_used = 0; - pcapm(&mh); + if (write(c->fd_tap, (char *)iov->iov_base + 4, iov->iov_len - 4) < 0) { + debug("tap write: %s", strerror(errno)); + if (errno != EAGAIN && errno != EWOULDBLOCK) + tap_handler(c, c->fd_tap, EPOLLERR, ts); + return -errno; } - mh.msg_iov = tcp4_l2_flags_iov_tap; - if ((mh.msg_iovlen = tcp4_l2_flags_buf_used)) { - if (c->mode == MODE_PASST) { - sendmsg(c->fd_tap, &mh, MSG_NOSIGNAL | MSG_DONTWAIT); - } else { - for (i = 0; i < mh.msg_iovlen; i++) { - struct iovec *iov = &mh.msg_iov[i]; - - if (write(c->fd_tap, (char *)iov->iov_base + 4, - iov->iov_len - 4) < 0) - debug("tap write: %s", strerror(errno)); - } - } - tcp4_l2_flags_buf_used = 0; - pcapm(&mh); - } + return 0; } /** @@ -1396,65 +1426,91 @@ static void tcp_l2_buf_flush_part(struct ctx *c, struct msghdr *mh, size_t sent) } /** - * tcp_l2_flags_buf() - Send out buffers for segments with data + * tcp_l2_flags_buf_flush() - Send out buffers for segments with or without data * @c: Execution context - */ -static void tcp_l2_buf_flush(struct ctx *c) + * @mh: Message header pointing to buffers, msg_iovlen not set + * @buf_used: Pointer to count of used buffers, set to 0 on return + * @buf_bytes: Pointer to count of buffer bytes, set to 0 on return + * @ts: Current timestamp + */ +static void tcp_l2_buf_flush(struct ctx *c, struct msghdr *mh, + unsigned int *buf_used, size_t *buf_bytes, + struct timespec *ts) { - struct msghdr mh = { 0 }; - size_t i, n; - - mh.msg_iov = tcp6_l2_iov_tap; - if (!(mh.msg_iovlen = tcp6_l2_buf_used)) - goto v4; + if (!(mh->msg_iovlen = *buf_used)) + return; if (c->mode == MODE_PASST) { - n = sendmsg(c->fd_tap, &mh, MSG_NOSIGNAL | MSG_DONTWAIT); - if (n > 0 && n < tcp6_l2_buf_bytes) - tcp_l2_buf_flush_part(c, &mh, n); + size_t n = 
sendmsg(c->fd_tap, mh, MSG_NOSIGNAL | MSG_DONTWAIT); + if (n > 0 && n < *buf_bytes) + tcp_l2_buf_flush_part(c, mh, n); } else { - for (i = 0; i < mh.msg_iovlen; i++) { - struct iovec *iov = &mh.msg_iov[i]; + size_t i; - if (write(c->fd_tap, (char *)iov->iov_base + 4, - iov->iov_len - 4) < 0) - debug("tap write: %s", strerror(errno)); + for (i = 0; i < mh->msg_iovlen; i++) { + struct iovec *iov = &mh->msg_iov[i]; + + if (tcp_l2_buf_write_one(c, iov, ts)) + i--; } } - tcp6_l2_buf_used = tcp6_l2_buf_bytes = 0; - pcapm(&mh); - -v4: - mh.msg_iov = tcp4_l2_iov_tap; - if (!(mh.msg_iovlen = tcp4_l2_buf_used)) - return; - - if (c->mode == MODE_PASST) { - n = sendmsg(c->fd_tap, &mh, MSG_NOSIGNAL | MSG_DONTWAIT); + *buf_used = *buf_bytes = 0; + pcapm(mh); +} - if (n > 0 && n < tcp4_l2_buf_bytes) - tcp_l2_buf_flush_part(c, &mh, n); - } else { - for (i = 0; i < mh.msg_iovlen; i++) { - struct iovec *iov = &mh.msg_iov[i]; +/** + * tcp_l2_flags_buf_flush() - Send out buffers for segments with no data (flags) + * @c: Execution context + * @ts: Current timestamp (not packet timestamp) + */ +static void tcp_l2_flags_buf_flush(struct ctx *c, struct timespec *ts) +{ + struct msghdr mh = { 0 }; + unsigned int *buf_used; + size_t *buf_bytes; + + mh.msg_iov = tcp6_l2_flags_iov; + buf_used = &tcp6_l2_flags_buf_used; + buf_bytes = &tcp6_l2_flags_buf_bytes; + tcp_l2_buf_flush(c, &mh, buf_used, buf_bytes, ts); + + mh.msg_iov = tcp4_l2_flags_iov; + buf_used = &tcp4_l2_flags_buf_used; + buf_bytes = &tcp4_l2_flags_buf_bytes; + tcp_l2_buf_flush(c, &mh, buf_used, buf_bytes, ts); +} - if (write(c->fd_tap, (char *)iov->iov_base + 4, - iov->iov_len - 4) < 0) - debug("tap write: %s", strerror(errno)); - } - } - tcp4_l2_buf_used = tcp4_l2_buf_bytes = 0; - pcapm(&mh); +/** + * tcp_l2_data_buf_flush() - Send out buffers for segments with data + * @c: Execution context + * @ts: Current timestamp (not packet timestamp) + */ +static void tcp_l2_data_buf_flush(struct ctx *c, struct timespec *ts) +{ + struct 
msghdr mh = { 0 }; + unsigned int *buf_used; + size_t *buf_bytes; + + mh.msg_iov = tcp6_l2_iov; + buf_used = &tcp6_l2_buf_used; + buf_bytes = &tcp6_l2_buf_bytes; + tcp_l2_buf_flush(c, &mh, buf_used, buf_bytes, ts); + + mh.msg_iov = tcp4_l2_iov; + buf_used = &tcp4_l2_buf_used; + buf_bytes = &tcp4_l2_buf_bytes; + tcp_l2_buf_flush(c, &mh, buf_used, buf_bytes, ts); } /** * tcp_defer_handler() - Handler for TCP deferred tasks * @c: Execution context + * @now: Current timestamp */ -void tcp_defer_handler(struct ctx *c) +void tcp_defer_handler(struct ctx *c, struct timespec *now) { - tcp_l2_flags_buf_flush(c); - tcp_l2_buf_flush(c); + tcp_l2_flags_buf_flush(c, now); + tcp_l2_data_buf_flush(c, now); } /** @@ -1466,9 +1522,9 @@ void tcp_defer_handler(struct ctx *c) * @check: Checksum, if already known * @seq: Sequence number for this segment * - * Return: 802.3 length, host order. + * Return: 802.3 length, host order */ -static size_t tcp_l2_buf_fill_headers(struct ctx *c, struct tcp_tap_conn *conn, +static size_t tcp_l2_buf_fill_headers(struct ctx *c, struct tcp_conn *conn, void *p, size_t plen, const uint16_t *check, uint32_t seq) { @@ -1549,13 +1605,13 @@ static size_t tcp_l2_buf_fill_headers(struct ctx *c, struct tcp_tap_conn *conn, * tcp_update_seqack_wnd() - Update ACK sequence and window to guest/tap * @c: Execution context * @conn: Connection pointer - * @flags: TCP header flags we are about to send, if any + * @force_seq: Force ACK sequence to latest segment, instead of checking socket * @tinfo: tcp_info from kernel, can be NULL if not pre-fetched * * Return: 1 if sequence or window were updated, 0 otherwise */ -static int tcp_update_seqack_wnd(struct ctx *c, struct tcp_tap_conn *conn, - int flags, struct tcp_info *tinfo) +static int tcp_update_seqack_wnd(struct ctx *c, struct tcp_conn *conn, + int force_seq, struct tcp_info *tinfo) { uint32_t prev_ack_to_tap = conn->seq_ack_to_tap; uint32_t prev_wnd_to_tap = conn->wnd_to_tap; @@ -1564,15 +1620,14 @@ static int 
tcp_update_seqack_wnd(struct ctx *c, struct tcp_tap_conn *conn, int s = conn->sock; #ifndef HAS_BYTES_ACKED - (void)flags; + (void)force_seq; conn->seq_ack_to_tap = conn->seq_from_tap; if (SEQ_LT(conn->seq_ack_to_tap, prev_ack_to_tap)) conn->seq_ack_to_tap = prev_ack_to_tap; #else - if (conn->state > ESTABLISHED || (flags & (DUP_ACK | FORCE_ACK)) || - conn->local || tcp_rtt_dst_low(conn) || - (unsigned long)conn->snd_buf < SNDBUF_SMALL) { + if ((unsigned long)conn->snd_buf < SNDBUF_SMALL || tcp_rtt_dst_low(conn) + || CONN_IS_CLOSING(conn) || conn->flags & CONN_LOCAL || force_seq) { conn->seq_ack_to_tap = conn->seq_from_tap; } else if (conn->seq_ack_to_tap != conn->seq_from_tap) { if (!tinfo) { @@ -1605,7 +1660,7 @@ static int tcp_update_seqack_wnd(struct ctx *c, struct tcp_tap_conn *conn, } #ifdef HAS_SND_WND - if (conn->local || tcp_rtt_dst_low(conn)) { + if ((conn->flags & CONN_LOCAL) || tcp_rtt_dst_low(conn)) { conn->wnd_to_tap = tinfo->tcpi_snd_wnd; } else { tcp_get_sndbuf(conn); @@ -1621,16 +1676,16 @@ out: } /** - * tcp_send_to_tap() - Send segment to tap, with options and values from socket + * tcp_send_flag() - Send segment with flags to tap (no payload) * @c: Execution context * @conn: Connection pointer - * @flags: TCP flags to set - * @now: Current timestamp, can be NULL + * @flags: TCP flags: if not set, send segment only if ACK is due + * @now: Current timestamp * * Return: negative error code on connection reset, 0 otherwise */ -static int tcp_send_to_tap(struct ctx *c, struct tcp_tap_conn *conn, int flags, - struct timespec *now) +static int tcp_send_flag(struct ctx *c, struct tcp_conn *conn, int flags, + struct timespec *now) { uint32_t prev_ack_to_tap = conn->seq_ack_to_tap; uint32_t prev_wnd_to_tap = conn->wnd_to_tap; @@ -1650,26 +1705,26 @@ static int tcp_send_to_tap(struct ctx *c, struct tcp_tap_conn *conn, int flags, return 0; if (getsockopt(s, SOL_TCP, TCP_INFO, &tinfo, &sl)) { - tcp_tap_destroy(c, conn); + tcp_conn_destroy(c, conn); return 
-ECONNRESET; } - if (!conn->local) + if (!(conn->flags & CONN_LOCAL)) tcp_rtt_dst_check(conn, &tinfo); if (!tcp_update_seqack_wnd(c, conn, flags, &tinfo) && !flags) return 0; if (CONN_V4(conn)) { - iov = tcp4_l2_flags_iov_tap + tcp4_l2_flags_buf_used; - p = b4 = tcp4_l2_flags_buf + tcp4_l2_flags_buf_used++; + iov = tcp4_l2_flags_iov + tcp4_l2_flags_buf_used; + p = b4 = tcp4_l2_flags_buf + tcp4_l2_flags_buf_used++; th = &b4->th; /* gcc 11.2 would complain on data = (char *)(th + 1); */ data = b4->opts; } else { - iov = tcp6_l2_flags_iov_tap + tcp6_l2_flags_buf_used; - p = b6 = tcp6_l2_flags_buf + tcp6_l2_flags_buf_used++; + iov = tcp6_l2_flags_iov + tcp6_l2_flags_buf_used; + p = b6 = tcp6_l2_flags_buf + tcp6_l2_flags_buf_used++; th = &b6->th; data = b6->opts; } @@ -1693,7 +1748,8 @@ static int tcp_send_to_tap(struct ctx *c, struct tcp_tap_conn *conn, int flags, mss -= sizeof(struct ipv6hdr); if (c->low_wmem && - !conn->local && !tcp_rtt_dst_low(conn)) + !(conn->flags & CONN_LOCAL) && + !tcp_rtt_dst_low(conn)) mss = MIN(mss, PAGE_SIZE); else if (mss > PAGE_SIZE) mss = ROUND_DOWN(mss, PAGE_SIZE); @@ -1719,7 +1775,7 @@ static int tcp_send_to_tap(struct ctx *c, struct tcp_tap_conn *conn, int flags, conn->wnd_to_tap = WINDOW_DEFAULT; } else { - th->ack = !!(flags & (ACK | FORCE_ACK | DUP_ACK)) || + th->ack = !!(flags & (ACK | DUP_ACK)) || conn->seq_ack_to_tap != prev_ack_to_tap || !prev_wnd_to_tap; } @@ -1734,6 +1790,11 @@ static int tcp_send_to_tap(struct ctx *c, struct tcp_tap_conn *conn, int flags, NULL, conn->seq_to_tap); iov->iov_len = eth_len + sizeof(uint32_t); + if (CONN_V4(conn)) + tcp4_l2_flags_buf_bytes += iov->iov_len; + else + tcp6_l2_flags_buf_bytes += iov->iov_len; + if (th->ack && now) conn->ts_ack_to_tap = *now; @@ -1749,35 +1810,38 @@ static int tcp_send_to_tap(struct ctx *c, struct tcp_tap_conn *conn, int flags, memcpy(b4 + 1, b4, sizeof(*b4)); (iov + 1)->iov_len = iov->iov_len; tcp4_l2_flags_buf_used++; + tcp4_l2_flags_buf_bytes += iov->iov_len; } if 
(tcp4_l2_flags_buf_used > ARRAY_SIZE(tcp4_l2_flags_buf) - 2) - tcp_l2_flags_buf_flush(c); + tcp_l2_flags_buf_flush(c, now); } else { if (flags & DUP_ACK) { memcpy(b6 + 1, b6, sizeof(*b6)); (iov + 1)->iov_len = iov->iov_len; tcp6_l2_flags_buf_used++; + tcp6_l2_flags_buf_bytes += iov->iov_len; } + if (tcp6_l2_flags_buf_used > ARRAY_SIZE(tcp6_l2_flags_buf) - 2) - tcp_l2_flags_buf_flush(c); + tcp_l2_flags_buf_flush(c, now); } return 0; } /** - * tcp_rst() - Reset a tap connection: send RST segment to tap, close socket + * tcp_rst_do() - Reset a tap connection: send RST segment to tap, close socket * @c: Execution context * @conn: Connection pointer */ -static void tcp_rst(struct ctx *c, struct tcp_tap_conn *conn) +static void tcp_rst_do(struct ctx *c, struct tcp_conn *conn) { - if (conn->state == CLOSED) + if (CONN_IS_CLOSED(conn)) return; - tcp_send_to_tap(c, conn, RST, NULL); - tcp_tap_destroy(c, conn); + if (!tcp_send_flag(c, conn, RST, NULL)) + tcp_conn_destroy(c, conn); } /** @@ -1788,8 +1852,9 @@ static void tcp_rst(struct ctx *c, struct tcp_tap_conn *conn) * @window: Window value, host order, unscaled, if no header is passed * @init: Set if this is the very first segment from tap */ -static void tcp_clamp_window(struct tcp_tap_conn *conn, struct tcphdr *th, - int len, unsigned int window, int init) +static void tcp_clamp_window(struct ctx *c, struct tcp_conn *conn, + struct tcphdr *th, int len, unsigned int window, + int init) { if (init && th) { int ws = tcp_opt_get(th, len, OPT_WS, NULL, NULL); @@ -1801,7 +1866,6 @@ static void tcp_clamp_window(struct tcp_tap_conn *conn, struct tcphdr *th, * small window now. 
*/ conn->wnd_from_tap = ntohs(th->window); - conn->window_clamped = 0; } else { if (th) window = ntohs(th->window) << conn->ws_tap; @@ -1810,7 +1874,7 @@ static void tcp_clamp_window(struct tcp_tap_conn *conn, struct tcphdr *th, window = MIN(MAX_WINDOW, window); - if (conn->window_clamped) { + if (conn->flags & CONN_WND_CLAMPED) { if (conn->wnd_from_tap == window) return; @@ -1829,7 +1893,7 @@ static void tcp_clamp_window(struct tcp_tap_conn *conn, struct tcphdr *th, window = 256; setsockopt(conn->sock, SOL_TCP, TCP_WINDOW_CLAMP, &window, sizeof(window)); - conn->window_clamped = 1; + conn_flag(c, conn, CONN_WND_CLAMPED); } } @@ -1887,70 +1951,111 @@ static uint32_t tcp_seq_init(struct ctx *c, int af, void *addr, } /** - * tcp_conn_from_tap() - Handle connection request (SYN segment) from tap + * tcp_conn_new_sock() - Get socket for new connection from pool or make new one * @c: Execution context - * @af: Address family, AF_INET or AF_INET6 - * @addr: Remote address, pointer to sin_addr or sin6_addr - * @th: TCP header from tap - * @len: Packet length at L4 - * @now: Current timestamp + * @af: Address family + * + * Return: socket number if available, negative code if socket creation failed */ -static void tcp_conn_from_tap(struct ctx *c, int af, void *addr, - struct tcphdr *th, size_t len, - struct timespec *now) +static int tcp_conn_new_sock(struct ctx *c, sa_family_t af) { - union epoll_ref ref = { .r.proto = IPPROTO_TCP }; - struct sockaddr_in addr4 = { - .sin_family = AF_INET, - .sin_port = th->dest, - .sin_addr = *(struct in_addr *)addr, - }; - struct sockaddr_in6 addr6 = { - .sin6_family = AF_INET6, - .sin6_port = th->dest, - .sin6_addr = *(struct in6_addr *)addr, - }; - int i, s, *sock_pool_p, mss; - const struct sockaddr *sa; - struct tcp_tap_conn *conn; - struct epoll_event ev; - socklen_t sl; + int *pool = af == AF_INET6 ? 
init_sock_pool6 : init_sock_pool4, i, s; - if (c->tcp.tap_conn_count >= MAX_TAP_CONNS) - return; - - for (i = 0; i < TCP_SOCK_POOL_SIZE; i++) { - if (af == AF_INET6) - sock_pool_p = &init_sock_pool6[i]; - else - sock_pool_p = &init_sock_pool4[i]; - if ((ref.r.s = s = (*sock_pool_p)) >= 0) { - *sock_pool_p = -1; + for (i = 0; i < TCP_SOCK_POOL_SIZE; i++, pool++) { + if ((s = *pool) >= 0) { + *pool = -1; break; } } - if (s < 0) { + if (s < 0) s = socket(af, SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP); - ref.r.s = s; - } if (s < 0) - return; + return -errno; tcp_sock_set_bufsize(c, s); - if (af == AF_INET && addr4.sin_addr.s_addr == c->gw4 && !c->no_map_gw) - addr4.sin_addr.s_addr = htonl(INADDR_LOOPBACK); - else if (af == AF_INET6 && !memcmp(addr, &c->gw6, sizeof(c->gw6)) && - !c->no_map_gw) - addr6.sin6_addr = in6addr_loopback; + return s; +} - if (af == AF_INET6 && IN6_IS_ADDR_LINKLOCAL(&addr6.sin6_addr)) { - struct sockaddr_in6 addr6_ll = { - .sin6_family = AF_INET6, - .sin6_addr = c->addr6_ll, - .sin6_scope_id = c->ifi, +/** + * tcp_conn_tap_mss() - Get and clamp MSS value advertised by tap/guest + * @c: Execution context + * @conn: Connection pointer + * @th: TCP header sent by tap/guest + * @len: L4 packet length, host order + * + * Return: clamped MSS value + */ +static uint16_t tcp_conn_tap_mss(struct ctx *c, struct tcp_conn *conn, + struct tcphdr *th, size_t len) +{ + unsigned int mss; + int ret; + + if ((ret = tcp_opt_get(th, len, OPT_MSS, NULL, NULL)) < 0) + mss = MSS_DEFAULT; + else + mss = ret; + + /* Don't upset qemu */ + if (c->mode == MODE_PASST) { + if (CONN_V4(conn)) + mss = MIN(MSS4, mss); + else + mss = MIN(MSS6, mss); + } + + return MIN(mss, USHRT_MAX); +} + +/** + * tcp_conn_from_tap() - Handle connection request (SYN segment) from tap + * @c: Execution context + * @af: Address family, AF_INET or AF_INET6 + * @addr: Remote address, pointer to sin_addr or sin6_addr + * @th: TCP header from tap + * @len: Packet length at L4 + * @now: Current
timestamp + */ +static void tcp_conn_from_tap(struct ctx *c, int af, void *addr, + struct tcphdr *th, size_t len, + struct timespec *now) +{ + struct sockaddr_in addr4 = { + .sin_family = AF_INET, + .sin_port = th->dest, + .sin_addr = *(struct in_addr *)addr, + }; + struct sockaddr_in6 addr6 = { + .sin6_family = AF_INET6, + .sin6_port = th->dest, + .sin6_addr = *(struct in6_addr *)addr, + }; + const struct sockaddr *sa; + struct tcp_conn *conn; + socklen_t sl; + int s; + + if (c->tcp.conn_count >= TCP_MAX_CONNS) + return; + + if ((s = tcp_conn_new_sock(c, af)) < 0) + return; + + if (!c->no_map_gw) { + if (af == AF_INET && addr4.sin_addr.s_addr == c->gw4) + addr4.sin_addr.s_addr = htonl(INADDR_LOOPBACK); + if (af == AF_INET6 && !memcmp(addr, &c->gw6, sizeof(c->gw6))) + addr6.sin6_addr = in6addr_loopback; + } + + if (af == AF_INET6 && IN6_IS_ADDR_LINKLOCAL(&addr6.sin6_addr)) { + struct sockaddr_in6 addr6_ll = { + .sin6_family = AF_INET6, + .sin6_addr = c->addr6_ll, + .sin6_scope_id = c->ifi, }; if (bind(s, (struct sockaddr *)&addr6_ll, sizeof(addr6_ll))) { close(s); @@ -1958,29 +2063,18 @@ static void tcp_conn_from_tap(struct ctx *c, int af, void *addr, } } - conn = &tt[c->tcp.tap_conn_count++]; + conn = CONN(c->tcp.conn_count++); conn->sock = s; - conn->events = 0; + conn_event(c, conn, TAP_SYN_RCVD); conn->wnd_to_tap = WINDOW_DEFAULT; - if ((mss = tcp_opt_get(th, len, OPT_MSS, NULL, NULL)) < 0) - conn->mss_guest = MSS_DEFAULT; - else - conn->mss_guest = mss; - - /* Don't upset qemu */ - if (c->mode == MODE_PASST) { - if (af == AF_INET) - conn->mss_guest = MIN(MSS4, conn->mss_guest); - else - conn->mss_guest = MIN(MSS6, conn->mss_guest); - } + conn->tap_mss = tcp_conn_tap_mss(c, conn, th, len); - sl = sizeof(conn->mss_guest); - setsockopt(s, SOL_TCP, TCP_MAXSEG, &conn->mss_guest, sl); + sl = sizeof(conn->tap_mss); + setsockopt(s, SOL_TCP, TCP_MAXSEG, &conn->tap_mss, sl); - tcp_clamp_window(conn, th, len, 0, 1); + tcp_clamp_window(c, conn, th, len, 0, 1); if (af == 
AF_INET) { sa = (struct sockaddr *)&addr4; @@ -2015,162 +2109,86 @@ static void tcp_conn_from_tap(struct ctx *c, int af, void *addr, if (!bind(s, sa, sl)) tcp_rst(c, conn); /* Nobody is listening then */ if (errno != EADDRNOTAVAIL) - conn->local = 1; + conn_flag(c, conn, CONN_LOCAL); if (connect(s, sa, sl)) { - tcp_tap_state(conn, TAP_SYN_SENT); - if (errno != EINPROGRESS) { tcp_rst(c, conn); return; } - ev.events = EPOLLOUT | EPOLLRDHUP; - tcp_get_sndbuf(conn); } else { - tcp_tap_state(conn, TAP_SYN_RCVD); - tcp_get_sndbuf(conn); - if (tcp_send_to_tap(c, conn, SYN | ACK, now)) + if (tcp_send_flag(c, conn, SYN | ACK, now)) return; - ev.events = EPOLLIN | EPOLLRDHUP; + conn_event(c, conn, TAP_SYN_ACK_SENT); } - conn->events = ev.events; - ref.r.p.tcp.tcp.index = conn - tt; - ev.data.u64 = ref.u64; - epoll_ctl(c->epollfd, EPOLL_CTL_ADD, s, &ev); + tcp_epoll_ctl(c, conn); } /** - * tcp_table_splice_compact - Compact spliced connection table - * @c: Execution context - * @hole: Pointer to recently closed connection + * tcp_sock_consume() - Consume (discard) data from buffer, update ACK sequence + * @conn: Connection pointer + * @ack_seq: ACK sequence, host order + * + * Return: 0 on success, negative error code from recv() on failure */ -static void tcp_table_splice_compact(struct ctx *c, - struct tcp_splice_conn *hole) +static int tcp_sock_consume(struct tcp_conn *conn, uint32_t ack_seq) { - union epoll_ref ref_from = { .r.proto = IPPROTO_TCP, - .r.p.tcp.tcp.splice = 1, - .r.p.tcp.tcp.index = hole - ts }; - union epoll_ref ref_to = { .r.proto = IPPROTO_TCP, - .r.p.tcp.tcp.splice = 1, - .r.p.tcp.tcp.index = hole - ts }; - struct tcp_splice_conn *move; - struct epoll_event ev_from; - struct epoll_event ev_to; - - hole->from_fin_sent = hole->to_fin_sent = 0; - hole->from_read = hole->from_written = 0; - hole->to_read = hole->to_written = 0; - - bitmap_clear(splice_rcvlowat_set[0], hole - ts); - bitmap_clear(splice_rcvlowat_set[1], hole - ts); - 
bitmap_clear(splice_rcvlowat_act[0], hole - ts); - bitmap_clear(splice_rcvlowat_act[1], hole - ts); - - if ((hole - ts) == --c->tcp.splice_conn_count) - return; - - move = &ts[c->tcp.splice_conn_count]; - if (move->state == CLOSED) - return; - - memcpy(hole, move, sizeof(*hole)); - move->state = CLOSED; - move = hole; - - ref_from.r.s = move->from; - ref_from.r.p.tcp.tcp.v6 = move->v6; - ref_to.r.s = move->to; - ref_to.r.p.tcp.tcp.v6 = move->v6; - - if (move->state == SPLICE_ACCEPTED) { - ev_from.events = ev_to.events = 0; - } else if (move->state == SPLICE_CONNECT) { - ev_from.events = 0; - ev_to.events = EPOLLOUT; - } else { - ev_from.events = EPOLLIN | EPOLLOUT | EPOLLRDHUP; - ev_to.events = EPOLLIN | EPOLLOUT | EPOLLRDHUP; - } + /* Simply ignore out-of-order ACKs: we already consumed the data we + * needed from the buffer, and we won't rewind back to a lower ACK + * sequence. + */ + if (SEQ_LE(ack_seq, conn->seq_ack_from_tap)) + return 0; - ev_from.data.u64 = ref_from.u64; - ev_to.data.u64 = ref_to.u64; + if (recv(conn->sock, NULL, ack_seq - conn->seq_ack_from_tap, + MSG_DONTWAIT | MSG_TRUNC) < 0) + return -errno; - epoll_ctl(c->epollfd, EPOLL_CTL_MOD, move->from, &ev_from); - epoll_ctl(c->epollfd, EPOLL_CTL_MOD, move->to, &ev_to); + conn->seq_ack_from_tap = ack_seq; + return 0; } /** - * tcp_splice_destroy() - Close spliced connection and pipes, drop from epoll + * tcp_data_to_tap() - Finalise (queue) highest-numbered scatter-gather buffer * @c: Execution context * @conn: Connection pointer + * @plen: Payload length at L4 + * @no_csum: Don't compute IPv4 checksum, use the one from previous buffer + * @seq: Sequence number to be sent + * @now: Current timestamp */ -static void tcp_splice_destroy(struct ctx *c, struct tcp_splice_conn *conn) +static void tcp_data_to_tap(struct ctx *c, struct tcp_conn *conn, ssize_t plen, + int no_csum, uint32_t seq, struct timespec *now) { - int epoll_del_done = 0; - - switch (conn->state) { - case CLOSED: - epoll_del_done = 1; - 
/* Falls through */ - case SPLICE_FIN_BOTH: - case SPLICE_FIN_FROM: - case SPLICE_FIN_TO: - case SPLICE_ESTABLISHED: - /* Flushing might need to block: don't recycle them. */ - if (conn->pipe_from_to[0] != -1) { - close(conn->pipe_from_to[0]); - conn->pipe_from_to[0] = -1; - close(conn->pipe_from_to[1]); - conn->pipe_from_to[1] = -1; - } - if (conn->pipe_to_from[0] != -1) { - close(conn->pipe_to_from[0]); - conn->pipe_to_from[0] = -1; - close(conn->pipe_to_from[1]); - conn->pipe_to_from[1] = -1; - } - /* Falls through */ - case SPLICE_CONNECT: - if (!epoll_del_done) { - epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->from, NULL); - epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->to, NULL); - } - close(conn->to); - /* Falls through */ - case SPLICE_ACCEPTED: - close(conn->from); - tcp_splice_state(conn, CLOSED); - tcp_table_splice_compact(c, conn); - break; - default: - return; - } -} + struct iovec *iov; + size_t len; -/** - * tcp_sock_consume() - Consume (discard) data from buffer, update ACK sequence - * @conn: Connection pointer - * @ack_seq: ACK sequence, host order - */ -static void tcp_sock_consume(struct tcp_tap_conn *conn, uint32_t ack_seq) -{ - /* Simply ignore out-of-order ACKs: we already consumed the data we - * needed from the buffer, and we won't rewind back to a lower ACK - * sequence. - */ - if (SEQ_LE(ack_seq, conn->seq_ack_from_tap)) - return; + if (CONN_V4(conn)) { + struct tcp4_l2_buf_t *b = &tcp4_l2_buf[tcp4_l2_buf_used]; + uint16_t *check = no_csum ? 
&(b - 1)->iph.check : NULL; - recv(conn->sock, NULL, ack_seq - conn->seq_ack_from_tap, - MSG_DONTWAIT | MSG_TRUNC); + len = tcp_l2_buf_fill_headers(c, conn, b, plen, check, seq); - conn->seq_ack_from_tap = ack_seq; + iov = tcp4_l2_iov + tcp4_l2_buf_used++; + tcp4_l2_buf_bytes += iov->iov_len = len + sizeof(b->vnet_len); + if (tcp4_l2_buf_used > ARRAY_SIZE(tcp4_l2_buf) - 1) + tcp_l2_data_buf_flush(c, now); + } else if (CONN_V6(conn)) { + struct tcp6_l2_buf_t *b = &tcp6_l2_buf[tcp6_l2_buf_used]; + + len = tcp_l2_buf_fill_headers(c, conn, b, plen, NULL, seq); + + iov = tcp6_l2_iov + tcp6_l2_buf_used++; + tcp6_l2_buf_bytes += iov->iov_len = len + sizeof(b->vnet_len); + if (tcp6_l2_buf_used > ARRAY_SIZE(tcp6_l2_buf) - 1) + tcp_l2_data_buf_flush(c, now); + } } /** @@ -2183,12 +2201,11 @@ static void tcp_sock_consume(struct tcp_tap_conn *conn, uint32_t ack_seq) * * #syscalls recvmsg */ -static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn, +static int tcp_data_from_sock(struct ctx *c, struct tcp_conn *conn, struct timespec *now) { int fill_bufs, send_bufs = 0, last_len, iov_rem = 0; int sendlen, len, plen, v4 = CONN_V4(conn); - uint32_t seq_to_tap = conn->seq_to_tap; int s = conn->sock, i, ret = 0; struct msghdr mh_sock = { 0 }; uint32_t already_sent; @@ -2198,23 +2215,24 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn, if (SEQ_LT(already_sent, 0)) { /* RFC 761, section 2.1. */ - seq_to_tap = conn->seq_to_tap = conn->seq_ack_from_tap; + conn->seq_to_tap = conn->seq_ack_from_tap; already_sent = 0; } if (!conn->wnd_from_tap || already_sent >= conn->wnd_from_tap) { - tcp_tap_epoll_mask(c, conn, conn->events | EPOLLET); + conn_flag(c, conn, CONN_STALLED); conn->tap_data_noack = *now; return 0; } + /* Set up buffer descriptors we'll fill completely and partially. 
*/ fill_bufs = DIV_ROUND_UP(conn->wnd_from_tap - already_sent, - conn->mss_guest); - if (fill_bufs > TCP_TAP_FRAMES) { - fill_bufs = TCP_TAP_FRAMES; + conn->tap_mss); + if (fill_bufs > TCP_FRAMES) { + fill_bufs = TCP_FRAMES; iov_rem = 0; } else { - iov_rem = (conn->wnd_from_tap - already_sent) % conn->mss_guest; + iov_rem = (conn->wnd_from_tap - already_sent) % conn->tap_mss; } mh_sock.msg_iov = iov_sock; @@ -2225,19 +2243,19 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn, if (( v4 && tcp4_l2_buf_used + fill_bufs > ARRAY_SIZE(tcp4_l2_buf)) || (!v4 && tcp6_l2_buf_used + fill_bufs > ARRAY_SIZE(tcp6_l2_buf))) - tcp_l2_buf_flush(c); + tcp_l2_data_buf_flush(c, now); for (i = 0, iov = iov_sock + 1; i < fill_bufs; i++, iov++) { if (v4) iov->iov_base = &tcp4_l2_buf[tcp4_l2_buf_used + i].data; else iov->iov_base = &tcp6_l2_buf[tcp6_l2_buf_used + i].data; - iov->iov_len = conn->mss_guest; + iov->iov_len = conn->tap_mss; } if (iov_rem) iov_sock[fill_bufs].iov_len = iov_rem; - /* Don't dequeue until acknowledged by guest. */ + /* Receive into buffers, don't dequeue until acknowledged by guest. */ recvmsg: len = recvmsg(s, &mh_sock, MSG_PEEK); if (len < 0) { @@ -2251,117 +2269,57 @@ recvmsg: sendlen = len - already_sent; if (sendlen <= 0) { - tcp_tap_epoll_mask(c, conn, conn->events | EPOLLET); + conn_flag(c, conn, CONN_STALLED); return 0; } - tcp_tap_epoll_mask(c, conn, conn->events & ~EPOLLET); + conn_flag(c, conn, ~CONN_STALLED); - send_bufs = DIV_ROUND_UP(sendlen, conn->mss_guest); - last_len = sendlen - (send_bufs - 1) * conn->mss_guest; + send_bufs = DIV_ROUND_UP(sendlen, conn->tap_mss); + last_len = sendlen - (send_bufs - 1) * conn->tap_mss; /* Likely, some new data was acked too. 
*/ tcp_update_seqack_wnd(c, conn, 0, NULL); - plen = conn->mss_guest; + /* Finally, queue to tap */ + plen = conn->tap_mss; for (i = 0; i < send_bufs; i++) { - ssize_t eth_len; + int no_csum = i && i != send_bufs - 1 && tcp4_l2_buf_used; if (i == send_bufs - 1) plen = last_len; - if (v4) { - struct tcp4_l2_buf_t *b = &tcp4_l2_buf[tcp4_l2_buf_used]; - uint16_t *check = NULL; - - if (i && i != send_bufs - 1 && tcp4_l2_buf_used) - check = &(b - 1)->iph.check; - - eth_len = tcp_l2_buf_fill_headers(c, conn, b, plen, - check, seq_to_tap); - - if (c->mode == MODE_PASST) { - iov = tcp4_l2_iov_tap + tcp4_l2_buf_used++; - iov->iov_len = eth_len + sizeof(uint32_t); - tcp4_l2_buf_bytes += iov->iov_len; - - if (tcp4_l2_buf_used > - ARRAY_SIZE(tcp4_l2_buf) - 1) - tcp_l2_buf_flush(c); - - seq_to_tap += plen; - continue; - } - - pcap((char *)&b->eh, eth_len); - ret = write(c->fd_tap, &b->eh, eth_len); - } else { - struct tcp6_l2_buf_t *b = &tcp6_l2_buf[tcp6_l2_buf_used]; - - eth_len = tcp_l2_buf_fill_headers(c, conn, b, plen, - NULL, seq_to_tap); - - if (c->mode == MODE_PASST) { - iov = tcp6_l2_iov_tap + tcp6_l2_buf_used++; - iov->iov_len = eth_len + sizeof(uint32_t); - tcp6_l2_buf_bytes += iov->iov_len; - - if (tcp6_l2_buf_used > - ARRAY_SIZE(tcp6_l2_buf) - 1) - tcp_l2_buf_flush(c); - - seq_to_tap += plen; - continue; - } - - pcap((char *)&b->eh, eth_len); - ret = write(c->fd_tap, &b->eh, eth_len); - } - - if (ret < eth_len) { - if (ret < 0) { - if (errno == EAGAIN || errno == EWOULDBLOCK) - return 0; - - tap_handler(c, c->fd_tap, EPOLLERR, now); - } - - i--; - continue; - } - + tcp_data_to_tap(c, conn, plen, no_csum, conn->seq_to_tap, now); conn->seq_to_tap += plen; } - if (c->mode == MODE_PASTA) - return ret; - - conn->tap_data_noack = *now; - conn->seq_to_tap += conn->mss_guest * (send_bufs - 1) + last_len; - - conn->ts_ack_to_tap = *now; + conn->tap_data_noack = conn->ts_ack_to_tap = *now; return 0; err: if (errno != EAGAIN && errno != EWOULDBLOCK) { - tcp_rst(c, conn); ret = 
-errno; + tcp_rst(c, conn); } + return ret; zero_len: - if (conn->state == ESTABLISHED_SOCK_FIN) { - tcp_tap_epoll_mask(c, conn, EPOLLET); - tcp_send_to_tap(c, conn, FIN | ACK, now); - tcp_tap_state(conn, ESTABLISHED_SOCK_FIN_SENT); + if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) == SOCK_FIN_RCVD) { + if ((ret = tcp_send_flag(c, conn, FIN | ACK, now))) { + tcp_rst(c, conn); + return ret; + } + + conn_event(c, conn, TAP_FIN_SENT); } return 0; } /** - * tcp_data_from_tap() - tap data in ESTABLISHED{,SOCK_FIN}, CLOSE_WAIT states + * tcp_data_from_tap() - tap data for established connection * @c: Execution context * @conn: Connection pointer * @msg: Array of messages from tap @@ -2370,15 +2328,15 @@ zero_len: * * #syscalls sendmsg */ -static void tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn, +static void tcp_data_from_tap(struct ctx *c, struct tcp_conn *conn, struct tap_l4_msg *msg, int count, struct timespec *now) { int i, iov_i, ack = 0, fin = 0, retr = 0, keep = -1; - struct msghdr mh = { .msg_iov = tcp_tap_iov }; uint32_t max_ack_seq = conn->seq_ack_from_tap; uint16_t max_ack_seq_wnd = conn->wnd_from_tap; uint32_t seq_from_tap = conn->seq_from_tap; + struct msghdr mh = { .msg_iov = tcp_iov }; int partial_send = 0; uint16_t len; ssize_t n; @@ -2404,7 +2362,7 @@ static void tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn, } if (th->rst) { - tcp_tap_destroy(c, conn); + tcp_conn_destroy(c, conn); return; } @@ -2467,9 +2425,9 @@ static void tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn, continue; } - tcp_tap_iov[iov_i].iov_base = data + seq_offset; - tcp_tap_iov[iov_i].iov_len = len - seq_offset; - seq_from_tap += tcp_tap_iov[iov_i].iov_len; + tcp_iov[iov_i].iov_base = data + seq_offset; + tcp_iov[iov_i].iov_len = len - seq_offset; + seq_from_tap += tcp_iov[iov_i].iov_len; iov_i++; if (keep == i) @@ -2479,7 +2437,7 @@ static void tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn, i = keep - 1; } - 
tcp_clamp_window(conn, NULL, 0, max_ack_seq_wnd, 0); + tcp_clamp_window(c, conn, NULL, 0, max_ack_seq_wnd, 0); if (ack) { conn->ts_ack_from_tap = *now; @@ -2507,25 +2465,24 @@ eintr: * Then swiftly looked away and left. */ conn->seq_from_tap = seq_from_tap; - tcp_send_to_tap(c, conn, FORCE_ACK, now); + tcp_send_flag(c, conn, ACK, now); } if (errno == EINTR) goto eintr; if (errno == EAGAIN || errno == EWOULDBLOCK) { - tcp_send_to_tap(c, conn, 0, now); + tcp_send_flag(c, conn, ACK_IF_NEEDED, now); return; } tcp_rst(c, conn); return; } - if (n < (int)(seq_from_tap - conn->seq_from_tap)) { partial_send = 1; conn->seq_from_tap += n; - tcp_send_to_tap(c, conn, 0, now); + tcp_send_flag(c, conn, ACK_IF_NEEDED, now); } else { conn->seq_from_tap += n; } @@ -2534,34 +2491,52 @@ out: if (keep != -1) { if (conn->seq_dup_ack != conn->seq_from_tap) { conn->seq_dup_ack = conn->seq_from_tap; - tcp_send_to_tap(c, conn, DUP_ACK, now); + tcp_send_flag(c, conn, DUP_ACK, now); } return; } - if (ack) { - if (conn->state == ESTABLISHED_SOCK_FIN_SENT && - conn->seq_ack_from_tap == conn->seq_to_tap) - tcp_tap_state(conn, CLOSE_WAIT); - } + if (ack && conn->events & TAP_FIN_SENT && + conn->seq_ack_from_tap == conn->seq_to_tap) + conn_event(c, conn, TAP_FIN_ACKED); if (fin && !partial_send) { conn->seq_from_tap++; - if (conn->state == ESTABLISHED) { - shutdown(conn->sock, SHUT_WR); - tcp_tap_state(conn, FIN_WAIT_1); - tcp_send_to_tap(c, conn, ACK, now); - } else if (conn->state == CLOSE_WAIT) { - shutdown(conn->sock, SHUT_WR); - tcp_tap_state(conn, LAST_ACK); - tcp_send_to_tap(c, conn, ACK, now); - } + conn_event(c, conn, TAP_FIN_RCVD); } else { - tcp_send_to_tap(c, conn, 0, now); + tcp_send_flag(c, conn, ACK_IF_NEEDED, now); } } +/** + * tcp_conn_from_sock_finish() - Complete connection setup after connect() + * @c: Execution context + * @conn: Connection pointer + * @th: TCP header of SYN, ACK segment from tap/guest + * @len: Packet length of SYN, ACK segment at L4, host order + * @now: 
Current timestamp + */ +static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_conn *conn, + struct tcphdr *th, size_t len, + struct timespec *now) +{ + tcp_clamp_window(c, conn, th, len, 0, 1); + conn->tap_mss = tcp_conn_tap_mss(c, conn, th, len); + + conn->seq_init_from_tap = ntohl(th->seq) + 1; + conn->seq_from_tap = conn->seq_init_from_tap; + conn->seq_ack_to_tap = conn->seq_from_tap; + + conn_event(c, conn, ESTABLISHED); + + /* The client might have sent data already, which we didn't + * dequeue waiting for SYN,ACK from tap -- check now. + */ + tcp_data_from_sock(c, conn, now); + tcp_send_flag(c, conn, ACK_IF_NEEDED, now); +} + /** * tcp_tap_handler() - Handle packets from tap and state transitions * @c: Execution context @@ -2578,10 +2553,11 @@ int tcp_tap_handler(struct ctx *c, int af, void *addr, { struct tcphdr *th = (struct tcphdr *)(pkt_buf + msg[0].pkt_buf_offset); uint16_t len = msg[0].l4_len; - struct tcp_tap_conn *conn; - int mss; + struct tcp_conn *conn; conn = tcp_hash_lookup(c, af, addr, htons(th->source), htons(th->dest)); + + /* New connection from tap */ if (!conn) { if (th->syn && !th->ack) tcp_conn_from_tap(c, af, addr, th, len, now); @@ -2589,59 +2565,40 @@ int tcp_tap_handler(struct ctx *c, int af, void *addr, } if (th->rst) { - tcp_tap_destroy(c, conn); + tcp_conn_destroy(c, conn); return count; } conn->ts_tap_act = *now; + conn_flag(c, conn, ~CONN_STALLED); - switch (conn->state) { - case SOCK_SYN_SENT: - if (!th->syn || !th->ack) { + /* Establishing connection from socket */ + if (conn->events & SOCK_ACCEPTED) { + if (th->syn && th->ack && !th->fin) + tcp_conn_from_sock_finish(c, conn, th, len, now); + else tcp_rst(c, conn); - return count; - } - tcp_clamp_window(conn, th, len, 0, 1); - - if ((mss = tcp_opt_get(th, len, OPT_MSS, NULL, NULL)) < 0) - conn->mss_guest = MSS_DEFAULT; - else - conn->mss_guest = mss; + return 1; + } - /* Don't upset qemu */ - if (c->mode == MODE_PASST) { - if (af == AF_INET) - conn->mss_guest = 
MIN(MSS4, conn->mss_guest); - else - conn->mss_guest = MIN(MSS6, conn->mss_guest); + /* Establishing connection from tap */ + if (conn->events & TAP_SYN_RCVD) { + if (!(conn->events & TAP_SYN_ACK_SENT)) { + tcp_rst(c, conn); + return count; } - /* tinfo.tcpi_bytes_acked already includes one byte for SYN, but - * not for incoming connections. - */ - conn->seq_init_from_tap = ntohl(th->seq) + 1; - conn->seq_from_tap = conn->seq_init_from_tap; - conn->seq_ack_to_tap = conn->seq_from_tap; - - tcp_tap_state(conn, ESTABLISHED); + conn_event(c, conn, ESTABLISHED); - /* The client might have sent data already, which we didn't - * dequeue waiting for SYN,ACK from tap -- check now. - */ - tcp_data_from_sock(c, conn, now); - tcp_send_to_tap(c, conn, 0, now); - - tcp_tap_epoll_mask(c, conn, EPOLLIN | EPOLLRDHUP); - break; - case TAP_SYN_RCVD: if (th->fin) { conn->seq_from_tap++; shutdown(conn->sock, SHUT_WR); - tcp_send_to_tap(c, conn, ACK, now); - tcp_tap_state(conn, FIN_WAIT_1); - break; + tcp_send_flag(c, conn, ACK, now); + conn_event(c, conn, SOCK_FIN_SENT); + + return count; } if (!th->ack) { @@ -2649,275 +2606,62 @@ int tcp_tap_handler(struct ctx *c, int af, void *addr, return count; } - tcp_clamp_window(conn, th, len, 0, 0); + tcp_clamp_window(c, conn, th, len, 0, 0); - tcp_tap_state(conn, ESTABLISHED); if (count == 1) - break; + return 1; + } - /* Falls through */ - case ESTABLISHED: - case ESTABLISHED_SOCK_FIN: - case ESTABLISHED_SOCK_FIN_SENT: - tcp_tap_epoll_mask(c, conn, conn->events & ~EPOLLET); - tcp_data_from_tap(c, conn, msg, count, now); - return count; - case CLOSE_WAIT: - case FIN_WAIT_1_SOCK_FIN: - case FIN_WAIT_1: + tcp_sock_consume(conn, ntohl(th->ack_seq)); + + /* Established connections not accepting data from tap */ + if (conn->events & TAP_FIN_RCVD) { if (th->ack) { conn->tap_data_noack = ((struct timespec) { 0, 0 }); conn->ts_ack_from_tap = *now; } - tcp_sock_consume(conn, ntohl(th->ack_seq)); - if (conn->state == FIN_WAIT_1_SOCK_FIN && - 
conn->seq_ack_from_tap == conn->seq_to_tap) { - tcp_tap_destroy(c, conn); - return count; - } + if (conn->events & SOCK_FIN_RCVD && + conn->seq_ack_from_tap == conn->seq_to_tap) + tcp_conn_destroy(c, conn); - tcp_tap_epoll_mask(c, conn, conn->events & ~EPOLLET); - return count; - case TAP_SYN_SENT: - case LAST_ACK: - case SPLICE_ACCEPTED: - case SPLICE_CONNECT: - case SPLICE_ESTABLISHED: - case SPLICE_FIN_FROM: - case SPLICE_FIN_TO: - case SPLICE_FIN_BOTH: - case CLOSED: /* ;) */ - break; - } - - return 1; + return 1; + } + + /* Established connections accepting data from tap */ + tcp_data_from_tap(c, conn, msg, count, now); + + if ((conn->events & TAP_FIN_RCVD) && !(conn->events & SOCK_FIN_SENT)) { + shutdown(conn->sock, SHUT_WR); + conn_event(c, conn, SOCK_FIN_SENT); + tcp_send_flag(c, conn, ACK, now); + } + + return count; } /** * tcp_connect_finish() - Handle completion of connect() from EPOLLOUT event * @c: Execution context - * @s: File descriptor number for socket + * @conn: Connection pointer * @now: Current timestamp */ -static void tcp_connect_finish(struct ctx *c, struct tcp_tap_conn *conn, +static void tcp_connect_finish(struct ctx *c, struct tcp_conn *conn, struct timespec *now) { socklen_t sl; int so; - /* Drop EPOLLOUT, only used to wait for connect() to complete */ - tcp_tap_epoll_mask(c, conn, EPOLLIN | EPOLLRDHUP); - sl = sizeof(so); if (getsockopt(conn->sock, SOL_SOCKET, SO_ERROR, &so, &sl) || so) { tcp_rst(c, conn); return; } - if (tcp_send_to_tap(c, conn, SYN | ACK, now)) + if (tcp_send_flag(c, conn, SYN | ACK, now)) return; - tcp_tap_state(conn, TAP_SYN_RCVD); -} - -/** - * tcp_splice_connect_finish() - Completion of connect() or call on success - * @c: Execution context - * @conn: Connection pointer - * @v6: Set on IPv6 connection - */ -static void tcp_splice_connect_finish(struct ctx *c, - struct tcp_splice_conn *conn, int v6) -{ - union epoll_ref ref_from = { .r.proto = IPPROTO_TCP, .r.s = conn->from, - .r.p.tcp.tcp = { .splice = 1, .v6 = 
v6, - .index = conn - ts } }; - union epoll_ref ref_to = { .r.proto = IPPROTO_TCP, .r.s = conn->to, - .r.p.tcp.tcp = { .splice = 1, .v6 = v6, - .index = conn - ts } }; - struct epoll_event ev_from, ev_to; - int i; - - conn->pipe_from_to[0] = conn->pipe_to_from[0] = -1; - conn->pipe_from_to[1] = conn->pipe_to_from[1] = -1; - for (i = 0; i < TCP_SPLICE_PIPE_POOL_SIZE; i++) { - if (splice_pipe_pool[i][0][0] > 0) { - SWAP(conn->pipe_from_to[0], splice_pipe_pool[i][0][0]); - SWAP(conn->pipe_from_to[1], splice_pipe_pool[i][0][1]); - - SWAP(conn->pipe_to_from[0], splice_pipe_pool[i][1][0]); - SWAP(conn->pipe_to_from[1], splice_pipe_pool[i][1][1]); - break; - } - } - - if (conn->pipe_from_to[0] < 0) { - if (pipe2(conn->pipe_to_from, O_NONBLOCK) || - pipe2(conn->pipe_from_to, O_NONBLOCK)) { - tcp_splice_destroy(c, conn); - return; - } - - fcntl(conn->pipe_from_to[0], F_SETPIPE_SZ, c->tcp.pipe_size); - fcntl(conn->pipe_to_from[0], F_SETPIPE_SZ, c->tcp.pipe_size); - } - - if (conn->state == SPLICE_CONNECT) { - tcp_splice_state(conn, SPLICE_ESTABLISHED); - - ev_from.events = ev_to.events = EPOLLIN | EPOLLRDHUP; - ev_from.data.u64 = ref_from.u64; - ev_to.data.u64 = ref_to.u64; - - epoll_ctl(c->epollfd, EPOLL_CTL_ADD, conn->from, &ev_from); - epoll_ctl(c->epollfd, EPOLL_CTL_MOD, conn->to, &ev_to); - } -} - -/** - * tcp_splice_connect() - Create and connect socket for new spliced connection - * @c: Execution context - * @conn: Connection pointer - * @v6: Set on IPv6 connection - * @port: Destination port, host order - * - * Return: 0 for connect() succeeded or in progress, negative value on error - */ -static int tcp_splice_connect(struct ctx *c, struct tcp_splice_conn *conn, - int s, int v6, in_port_t port) -{ - int sock_conn = (s >= 0) ? s : socket(v6 ? 
AF_INET6 : AF_INET, - SOCK_STREAM | SOCK_NONBLOCK, - IPPROTO_TCP); - union epoll_ref ref_accept = { .r.proto = IPPROTO_TCP, - .r.s = conn->from, - .r.p.tcp.tcp = { .splice = 1, .v6 = v6, - .index = conn - ts } }; - union epoll_ref ref_conn = { .r.proto = IPPROTO_TCP, .r.s = sock_conn, - .r.p.tcp.tcp = { .splice = 1, .v6 = v6, - .index = conn - ts } }; - struct epoll_event ev_accept = { .data.u64 = ref_accept.u64 }; - struct epoll_event ev_conn = { .data.u64 = ref_conn.u64 }; - struct sockaddr_in6 addr6 = { - .sin6_family = AF_INET6, - .sin6_port = htons(port), - .sin6_addr = IN6ADDR_LOOPBACK_INIT, - }; - struct sockaddr_in addr4 = { - .sin_family = AF_INET, - .sin_port = htons(port), - .sin_addr = { .s_addr = htonl(INADDR_LOOPBACK) }, - }; - const struct sockaddr *sa; - socklen_t sl; - int one = 1; - - conn->to = sock_conn; - - if (s < 0) - tcp_sock_set_bufsize(c, conn->to); - - setsockopt(conn->to, SOL_TCP, TCP_QUICKACK, &one, sizeof(one)); - - if (v6) { - sa = (struct sockaddr *)&addr6; - sl = sizeof(addr6); - } else { - sa = (struct sockaddr *)&addr4; - sl = sizeof(addr4); - } - - if (connect(conn->to, sa, sl)) { - if (errno != EINPROGRESS) { - int ret = -errno; - - close(sock_conn); - return ret; - } - - tcp_splice_state(conn, SPLICE_CONNECT); - ev_conn.events = EPOLLOUT; - } else { - tcp_splice_state(conn, SPLICE_ESTABLISHED); - tcp_splice_connect_finish(c, conn, v6); - - ev_accept.events = EPOLLIN | EPOLLOUT | EPOLLRDHUP; - ev_conn.events = EPOLLIN | EPOLLOUT | EPOLLRDHUP; - - epoll_ctl(c->epollfd, EPOLL_CTL_ADD, conn->from, &ev_accept); - } - - epoll_ctl(c->epollfd, EPOLL_CTL_ADD, conn->to, &ev_conn); - - return 0; -} - -/** - * struct tcp_splice_connect_ns_arg - Arguments for tcp_splice_connect_ns() - * @c: Execution context - * @conn: Accepted inbound connection - * @v6: Set for inbound IPv6 connection - * @port: Destination port, host order - * @ret: Return value of tcp_splice_connect_ns() - */ -struct tcp_splice_connect_ns_arg { - struct ctx *c; - struct 
tcp_splice_conn *conn; - int v6; - in_port_t port; - int ret; -}; - -/** - * tcp_splice_connect_ns() - Enter namespace and call tcp_splice_connect() - * @arg: See struct tcp_splice_connect_ns_arg - * - * Return: 0 - */ -static int tcp_splice_connect_ns(void *arg) -{ - struct tcp_splice_connect_ns_arg *a; - - a = (struct tcp_splice_connect_ns_arg *)arg; - ns_enter(a->c); - a->ret = tcp_splice_connect(a->c, a->conn, -1, a->v6, a->port); - return 0; -} - -/** - * tcp_splice_new() - Handle new inbound, spliced connection - * @c: Execution context - * @conn: Connection pointer - * @v6: Set for IPv6 connection - * @port: Destination port, host order - * - * Return: return code from connect() - */ -static int tcp_splice_new(struct ctx *c, struct tcp_splice_conn *conn, - int v6, in_port_t port) -{ - struct tcp_splice_connect_ns_arg ns_arg = { c, conn, v6, port, 0 }; - int *sock_pool_p, i, s = -1; - - if (bitmap_isset(c->tcp.port_to_tap, port)) - sock_pool_p = v6 ? ns_sock_pool6 : ns_sock_pool4; - else - sock_pool_p = v6 ? 
init_sock_pool6 : init_sock_pool4; - - for (i = 0; i < TCP_SOCK_POOL_SIZE; i++, sock_pool_p++) { - if ((s = *sock_pool_p) >= 0) { - *sock_pool_p = -1; - break; - } - } - - if (s < 0 && bitmap_isset(c->tcp.port_to_tap, port)) { - NS_CALL(tcp_splice_connect_ns, &ns_arg); - return ns_arg.ret; - } - - return tcp_splice_connect(c, conn, s, v6, port); + conn_event(c, conn, TAP_SYN_ACK_SENT); } /** @@ -2929,15 +2673,12 @@ static int tcp_splice_new(struct ctx *c, struct tcp_splice_conn *conn, static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref, struct timespec *now) { - union epoll_ref ref_conn = { .r.proto = IPPROTO_TCP, - .r.p.tcp.tcp.v6 = ref.r.p.tcp.tcp.v6 }; struct sockaddr_storage sa; - struct tcp_tap_conn *conn; - struct epoll_event ev; + struct tcp_conn *conn; socklen_t sl; int s; - if (c->tcp.tap_conn_count >= MAX_TAP_CONNS) + if (c->tcp.conn_count >= TCP_MAX_CONNS) return; sl = sizeof(sa); @@ -2945,9 +2686,10 @@ static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref, if (s < 0) return; - conn = &tt[c->tcp.tap_conn_count++]; - ref_conn.r.p.tcp.tcp.index = conn - tt; - ref_conn.r.s = conn->sock = s; + conn = CONN(c->tcp.conn_count++); + conn->sock = s; + + conn_event(c, conn, SOCK_ACCEPTED); if (ref.r.p.tcp.tcp.v6) { struct sockaddr_in6 sa6; @@ -3015,266 +2757,11 @@ static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref, conn->ts_sock_act = conn->ts_tap_act = *now; conn->ts_ack_from_tap = conn->ts_ack_to_tap = *now; - tcp_send_to_tap(c, conn, SYN, now); - - conn->events = ev.events = EPOLLRDHUP; - ev.data.u64 = ref_conn.u64; - epoll_ctl(c->epollfd, EPOLL_CTL_ADD, conn->sock, &ev); - - tcp_tap_state(conn, SOCK_SYN_SENT); + tcp_send_flag(c, conn, SYN, now); tcp_get_sndbuf(conn); } -/** - * tcp_sock_handler_splice() - Handler for socket mapped to spliced connection - * @c: Execution context - * @ref: epoll reference - * @events: epoll events bitmap - * - * #syscalls:pasta splice - */ -void tcp_sock_handler_splice(struct ctx *c, 
union epoll_ref ref, - uint32_t events) -{ - int move_from, move_to, *pipes, eof, never_read; - uint8_t *rcvlowat_set, *rcvlowat_act; - uint64_t *seq_read, *seq_write; - struct tcp_splice_conn *conn; - struct epoll_event ev; - - if (ref.r.p.tcp.tcp.listen) { - int s, one = 1; - - if (c->tcp.splice_conn_count >= MAX_SPLICE_CONNS) - return; - - if ((s = accept4(ref.r.s, NULL, NULL, SOCK_NONBLOCK)) < 0) - return; - - setsockopt(s, SOL_TCP, TCP_QUICKACK, &one, sizeof(one)); - - conn = &ts[c->tcp.splice_conn_count++]; - conn->from = s; - tcp_splice_state(conn, SPLICE_ACCEPTED); - - if (tcp_splice_new(c, conn, ref.r.p.tcp.tcp.v6, - ref.r.p.tcp.tcp.index)) - tcp_splice_destroy(c, conn); - - return; - } - - conn = &ts[ref.r.p.tcp.tcp.index]; - - if (events & EPOLLERR) - goto close; - - if (conn->state == SPLICE_CONNECT && (events & EPOLLHUP)) - goto close; - - if (events & EPOLLOUT) { - ev.events = EPOLLIN | EPOLLRDHUP; - ev.data.u64 = ref.u64; - - if (conn->state == SPLICE_CONNECT) - tcp_splice_connect_finish(c, conn, ref.r.p.tcp.tcp.v6); - else if (conn->state == SPLICE_ESTABLISHED) - epoll_ctl(c->epollfd, EPOLL_CTL_MOD, ref.r.s, &ev); - - move_to = ref.r.s; - if (ref.r.s == conn->to) { - move_from = conn->from; - pipes = conn->pipe_from_to; - } else { - move_from = conn->to; - pipes = conn->pipe_to_from; - } - } else { - move_from = ref.r.s; - if (ref.r.s == conn->from) { - move_to = conn->to; - pipes = conn->pipe_from_to; - } else { - move_to = conn->from; - pipes = conn->pipe_to_from; - } - } - - if (events & EPOLLRDHUP) { - if (ref.r.s == conn->from) { - if (conn->state == SPLICE_ESTABLISHED) - tcp_splice_state(conn, SPLICE_FIN_FROM); - else if (conn->state == SPLICE_FIN_TO) - tcp_splice_state(conn, SPLICE_FIN_BOTH); - } else { - if (conn->state == SPLICE_ESTABLISHED) - tcp_splice_state(conn, SPLICE_FIN_TO); - else if (conn->state == SPLICE_FIN_FROM) - tcp_splice_state(conn, SPLICE_FIN_BOTH); - } - } - -swap: - eof = 0; - never_read = 1; - - if (move_from == 
conn->from) { - seq_read = &conn->from_read; - seq_write = &conn->from_written; - rcvlowat_set = splice_rcvlowat_set[0]; - rcvlowat_act = splice_rcvlowat_act[0]; - } else { - seq_read = &conn->to_read; - seq_write = &conn->to_written; - rcvlowat_set = splice_rcvlowat_set[1]; - rcvlowat_act = splice_rcvlowat_act[1]; - } - - - while (1) { - int retry_write = 0, more = 0; - ssize_t readlen, to_write = 0, written; - -retry: - readlen = splice(move_from, NULL, pipes[1], NULL, - c->tcp.pipe_size, - SPLICE_F_MOVE | SPLICE_F_NONBLOCK); - if (readlen < 0) { - if (errno == EINTR) - goto retry; - - if (errno != EAGAIN) - goto close; - - to_write = c->tcp.pipe_size; - } else if (!readlen) { - eof = 1; - to_write = c->tcp.pipe_size; - } else { - never_read = 0; - to_write += readlen; - if (readlen >= (long)c->tcp.pipe_size * 90 / 100) - more = SPLICE_F_MORE; - - if (bitmap_isset(rcvlowat_set, conn - ts)) - bitmap_set(rcvlowat_act, conn - ts); - } - -eintr: - written = splice(pipes[0], NULL, move_to, NULL, to_write, - SPLICE_F_MOVE | more | SPLICE_F_NONBLOCK); - - /* Most common case: skip updating counters. */ - if (readlen > 0 && readlen == written) { - if (readlen >= (long)c->tcp.pipe_size * 10 / 100) - continue; - - if (!bitmap_isset(rcvlowat_set, conn - ts) && - readlen > (long)c->tcp.pipe_size / 10) { - int lowat = c->tcp.pipe_size / 4; - - setsockopt(move_from, SOL_SOCKET, SO_RCVLOWAT, - &lowat, sizeof(lowat)); - - bitmap_set(rcvlowat_set, conn - ts); - bitmap_set(rcvlowat_act, conn - ts); - } - - break; - } - - *seq_read += readlen > 0 ? readlen : 0; - *seq_write += written > 0 ? 
written : 0; - - if (written < 0) { - if (errno == EINTR) - goto eintr; - - if (errno != EAGAIN) - goto close; - - if (never_read) - break; - - if (retry_write--) - goto retry; - - ev.events = EPOLLIN | EPOLLOUT | EPOLLRDHUP; - ref.r.s = move_to; - ev.data.u64 = ref.u64, - epoll_ctl(c->epollfd, EPOLL_CTL_MOD, move_to, &ev); - break; - } - - if (never_read && written == (long)(c->tcp.pipe_size)) - goto retry; - - if (!never_read && written < to_write) { - to_write -= written; - goto retry; - } - - if (eof) - break; - } - - if (*seq_read == *seq_write) { - if (move_from == conn->from && - (conn->state == SPLICE_FIN_FROM || - conn->state == SPLICE_FIN_BOTH)) { - if (!conn->from_fin_sent) { - shutdown(conn->to, SHUT_WR); - conn->from_fin_sent = 1; - - ev.events = 0; - ref.r.s = move_from; - ev.data.u64 = ref.u64, - epoll_ctl(c->epollfd, EPOLL_CTL_MOD, - move_from, &ev); - } - - if (conn->to_fin_sent) - goto close; - } else if (move_from == conn->to && - (conn->state == SPLICE_FIN_TO || - conn->state == SPLICE_FIN_BOTH)) { - if (!conn->to_fin_sent) { - shutdown(conn->from, SHUT_WR); - conn->to_fin_sent = 1; - - ev.events = 0; - ref.r.s = move_from; - ev.data.u64 = ref.u64, - epoll_ctl(c->epollfd, EPOLL_CTL_MOD, - move_from, &ev); - } - - if (conn->from_fin_sent) - goto close; - } - } - - if ((events & (EPOLLIN | EPOLLOUT)) == (EPOLLIN | EPOLLOUT)) { - events = EPOLLIN; - - SWAP(move_from, move_to); - if (pipes == conn->pipe_from_to) - pipes = conn->pipe_to_from; - else - pipes = conn->pipe_from_to; - - goto swap; - } - - return; - -close: - epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->from, NULL); - epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->to, NULL); - conn->state = CLOSED; -} - /** * tcp_sock_handler() - Handle new data from socket * @c: Execution context @@ -3285,7 +2772,7 @@ close: void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events, struct timespec *now) { - struct tcp_tap_conn *conn; + struct tcp_conn *conn; if (ref.r.p.tcp.tcp.splice) { 
tcp_sock_handler_splice(c, ref, events); @@ -3297,110 +2784,52 @@ void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events, return; } - conn = &tt[ref.r.p.tcp.tcp.index]; + if (!(conn = CONN(ref.r.p.tcp.tcp.index))) + return; conn->ts_sock_act = *now; if (events & EPOLLERR) { - if (conn->state != CLOSED) - tcp_rst(c, conn); - + tcp_rst(c, conn); return; } - switch (conn->state) { - case TAP_SYN_SENT: - if (events & EPOLLOUT) - tcp_connect_finish(c, conn, now); - else - tcp_rst(c, conn); - return; - case ESTABLISHED_SOCK_FIN: - case ESTABLISHED_SOCK_FIN_SENT: - case ESTABLISHED: - if (events & EPOLLRDHUP) { - if (conn->state == ESTABLISHED) - tcp_tap_state(conn, ESTABLISHED_SOCK_FIN); - } - tcp_data_from_sock(c, conn, now); - return; - case LAST_ACK: - tcp_send_to_tap(c, conn, 0, now); - if (conn->seq_ack_to_tap == conn->seq_from_tap + 1 || - conn->seq_ack_to_tap == conn->seq_from_tap) - tcp_tap_destroy(c, conn); - return; - case FIN_WAIT_1: - if (events & EPOLLIN) - tcp_data_from_sock(c, conn, now); - if (events & EPOLLRDHUP) { - tcp_send_to_tap(c, conn, FIN | ACK, now); - tcp_tap_state(conn, FIN_WAIT_1_SOCK_FIN); - } - return; - case CLOSE_WAIT: - case FIN_WAIT_1_SOCK_FIN: - if (events & EPOLLIN) - tcp_data_from_sock(c, conn, now); - if (events & EPOLLHUP) { - if ((conn->seq_ack_to_tap == conn->seq_from_tap + 1 || - conn->seq_ack_to_tap == conn->seq_from_tap) && - (conn->seq_ack_from_tap == conn->seq_to_tap - 1 || - conn->seq_ack_from_tap == conn->seq_to_tap)) { - tcp_tap_destroy(c, conn); - } else { - tcp_send_to_tap(c, conn, ACK, now); - } - } + if ((conn->events & TAP_FIN_SENT) && (events & EPOLLHUP)) { + tcp_conn_destroy(c, conn); return; - case TAP_SYN_RCVD: - case SOCK_SYN_SENT: - case SPLICE_ACCEPTED: - case SPLICE_CONNECT: - case SPLICE_ESTABLISHED: - case SPLICE_FIN_FROM: - case SPLICE_FIN_TO: - case SPLICE_FIN_BOTH: - case CLOSED: - break; } -} -/** - * tcp_set_pipe_size() - Set usable pipe size, probe starting from MAX_PIPE_SIZE - * @c: 
Execution context - */ -static void tcp_set_pipe_size(struct ctx *c) -{ - int probe_pipe[TCP_SPLICE_PIPE_POOL_SIZE * 2][2], i, j; + if (conn->events & ESTABLISHED) { + if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED)) + tcp_conn_destroy(c, conn); - c->tcp.pipe_size = MAX_PIPE_SIZE; + if (events & (EPOLLRDHUP | EPOLLHUP)) + conn_event(c, conn, SOCK_FIN_RCVD); -smaller: - for (i = 0; i < TCP_SPLICE_PIPE_POOL_SIZE * 2; i++) { - if (pipe2(probe_pipe[i], 0)) { - i++; - break; - } + if (events & EPOLLIN) + tcp_data_from_sock(c, conn, now); - if (fcntl(probe_pipe[i][0], F_SETPIPE_SZ, c->tcp.pipe_size) < 0) - break; - } + if (events & EPOLLOUT) + tcp_update_seqack_wnd(c, conn, 0, NULL); - for (j = i - 1; j >= 0; j--) { - close(probe_pipe[j][0]); - close(probe_pipe[j][1]); + return; } - if (i == TCP_SPLICE_PIPE_POOL_SIZE * 2) + /* EPOLLHUP during handshake: reset */ + if (events & EPOLLHUP) { + tcp_rst(c, conn); return; + } - if (!(c->tcp.pipe_size /= 2)) { - c->tcp.pipe_size = MAX_PIPE_SIZE; + /* Data during handshake tap-side: check later */ + if (conn->events & SOCK_ACCEPTED) return; - } - goto smaller; + if (conn->events == TAP_SYN_RCVD) { + if (events & EPOLLOUT) + tcp_connect_finish(c, conn, now); + /* Data? 
Check later */ + } } /** @@ -3516,32 +2945,6 @@ static int tcp_sock_init_ns(void *arg) return 0; } -/** - * tcp_splice_pipe_refill() - Refill pool of pre-opened pipes - * @c: Execution context - */ -static void tcp_splice_pipe_refill(struct ctx *c) -{ - int i; - - for (i = 0; i < TCP_SPLICE_PIPE_POOL_SIZE; i++) { - if (splice_pipe_pool[i][0][0] >= 0) - break; - if (pipe2(splice_pipe_pool[i][0], O_NONBLOCK)) - continue; - if (pipe2(splice_pipe_pool[i][1], O_NONBLOCK)) { - close(splice_pipe_pool[i][1][0]); - close(splice_pipe_pool[i][1][1]); - continue; - } - - fcntl(splice_pipe_pool[i][0][0], F_SETPIPE_SZ, - c->tcp.pipe_size); - fcntl(splice_pipe_pool[i][1][0], F_SETPIPE_SZ, - c->tcp.pipe_size); - } -} - /** * struct tcp_sock_refill_arg - Arguments for tcp_sock_refill() * @c: Execution context @@ -3637,8 +3040,8 @@ int tcp_sock_init(struct ctx *c, struct timespec *now) tcp_sock_init_one(c, 0, port); } - for (i = 0; i < ARRAY_SIZE(tcp_l2_mh_tap); i++) - tcp_l2_mh_tap[i] = (struct mmsghdr) { .msg_hdr.msg_iovlen = 1 }; + for (i = 0; i < ARRAY_SIZE(tcp_l2_mh); i++) + tcp_l2_mh[i] = (struct mmsghdr) { .msg_hdr.msg_iovlen = 1 }; if (c->v4) tcp_sock4_iov_init(); @@ -3646,7 +3049,6 @@ int tcp_sock_init(struct ctx *c, struct timespec *now) if (c->v6) tcp_sock6_iov_init(); - memset(splice_pipe_pool, 0xff, sizeof(splice_pipe_pool)); memset(init_sock_pool4, 0xff, sizeof(init_sock_pool4)); memset(init_sock_pool6, 0xff, sizeof(init_sock_pool6)); memset(ns_sock_pool4, 0xff, sizeof(ns_sock_pool4)); @@ -3659,12 +3061,12 @@ int tcp_sock_init(struct ctx *c, struct timespec *now) tcp_sock_refill(&refill_arg); if (c->mode == MODE_PASTA) { - tcp_set_pipe_size(c); + tcp_splice_init(c); + NS_CALL(tcp_sock_init_ns, c); refill_arg.ns = 1; NS_CALL(tcp_sock_refill, &refill_arg); - tcp_splice_pipe_refill(c); c->tcp.port_detect_ts = *now; } @@ -3678,7 +3080,7 @@ int tcp_sock_init(struct ctx *c, struct timespec *now) * @conn: Connection pointer * @ts: Timestamp from caller */ -static void 
tcp_timer_one(struct ctx *c, struct tcp_tap_conn *conn, +static void tcp_timer_one(struct ctx *c, struct tcp_conn *conn, struct timespec *ts) { int ack_from_tap = timespec_diff_ms(ts, &conn->ts_ack_from_tap); @@ -3693,67 +3095,49 @@ static void tcp_timer_one(struct ctx *c, struct tcp_tap_conn *conn, else tap_data_noack = timespec_diff_ms(ts, &conn->tap_data_noack); - switch (conn->state) { - case CLOSED: + if (CONN_IS_CLOSED(conn)) { tcp_hash_remove(conn); - tcp_table_tap_compact(c, conn); - break; - case SOCK_SYN_SENT: - case TAP_SYN_RCVD: + tcp_table_compact(c, conn); + return; + } + + if (!(conn->events & ESTABLISHED)) { if (ack_from_tap > SYN_TIMEOUT) tcp_rst(c, conn); + return; + } - break; - case ESTABLISHED_SOCK_FIN_SENT: - if (tap_data_noack > FIN_TIMEOUT) { - tcp_rst(c, conn); - break; - } - /* Falls through */ - case ESTABLISHED: - case ESTABLISHED_SOCK_FIN: - if (tap_act > ACT_TIMEOUT && sock_act > ACT_TIMEOUT) { - tcp_rst(c, conn); - break; - } + if (tap_act > ACT_TIMEOUT && sock_act > ACT_TIMEOUT) + goto rst; - if (!conn->wnd_to_tap || ack_to_tap > ACK_INTERVAL) - tcp_send_to_tap(c, conn, 0, ts); + if (!conn->wnd_to_tap || ack_to_tap > ACK_INTERVAL) + tcp_send_flag(c, conn, ACK_IF_NEEDED, ts); - if (tap_data_noack > ACK_TIMEOUT) { - if (conn->seq_ack_from_tap < conn->seq_to_tap) { - if (tap_data_noack > LAST_ACK_TIMEOUT) { - tcp_rst(c, conn); - break; - } + if (tap_data_noack > ACK_TIMEOUT) { + if (conn->seq_ack_from_tap < conn->seq_to_tap) { + if (tap_data_noack > LAST_ACK_TIMEOUT) + goto rst; - conn->seq_to_tap = conn->seq_ack_from_tap; - tcp_data_from_sock(c, conn, ts); - } + conn->seq_to_tap = conn->seq_ack_from_tap; + tcp_data_from_sock(c, conn, ts); } - break; - case CLOSE_WAIT: - case FIN_WAIT_1_SOCK_FIN: - if (tap_data_noack > FIN_TIMEOUT) - tcp_rst(c, conn); - break; - case FIN_WAIT_1: - if (sock_act > FIN_TIMEOUT) - tcp_rst(c, conn); - break; - case LAST_ACK: + return; + } + + if (conn->events & TAP_FIN_SENT && tap_data_noack > FIN_TIMEOUT) + 
goto rst; + + if (conn->events & SOCK_FIN_SENT && sock_act > FIN_TIMEOUT) + goto rst; + + if (conn->events & SOCK_FIN_SENT && conn->events & SOCK_FIN_RCVD) { if (sock_act > LAST_ACK_TIMEOUT || tap_act > LAST_ACK_TIMEOUT) - tcp_rst(c, conn); - break; - case TAP_SYN_SENT: - case SPLICE_ACCEPTED: - case SPLICE_CONNECT: - case SPLICE_ESTABLISHED: - case SPLICE_FIN_FROM: - case SPLICE_FIN_TO: - case SPLICE_FIN_BOTH: - break; + goto rst; } + + return; +rst: + tcp_rst(c, conn); } /** @@ -3904,6 +3288,8 @@ void tcp_timer(struct ctx *c, struct timespec *now) c->tcp.port_detect_ts = *now; } + + tcp_splice_timer(c, now); } if (timespec_diff_ms(now, &c->tcp.refill_ts) > REFILL_INTERVAL) { @@ -3913,41 +3299,9 @@ void tcp_timer(struct ctx *c, struct timespec *now) if ((c->v4 && ns_sock_pool4[TCP_SOCK_POOL_TSH] < 0) || (c->v6 && ns_sock_pool6[TCP_SOCK_POOL_TSH] < 0)) NS_CALL(tcp_sock_refill, &refill_arg); - - tcp_splice_pipe_refill(c); } } - for (i = c->tcp.tap_conn_count - 1; i >= 0; i--) - tcp_timer_one(c, tt + i, now); - - if (c->mode == MODE_PASTA) { - for (i = c->tcp.splice_conn_count - 1; i >= 0; i--) { - if ((ts + i)->state == CLOSED) { - tcp_splice_destroy(c, ts + i); - continue; - } - - if (bitmap_isset(splice_rcvlowat_set[0], i) && - !bitmap_isset(splice_rcvlowat_act[0], i)) { - int lowat = 1; - - setsockopt((ts + i)->from, SOL_SOCKET, - SO_RCVLOWAT, &lowat, sizeof(lowat)); - bitmap_clear(splice_rcvlowat_set[0], i); - } - - if (bitmap_isset(splice_rcvlowat_set[1], i) && - !bitmap_isset(splice_rcvlowat_act[1], i)) { - int lowat = 1; - - setsockopt((ts + i)->to, SOL_SOCKET, - SO_RCVLOWAT, &lowat, sizeof(lowat)); - bitmap_clear(splice_rcvlowat_set[1], i); - } - - bitmap_clear(splice_rcvlowat_act[0], i); - bitmap_clear(splice_rcvlowat_act[1], i); - } - } + for (i = c->tcp.conn_count - 1; i >= 0; i--) + tcp_timer_one(c, CONN(i), now); } diff --git a/tcp.h b/tcp.h index 512ee76..b4e3fde 100644 --- a/tcp.h +++ b/tcp.h @@ -11,6 +11,8 @@ #define TCP_MAX_CONNS (128 * 1024) 
#define TCP_MAX_SOCKS (TCP_MAX_CONNS + USHRT_MAX * 2) +#define TCP_SOCK_POOL_SIZE 32 + struct ctx; void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events, @@ -19,7 +21,9 @@ int tcp_tap_handler(struct ctx *c, int af, void *addr, struct tap_l4_msg *msg, int count, struct timespec *now); int tcp_sock_init(struct ctx *c, struct timespec *now); void tcp_timer(struct ctx *c, struct timespec *now); -void tcp_defer_handler(struct ctx *c); +void tcp_defer_handler(struct ctx *c, struct timespec *now); + +void tcp_sock_set_bufsize(struct ctx *c, int s); void tcp_update_l2_buf(unsigned char *eth_d, unsigned char *eth_s, const uint32_t *ip_da); void tcp_remap_to_tap(in_port_t port, in_port_t delta); @@ -46,7 +50,7 @@ union tcp_epoll_ref { /** * struct tcp_ctx - Execution context for TCP routines * @hash_secret: 128-bit secret for hash functions, ISN and hash table - * @tap_conn_count: Count of tap connections in connection table + * @conn_count: Count of connections (not spliced) in connection table * @splice_conn_count: Count of spliced connections in connection table * @port_to_tap: Ports bound host-side, packets to tap or spliced * @init_detect_ports: If set, periodically detect ports bound in init @@ -60,7 +64,7 @@ union tcp_epoll_ref { */ struct tcp_ctx { uint64_t hash_secret[2]; - int tap_conn_count; + int conn_count; int splice_conn_count; uint8_t port_to_tap [USHRT_MAX / 8]; int init_detect_ports; diff --git a/tcp_splice.c b/tcp_splice.c new file mode 100644 index 0000000..cb8df7b --- /dev/null +++ b/tcp_splice.c @@ -0,0 +1,859 @@ +// SPDX-License-Identifier: AGPL-3.0-or-later + +/* PASTA - Pack A Subtle Tap Abstraction + * for network namespace/tap device mode + * + * tcp_splice.c - direct namespace forwarding for local connections + * + * Copyright (c) 2020-2022 Red Hat GmbH + * Author: Stefano Brivio <sbrivio(a)redhat.com> + */ + +/** + * DOC: Theory of Operation + * + * + * For local traffic directed to TCP ports configured for direct mapping 
between
+ namespaces, packets are directly translated between L4 sockets using a pair
+ of splice() syscalls. These connections are tracked in the @tc array of
+ struct tcp_splice_conn, using these events:
+ *
+ * - SPLICE_CONNECT:		connection accepted, connecting to target
+ * - SPLICE_ESTABLISHED:	connection to target established
+ * - SPLICE_A_OUT_WAIT:	pipe to accepted socket full, wait for EPOLLOUT
+ * - SPLICE_B_OUT_WAIT:	pipe to target socket full, wait for EPOLLOUT
+ * - SPLICE_A_FIN_RCVD:	FIN (EPOLLRDHUP) seen from accepted socket
+ * - SPLICE_B_FIN_RCVD:	FIN (EPOLLRDHUP) seen from target socket
+ * - SPLICE_A_FIN_SENT:	FIN (write shutdown) sent to accepted socket
+ * - SPLICE_B_FIN_SENT:	FIN (write shutdown) sent to target socket
+ *
+ * #syscalls:pasta pipe2|pipe fcntl armv6l:fcntl64 armv7l:fcntl64 ppc64:fcntl64
+ */
+
+#include <sched.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <stdint.h>
+#include <string.h>
+#include <time.h>
+#include <unistd.h>
+#include <net/ethernet.h>
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+#include <sys/epoll.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+
+#include "util.h"
+#include "passt.h"
+
+#define MAX_PIPE_SIZE			(2UL * 1024 * 1024)
+#define TCP_SPLICE_MAX_CONNS		(128 * 1024)
+#define TCP_SPLICE_PIPE_POOL_SIZE	16
+#define REFILL_INTERVAL			1000	/* ms, refill pool of pipes */
+
+/* From tcp.c */
+extern int init_sock_pool4	[TCP_SOCK_POOL_SIZE];
+extern int init_sock_pool6	[TCP_SOCK_POOL_SIZE];
+extern int ns_sock_pool4	[TCP_SOCK_POOL_SIZE];
+extern int ns_sock_pool6	[TCP_SOCK_POOL_SIZE];
+
+/* Pool of pre-opened pipes */
+static int splice_pipe_pool	[TCP_SPLICE_PIPE_POOL_SIZE][2][2];
+
+/**
+ * struct tcp_splice_conn - Descriptor for a spliced TCP connection
+ * @a:			File descriptor number of socket for accepted connection
+ * @pipe_a_b:		Pipe ends for splice() from @a to @b
+ * @b:			File descriptor number of peer connected socket
+ * @pipe_b_a:		Pipe ends for splice() from @b 
to @a + * @events: Events observed/actions performed on connection + * @flags: Connection flags (attributes, not events) + * @a_read: Bytes read from @a (not fully written to @b in one shot) + * @a_written: Bytes written to @a (not fully written from one @b read) + * @b_read: Bytes read from @b (not fully written to @a in one shot) + * @b_written: Bytes written to @b (not fully written from one @a read) +*/ +struct tcp_splice_conn { + int a; + int pipe_a_b[2]; + int b; + int pipe_b_a[2]; + + uint8_t events; +#define SPLICE_CLOSED 0 +#define SPLICE_CONNECT BIT(0) +#define SPLICE_ESTABLISHED BIT(1) +#define SPLICE_A_OUT_WAIT BIT(2) +#define SPLICE_B_OUT_WAIT BIT(3) +#define SPLICE_A_FIN_RCVD BIT(4) +#define SPLICE_B_FIN_RCVD BIT(5) +#define SPLICE_A_FIN_SENT BIT(6) +#define SPLICE_B_FIN_SENT BIT(7) + + uint8_t flags; +#define SPLICE_V6 BIT(0) +#define SPLICE_IN_EPOLL BIT(1) +#define SPLICE_RCVLOWAT_SET_A BIT(2) +#define SPLICE_RCVLOWAT_SET_B BIT(3) +#define SPLICE_RCVLOWAT_ACT_A BIT(4) +#define SPLICE_RCVLOWAT_ACT_B BIT(5) +#define SPLICE_CLOSING BIT(6) + + uint64_t a_read; + uint64_t a_written; + uint64_t b_read; + uint64_t b_written; +}; + +#define CONN_V6(x) (x->flags & SPLICE_V6) +#define CONN_V4(x) (!CONN_V6(x)) +#define CONN_HAS(conn, set) ((conn->events & (set)) == (set)) +#define CONN(index) (tc + (index)) + +/* Spliced connections */ +static struct tcp_splice_conn tc[TCP_SPLICE_MAX_CONNS]; + +/* Display strings for connection events */ +static const char *tcp_splice_event_str[] __attribute((__unused__)) = { + "SPLICE_CONNECT", "SPLICE_ESTABLISHED", + "SPLICE_A_OUT_WAIT", "SPLICE_B_OUT_WAIT", + "SPLICE_A_FIN_RCVD", "SPLICE_B_FIN_RCVD", + "SPLICE_A_FIN_SENT", "SPLICE_B_FIN_SENT", +}; + +/* Display strings for connection flags */ +static const char *tcp_splice_flag_str[] __attribute((__unused__)) = { + "V6", "IN_EPOLL", "RCVLOWAT_SET_A", "RCVLOWAT_SET_B", + "RCVLOWAT_ACT_A", "RCVLOWAT_ACT_B", "CLOSING", +}; + +/** + * tcp_splice_conn_epoll_events() - epoll 
events masks for given state + * @events: Connection event flags + * @a: Event mask for socket with accepted connection, set on return + * @b: Event mask for connection target socket, set on return + */ +static void tcp_splice_conn_epoll_events(uint16_t events, + uint32_t *a, uint32_t *b) +{ + *a = *b = 0; + + if (events & SPLICE_CLOSED) + return; + + if (events & SPLICE_ESTABLISHED) + *a = *b = EPOLLIN | EPOLLRDHUP; + else if (events & SPLICE_CONNECT) + *b = EPOLLOUT; + + *a |= (events & SPLICE_A_OUT_WAIT) ? EPOLLOUT : 0; + *b |= (events & SPLICE_B_OUT_WAIT) ? EPOLLOUT : 0; +} + +static int tcp_splice_epoll_ctl(struct ctx *c, struct tcp_splice_conn *conn); + +/** + * conn_flag_do() - Set/unset given flag, log, update epoll on SPLICE_CLOSING + * @c: Execution context + * @conn: Connection pointer + * @flag: Flag to set, or ~flag to unset + */ +static void conn_flag_do(struct ctx *c, struct tcp_splice_conn *conn, + unsigned long flag) +{ + if (flag & (flag - 1)) { + if (!(conn->flags & ~flag)) + return; + + conn->flags &= flag; + debug("TCP (spliced): index %i: %s dropped", (conn) - tc, + tcp_splice_flag_str[fls(~flag)]); + } else { + if (conn->flags & flag) + return; + + conn->flags |= flag; + debug("TCP (spliced): index %i: %s", (conn) - tc, + tcp_splice_flag_str[fls(flag)]); + } + + if (flag == SPLICE_CLOSING) + tcp_splice_epoll_ctl(c, conn); +} + +#define conn_flag(c, conn, flag) \ + do { \ + trace("TCP (spliced): flag at %s:%i", \ + __func__, __LINE__); \ + conn_flag_do(c, conn, flag); \ + } while (0) + +/** + * tcp_splice_epoll_ctl() - Add/modify/delete epoll state from connection events + * @c: Execution context + * @conn: Connection pointer + * + * Return: 0 on success, negative error code on failure (not on deletion) + */ +static int tcp_splice_epoll_ctl(struct ctx *c, struct tcp_splice_conn *conn) +{ + int m = (conn->flags & SPLICE_IN_EPOLL) ? 
EPOLL_CTL_MOD : EPOLL_CTL_ADD; + union epoll_ref ref_a = { .r.proto = IPPROTO_TCP, .r.s = conn->a, + .r.p.tcp.tcp.splice = 1, + .r.p.tcp.tcp.index = conn - tc, + .r.p.tcp.tcp.v6 = CONN_V6(conn) }; + union epoll_ref ref_b = { .r.proto = IPPROTO_TCP, .r.s = conn->b, + .r.p.tcp.tcp.splice = 1, + .r.p.tcp.tcp.index = conn - tc, + .r.p.tcp.tcp.v6 = CONN_V6(conn) }; + struct epoll_event ev_a = { .data.u64 = ref_a.u64 }; + struct epoll_event ev_b = { .data.u64 = ref_b.u64 }; + uint32_t events_a, events_b; + + if (conn->flags & SPLICE_CLOSING) { + if (conn->flags & SPLICE_IN_EPOLL) + epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->a, &ev_a); + + if (conn->events & SPLICE_CONNECT) + epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->b, &ev_b); + + return 0; + } + + tcp_splice_conn_epoll_events(conn->events, &events_a, &events_b); + ev_a.events = events_a; + ev_b.events = events_b; + + if (epoll_ctl(c->epollfd, m, conn->a, &ev_a) || + epoll_ctl(c->epollfd, m, conn->b, &ev_b)) + goto err; + + conn->flags |= SPLICE_IN_EPOLL; /* No need to log this */ + + return 0; + +err: + epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->a, &ev_a); + epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->b, &ev_b); + return -errno; +} + +/** + * conn_event_do() - Set and log connection events, update epoll state + * @c: Execution context + * @conn: Connection pointer + * @event: Connection event + */ +static void conn_event_do(struct ctx *c, struct tcp_splice_conn *conn, + unsigned long event) +{ + if (event == SPLICE_CLOSED) { + conn->events = SPLICE_CLOSED; + debug("TCP (spliced): index %i, CLOSED", conn - tc); + return; + } + + if (event & (event - 1)) { + if (!(conn->events & ~event)) + return; + + conn->events &= event; + debug("TCP (spliced): index %i, ~%s", conn - tc, + tcp_splice_event_str[fls(~event)]); + } else { + if (conn->events & event) + return; + + conn->events |= event; + debug("TCP (spliced): index %i, %s", conn - tc, + tcp_splice_event_str[fls(event)]); + } + + if (tcp_splice_epoll_ctl(c, conn)) + 
conn_flag(c, conn, SPLICE_CLOSING); +} + +#define conn_event(c, conn, event) \ + do { \ + trace("TCP (spliced): event at %s:%i", \ + __func__, __LINE__); \ + conn_event_do(c, conn, event); \ + } while (0) + +/** + * tcp_table_splice_compact - Compact spliced connection table + * @c: Execution context + * @hole: Pointer to recently closed connection + */ +static void tcp_table_splice_compact(struct ctx *c, + struct tcp_splice_conn *hole) +{ + struct tcp_splice_conn *move; + + if ((hole - tc) == --c->tcp.splice_conn_count) { + debug("TCP (spliced): index %i (max) removed", hole - tc); + return; + } + + move = CONN(c->tcp.splice_conn_count); + + memcpy(hole, move, sizeof(*hole)); + + move->a = move->b = -1; + move->flags = move->events = 0; + move->a_read = move->a_written = move->b_read = move->b_written = 0; + + debug("TCP (spliced): index %i moved to %i", move - tc, hole - tc); + if (tcp_splice_epoll_ctl(c, hole)) + conn_flag(c, hole, SPLICE_CLOSING); +} + +/** + * tcp_splice_destroy() - Close spliced connection and pipes, drop from epoll + * @c: Execution context + * @conn: Connection pointer + */ +static void tcp_splice_destroy(struct ctx *c, struct tcp_splice_conn *conn) +{ + if (conn->events & SPLICE_ESTABLISHED) { + /* Flushing might need to block: don't recycle them. 
*/ + if (conn->pipe_a_b[0] != -1) { + close(conn->pipe_a_b[0]); + close(conn->pipe_a_b[1]); + conn->pipe_a_b[0] = conn->pipe_a_b[1] = -1; + } + if (conn->pipe_b_a[0] != -1) { + close(conn->pipe_b_a[0]); + close(conn->pipe_b_a[1]); + conn->pipe_b_a[0] = conn->pipe_b_a[1] = -1; + } + } + + if (conn->events & SPLICE_CONNECT) { + close(conn->b); + conn->b = -1; + } + + conn_event(c, conn, SPLICE_CLOSED); + + close(conn->a); + conn->a = -1; + conn->flags = 0; + conn->a_read = conn->a_written = conn->b_read = conn->b_written = 0; + + tcp_table_splice_compact(c, conn); +} + +/** + * tcp_splice_connect_finish() - Completion of connect() or call on success + * @c: Execution context + * @conn: Connection pointer + * + * Return: 0 on success, -EIO on failure + */ +static int tcp_splice_connect_finish(struct ctx *c, + struct tcp_splice_conn *conn) +{ + int i; + + conn->pipe_a_b[0] = conn->pipe_b_a[0] = -1; + conn->pipe_a_b[1] = conn->pipe_b_a[1] = -1; + + for (i = 0; i < TCP_SPLICE_PIPE_POOL_SIZE; i++) { + if (splice_pipe_pool[i][0][0] > 0) { + SWAP(conn->pipe_a_b[0], splice_pipe_pool[i][0][0]); + SWAP(conn->pipe_a_b[1], splice_pipe_pool[i][0][1]); + + SWAP(conn->pipe_b_a[0], splice_pipe_pool[i][1][0]); + SWAP(conn->pipe_b_a[1], splice_pipe_pool[i][1][1]); + break; + } + } + + if (conn->pipe_a_b[0] < 0) { + if (pipe2(conn->pipe_a_b, O_NONBLOCK) || + pipe2(conn->pipe_b_a, O_NONBLOCK)) { + conn_flag(c, conn, SPLICE_CLOSING); + return -EIO; + } + + fcntl(conn->pipe_a_b[0], F_SETPIPE_SZ, c->tcp.pipe_size); + fcntl(conn->pipe_b_a[0], F_SETPIPE_SZ, c->tcp.pipe_size); + } + + if (!(conn->events & SPLICE_ESTABLISHED)) + conn_event(c, conn, SPLICE_ESTABLISHED); + + return 0; +} + +/** + * tcp_splice_connect() - Create and connect socket for new spliced connection + * @c: Execution context + * @conn: Connection pointer + * @s: Accepted socket + * @port: Destination port, host order + * + * Return: 0 for connect() succeeded or in progress, negative value on error + */ +static int 
tcp_splice_connect(struct ctx *c, struct tcp_splice_conn *conn, + int s, in_port_t port) +{ + int sock_conn = (s >= 0) ? s : socket(CONN_V6(conn) ? AF_INET6 : + AF_INET, + SOCK_STREAM | SOCK_NONBLOCK, + IPPROTO_TCP); + struct sockaddr_in6 addr6 = { + .sin6_family = AF_INET6, + .sin6_port = htons(port), + .sin6_addr = IN6ADDR_LOOPBACK_INIT, + }; + struct sockaddr_in addr4 = { + .sin_family = AF_INET, + .sin_port = htons(port), + .sin_addr = { .s_addr = htonl(INADDR_LOOPBACK) }, + }; + const struct sockaddr *sa; + socklen_t sl; + + conn->b = sock_conn; + + if (s < 0) + tcp_sock_set_bufsize(c, conn->b); + + setsockopt(conn->b, SOL_TCP, TCP_QUICKACK, &((int){ 1 }), sizeof(int)); + + if (CONN_V6(conn)) { + sa = (struct sockaddr *)&addr6; + sl = sizeof(addr6); + } else { + sa = (struct sockaddr *)&addr4; + sl = sizeof(addr4); + } + + if (connect(conn->b, sa, sl)) { + if (errno != EINPROGRESS) { + int ret = -errno; + + close(sock_conn); + return ret; + } + conn_event(c, conn, SPLICE_CONNECT); + } else { + conn_event(c, conn, SPLICE_ESTABLISHED); + return tcp_splice_connect_finish(c, conn); + } + + return 0; +} + +/** + * struct tcp_splice_connect_ns_arg - Arguments for tcp_splice_connect_ns() + * @c: Execution context + * @conn: Accepted inbound connection + * @port: Destination port, host order + * @ret: Return value of tcp_splice_connect_ns() + */ +struct tcp_splice_connect_ns_arg { + struct ctx *c; + struct tcp_splice_conn *conn; + in_port_t port; + int ret; +}; + +/** + * tcp_splice_connect_ns() - Enter namespace and call tcp_splice_connect() + * @arg: See struct tcp_splice_connect_ns_arg + * + * Return: 0 + */ +static int tcp_splice_connect_ns(void *arg) +{ + struct tcp_splice_connect_ns_arg *a; + + a = (struct tcp_splice_connect_ns_arg *)arg; + ns_enter(a->c); + a->ret = tcp_splice_connect(a->c, a->conn, -1, a->port); + return 0; +} + +/** + * tcp_splice_new() - Handle new inbound, spliced connection + * @c: Execution context + * @conn: Connection pointer + * @port: 
Destination port, host order + * + * Return: return code from connect() + */ +static int tcp_splice_new(struct ctx *c, struct tcp_splice_conn *conn, + in_port_t port) +{ + struct tcp_splice_connect_ns_arg ns_arg = { c, conn, port, 0 }; + int *sock_pool_p, i, s = -1; + + if (bitmap_isset(c->tcp.port_to_tap, port)) + sock_pool_p = CONN_V6(conn) ? ns_sock_pool6 : ns_sock_pool4; + else + sock_pool_p = CONN_V6(conn) ? init_sock_pool6 : init_sock_pool4; + + for (i = 0; i < TCP_SOCK_POOL_SIZE; i++, sock_pool_p++) { + if ((s = *sock_pool_p) >= 0) { + *sock_pool_p = -1; + break; + } + } + + if (s < 0 && bitmap_isset(c->tcp.port_to_tap, port)) { + NS_CALL(tcp_splice_connect_ns, &ns_arg); + return ns_arg.ret; + } + + return tcp_splice_connect(c, conn, s, port); +} + +/** + * tcp_splice_dir() - Set sockets/pipe pointers reflecting flow direction + * @conn: Connection pointers + * @ref_sock: Socket returned as reference from epoll + * @reverse: Reverse direction: @ref_sock is used as destination + * @from: Destination socket pointer to set + * @to: Source socket pointer to set + * @pipes: Pipe set, assigned on return + */ +static void tcp_splice_dir(struct tcp_splice_conn *conn, int ref_sock, + int reverse, int *from, int *to, int **pipes) +{ + if (!reverse) { + *from = ref_sock; + *to = (*from == conn->a) ? conn->b : conn->a; + } else { + *to = ref_sock; + *from = (*to == conn->a) ? conn->b : conn->a; + } + + *pipes = *from == conn->a ? 
conn->pipe_a_b : conn->pipe_b_a; +} + +/** + * tcp_sock_handler_splice() - Handler for socket mapped to spliced connection + * @c: Execution context + * @ref: epoll reference + * @events: epoll events bitmap + * + * #syscalls:pasta splice + */ +void tcp_sock_handler_splice(struct ctx *c, union epoll_ref ref, + uint32_t events) +{ + uint8_t lowat_set_flag, lowat_act_flag; + int from, to, *pipes, eof, never_read; + uint64_t *seq_read, *seq_write; + struct tcp_splice_conn *conn; + + if (ref.r.p.tcp.tcp.listen) { + int s; + + if (c->tcp.splice_conn_count >= TCP_SPLICE_MAX_CONNS) + return; + + if ((s = accept4(ref.r.s, NULL, NULL, SOCK_NONBLOCK)) < 0) + return; + + setsockopt(s, SOL_TCP, TCP_QUICKACK, &((int){ 1 }), + sizeof(int)); + + conn = CONN(c->tcp.splice_conn_count++); + conn->a = s; + conn->flags = ref.r.p.tcp.tcp.v6 ? SPLICE_V6 : 0; + + if (tcp_splice_new(c, conn, ref.r.p.tcp.tcp.index)) + conn_flag(c, conn, SPLICE_CLOSING); + + return; + } + + conn = CONN(ref.r.p.tcp.tcp.index); + + if (events & EPOLLERR || events & EPOLLHUP) + goto close; + + if (conn->events == SPLICE_CONNECT) { + if (!(events & EPOLLOUT)) + goto close; + if (tcp_splice_connect_finish(c, conn)) + goto close; + } + + if (events & EPOLLOUT) { + if (ref.r.s == conn->a) + conn_event(c, conn, ~SPLICE_A_OUT_WAIT); + else + conn_event(c, conn, ~SPLICE_B_OUT_WAIT); + + tcp_splice_dir(conn, ref.r.s, 1, &from, &to, &pipes); + } else { + tcp_splice_dir(conn, ref.r.s, 0, &from, &to, &pipes); + } + + if (events & EPOLLRDHUP) { + if (ref.r.s == conn->a) + conn_event(c, conn, SPLICE_A_FIN_RCVD); + else + conn_event(c, conn, SPLICE_B_FIN_RCVD); + } + +swap: + eof = 0; + never_read = 1; + + if (from == conn->a) { + seq_read = &conn->a_read; + seq_write = &conn->a_written; + lowat_set_flag = SPLICE_RCVLOWAT_SET_A; + lowat_act_flag = SPLICE_RCVLOWAT_ACT_A; + } else { + seq_read = &conn->b_read; + seq_write = &conn->b_written; + lowat_set_flag = SPLICE_RCVLOWAT_SET_B; + lowat_act_flag = SPLICE_RCVLOWAT_ACT_B; + 
} + + while (1) { + int retry_write = 0, more = 0; + ssize_t readlen, to_write = 0, written; + +retry: + readlen = splice(from, NULL, pipes[1], NULL, c->tcp.pipe_size, + SPLICE_F_MOVE | SPLICE_F_NONBLOCK); + if (readlen < 0) { + if (errno == EINTR) + goto retry; + + if (errno != EAGAIN) + goto close; + + to_write = c->tcp.pipe_size; + } else if (!readlen) { + eof = 1; + to_write = c->tcp.pipe_size; + } else { + never_read = 0; + to_write += readlen; + if (readlen >= (long)c->tcp.pipe_size * 90 / 100) + more = SPLICE_F_MORE; + + if (conn->flags & lowat_set_flag) + conn_flag(c, conn, lowat_act_flag); + } + +eintr: + written = splice(pipes[0], NULL, to, NULL, to_write, + SPLICE_F_MOVE | more | SPLICE_F_NONBLOCK); + + /* Most common case: skip updating counters. */ + if (readlen > 0 && readlen == written) { + if (readlen >= (long)c->tcp.pipe_size * 10 / 100) + continue; + + if (conn->flags & lowat_set_flag && + readlen > (long)c->tcp.pipe_size / 10) { + int lowat = c->tcp.pipe_size / 4; + + setsockopt(from, SOL_SOCKET, SO_RCVLOWAT, + &lowat, sizeof(lowat)); + + conn_flag(c, conn, lowat_set_flag); + conn_flag(c, conn, lowat_act_flag); + } + + break; + } + + *seq_read += readlen > 0 ? readlen : 0; + *seq_write += written > 0 ? 
written : 0; + + if (written < 0) { + if (errno == EINTR) + goto eintr; + + if (errno != EAGAIN) + goto close; + + if (never_read) + break; + + if (retry_write--) + goto retry; + + if (to == conn->a) + conn_event(c, conn, SPLICE_A_OUT_WAIT); + else + conn_event(c, conn, SPLICE_B_OUT_WAIT); + break; + } + + if (never_read && written == (long)(c->tcp.pipe_size)) + goto retry; + + if (!never_read && written < to_write) { + to_write -= written; + goto retry; + } + + if (eof) + break; + } + + if ( (conn->events & SPLICE_A_FIN_RCVD) && + !(conn->events & SPLICE_B_FIN_SENT)) { + if (*seq_read == *seq_write) { + shutdown(conn->b, SHUT_WR); + conn_event(c, conn, SPLICE_B_FIN_SENT); + } + } + + if ( (conn->events & SPLICE_B_FIN_RCVD) && + !(conn->events & SPLICE_A_FIN_SENT)) { + if (*seq_read == *seq_write) { + shutdown(conn->a, SHUT_WR); + conn_event(c, conn, SPLICE_A_FIN_SENT); + } + } + + if (CONN_HAS(conn, SPLICE_A_FIN_SENT | SPLICE_B_FIN_SENT)) + goto close; + + if ((events & (EPOLLIN | EPOLLOUT)) == (EPOLLIN | EPOLLOUT)) { + events = EPOLLIN; + + SWAP(from, to); + if (pipes == conn->pipe_a_b) + pipes = conn->pipe_b_a; + else + pipes = conn->pipe_a_b; + + goto swap; + } + + return; + +close: + conn_flag(c, conn, SPLICE_CLOSING); +} + +/** + * tcp_set_pipe_size() - Set usable pipe size, probe starting from MAX_PIPE_SIZE + * @c: Execution context + */ +static void tcp_set_pipe_size(struct ctx *c) +{ + int probe_pipe[TCP_SPLICE_PIPE_POOL_SIZE * 2][2], i, j; + + c->tcp.pipe_size = MAX_PIPE_SIZE; + +smaller: + for (i = 0; i < TCP_SPLICE_PIPE_POOL_SIZE * 2; i++) { + if (pipe2(probe_pipe[i], 0)) { + i++; + break; + } + + if (fcntl(probe_pipe[i][0], F_SETPIPE_SZ, c->tcp.pipe_size) < 0) + break; + } + + for (j = i - 1; j >= 0; j--) { + close(probe_pipe[j][0]); + close(probe_pipe[j][1]); + } + + if (i == TCP_SPLICE_PIPE_POOL_SIZE * 2) + return; + + if (!(c->tcp.pipe_size /= 2)) { + c->tcp.pipe_size = MAX_PIPE_SIZE; + return; + } + + goto smaller; +} + +/** + * 
tcp_splice_pipe_refill() - Refill pool of pre-opened pipes + * @c: Execution context + */ +static void tcp_splice_pipe_refill(struct ctx *c) +{ + int i; + + for (i = 0; i < TCP_SPLICE_PIPE_POOL_SIZE; i++) { + if (splice_pipe_pool[i][0][0] >= 0) + break; + if (pipe2(splice_pipe_pool[i][0], O_NONBLOCK)) + continue; + if (pipe2(splice_pipe_pool[i][1], O_NONBLOCK)) { + close(splice_pipe_pool[i][1][0]); + close(splice_pipe_pool[i][1][1]); + continue; + } + + fcntl(splice_pipe_pool[i][0][0], F_SETPIPE_SZ, + c->tcp.pipe_size); + fcntl(splice_pipe_pool[i][1][0], F_SETPIPE_SZ, + c->tcp.pipe_size); + } +} + +/** + * tcp_splice_init() - Initialise pipe pool and size + * @c: Execution context + */ +void tcp_splice_init(struct ctx *c) +{ + memset(splice_pipe_pool, 0xff, sizeof(splice_pipe_pool)); + tcp_set_pipe_size(c); + tcp_splice_pipe_refill(c); +} + +/** + * tcp_splice_timer() - Timer for spliced connections + * @c: Execution context + * @now: Current timestamp + */ +void tcp_splice_timer(struct ctx *c, struct timespec *now) +{ + int i; + + for (i = c->tcp.splice_conn_count - 1; i >= 0; i--) { + struct tcp_splice_conn *conn; + + conn = CONN(i); + + if (conn->flags & SPLICE_CLOSING) { + tcp_splice_destroy(c, conn); + continue; + } + + if ( (conn->flags & SPLICE_RCVLOWAT_SET_A) && + !(conn->flags & SPLICE_RCVLOWAT_ACT_A)) { + setsockopt(conn->a, SOL_SOCKET, SO_RCVLOWAT, + &((int){ 1 }), sizeof(int)); + conn_flag(c, conn, ~SPLICE_RCVLOWAT_SET_A); + } + + if ( (conn->flags & SPLICE_RCVLOWAT_SET_B) && + !(conn->flags & SPLICE_RCVLOWAT_ACT_B)) { + setsockopt(conn->b, SOL_SOCKET, SO_RCVLOWAT, + &((int){ 1 }), sizeof(int)); + conn_flag(c, conn, ~SPLICE_RCVLOWAT_SET_B); + } + + conn_flag(c, conn, ~SPLICE_RCVLOWAT_ACT_A); + conn_flag(c, conn, ~SPLICE_RCVLOWAT_ACT_B); + } + + if (timespec_diff_ms(now, &c->tcp.refill_ts) > REFILL_INTERVAL) + tcp_splice_pipe_refill(c); +} diff --git a/tcp_splice.h b/tcp_splice.h new file mode 100644 index 0000000..45ab1ca --- /dev/null +++ 
b/tcp_splice.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: AGPL-3.0-or-later + * Copyright (c) 2022 Red Hat GmbH + * Author: Stefano Brivio <sbrivio(a)redhat.com> + */ + +#define TCP_SPLICE_MAX_CONNS (128 * 1024) + +struct tcp_splice_conn; + +void tcp_sock_handler_splice(struct ctx *c, union epoll_ref ref, + uint32_t events); +void tcp_splice_destroy(struct ctx *c, struct tcp_splice_conn *conn); +void tcp_splice_init(struct ctx *c); +void tcp_splice_timer(struct ctx *c, struct timespec *now); diff --git a/util.c b/util.c index 50b83db..2d8952a 100644 --- a/util.c +++ b/util.c @@ -589,3 +589,22 @@ int __daemon(int pidfile_fd, int devnull_fd) return 0; } + +/** + * fls() - Find last (most significant) bit set in word + * @x: Word + * + * Return: position of most significant bit set, starting from 0, -1 if none + */ +int fls(unsigned long x) +{ + int y = 0; + + if (!x) + return -1; + + while (x >>= 1) + y++; + + return y; +} diff --git a/util.h b/util.h index e314c71..3073f58 100644 --- a/util.h +++ b/util.h @@ -37,7 +37,8 @@ void trace_init(int enable); #define ROUND_DOWN(x, y) ((x) & ~((y) - 1)) #define ROUND_UP(x, y) (((x) + (y) - 1) & ~((y) - 1)) -#define BITMAP_BIT(n) (1UL << (n) % (sizeof(long) * 8)) +#define BIT(n) (1UL << (n)) +#define BITMAP_BIT(n) (BIT((n) % (sizeof(long) * 8))) #define BITMAP_WORD(n) (n / (sizeof(long) * 8)) #define SWAP(a, b) \ @@ -208,3 +209,4 @@ void drop_caps(void); int ns_enter(struct ctx *c); void write_pidfile(int fd, pid_t pid); int __daemon(int pidfile_fd, int devnull_fd); +int fls(unsigned long x); -- 2.35.1
...so that they can be indexed. Signed-off-by: Stefano Brivio <sbrivio(a)redhat.com> --- test/lib/video | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/lib/video b/test/lib/video index ec79c85..70a6359 100755 --- a/test/lib/video +++ b/test/lib/video @@ -141,7 +141,7 @@ video_time_now() { video_link() { [ ${VIDEO_LINKS_COUNT} -eq 0 ] && __sep="" || __sep=" |" __id="video_link_${VIDEO_LINKS_COUNT}" - video_append_links "${__sep} <a id=\"${__id}\">${1}</a>" + video_append_links "${__sep} <a id=\"${__id}\" href=\"${1}\">${1}</a>" video_append_links_js "[ '${__id}', $(($(video_time_now) - 1)) ]," VIDEO_LINKS_COUNT=$((VIDEO_LINKS_COUNT + 1)) -- 2.35.1
It's already implied by the fact they don't have "l2" in their names, and dropping it improves readability a bit. Signed-off-by: Stefano Brivio <sbrivio(a)redhat.com> --- udp.c | 52 +++++++++++++++++++++++----------------------------- 1 file changed, 23 insertions(+), 29 deletions(-) diff --git a/udp.c b/udp.c index ccce005..ce536a6 100644 --- a/udp.c +++ b/udp.c @@ -254,14 +254,14 @@ static struct mmsghdr udp4_l2_mh_tap [UDP_TAP_FRAMES_MEM]; static struct mmsghdr udp6_l2_mh_tap [UDP_TAP_FRAMES_MEM]; /* recvmmsg()/sendmmsg() data for "spliced" connections */ -static struct iovec udp_splice_iov_recv [UDP_SPLICE_FRAMES]; -static struct mmsghdr udp_splice_mmh_recv [UDP_SPLICE_FRAMES]; +static struct iovec udp_iov_recv [UDP_SPLICE_FRAMES]; +static struct mmsghdr udp_mmh_recv [UDP_SPLICE_FRAMES]; -static struct iovec udp_splice_iov_send [UDP_SPLICE_FRAMES]; -static struct mmsghdr udp_splice_mmh_send [UDP_SPLICE_FRAMES]; +static struct iovec udp_iov_send [UDP_SPLICE_FRAMES]; +static struct mmsghdr udp_mmh_send [UDP_SPLICE_FRAMES]; -static struct iovec udp_splice_iov_sendto [UDP_SPLICE_FRAMES]; -static struct mmsghdr udp_splice_mmh_sendto [UDP_SPLICE_FRAMES]; +static struct iovec udp_iov_sendto [UDP_SPLICE_FRAMES]; +static struct mmsghdr udp_mmh_sendto [UDP_SPLICE_FRAMES]; /** * udp_remap_to_tap() - Set delta for port translation to/from guest/tap @@ -552,14 +552,14 @@ static void udp_sock_handler_splice(struct ctx *c, union epoll_ref ref, uint32_t events, struct timespec *now) { in_port_t src, dst = ref.r.p.udp.udp.port, send_dst = 0; - struct msghdr *mh = &udp_splice_mmh_recv[0].msg_hdr; + struct msghdr *mh = &udp_mmh_recv[0].msg_hdr; struct sockaddr_storage *sa_s = mh->msg_name; int s, v6 = ref.r.p.udp.udp.v6, n, i; if (!(events & EPOLLIN)) return; - n = recvmmsg(ref.r.s, udp_splice_mmh_recv, UDP_SPLICE_FRAMES, 0, NULL); + n = recvmmsg(ref.r.s, udp_mmh_recv, UDP_SPLICE_FRAMES, 0, NULL); if (n <= 0) return; @@ -619,19 +619,19 @@ static void 
udp_sock_handler_splice(struct ctx *c, union epoll_ref ref, if (ref.r.p.udp.udp.splice == UDP_TO_NS || ref.r.p.udp.udp.splice == UDP_TO_INIT) { for (i = 0; i < n; i++) { - struct msghdr *mh_s = &udp_splice_mmh_send[i].msg_hdr; + struct msghdr *mh_s = &udp_mmh_send[i].msg_hdr; - mh_s->msg_iov->iov_len = udp_splice_mmh_recv[i].msg_len; + mh_s->msg_iov->iov_len = udp_mmh_recv[i].msg_len; } - sendmmsg(s, udp_splice_mmh_send, n, MSG_NOSIGNAL); + sendmmsg(s, udp_mmh_send, n, MSG_NOSIGNAL); return; } for (i = 0; i < n; i++) { - struct msghdr *mh_s = &udp_splice_mmh_sendto[i].msg_hdr; + struct msghdr *mh_s = &udp_mmh_sendto[i].msg_hdr; - mh_s->msg_iov->iov_len = udp_splice_mmh_recv[i].msg_len; + mh_s->msg_iov->iov_len = udp_mmh_recv[i].msg_len; } if (v6) { @@ -652,7 +652,7 @@ static void udp_sock_handler_splice(struct ctx *c, union epoll_ref ref, }); } - sendmmsg(s, udp_splice_mmh_sendto, n, MSG_NOSIGNAL); + sendmmsg(s, udp_mmh_sendto, n, MSG_NOSIGNAL); } /** @@ -1097,7 +1097,7 @@ static void udp_splice_iov_init(void) struct iovec *iov; int i; - for (i = 0, h = udp_splice_mmh_recv; i < UDP_SPLICE_FRAMES; i++, h++) { + for (i = 0, h = udp_mmh_recv; i < UDP_SPLICE_FRAMES; i++, h++) { struct msghdr *mh = &h->msg_hdr; if (!i) { @@ -1105,40 +1105,34 @@ static void udp_splice_iov_init(void) mh->msg_namelen = sizeof(udp_splice_namebuf); } - mh->msg_iov = &udp_splice_iov_recv[i]; + mh->msg_iov = &udp_iov_recv[i]; mh->msg_iovlen = 1; } - for (i = 0, iov = udp_splice_iov_recv; i < UDP_SPLICE_FRAMES; - i++, iov++) { + for (i = 0, iov = udp_iov_recv; i < UDP_SPLICE_FRAMES; i++, iov++) { iov->iov_base = udp_splice_buf[i]; iov->iov_len = sizeof(udp_splice_buf[i]); } - for (i = 0, h = udp_splice_mmh_send; i < UDP_SPLICE_FRAMES; i++, h++) { + for (i = 0, h = udp_mmh_send; i < UDP_SPLICE_FRAMES; i++, h++) { struct msghdr *mh = &h->msg_hdr; - mh->msg_iov = &udp_splice_iov_send[i]; + mh->msg_iov = &udp_iov_send[i]; mh->msg_iovlen = 1; } - for (i = 0, iov = udp_splice_iov_send; i < 
UDP_SPLICE_FRAMES; - i++, iov++) { + for (i = 0, iov = udp_iov_send; i < UDP_SPLICE_FRAMES; i++, iov++) iov->iov_base = udp_splice_buf[i]; - } - for (i = 0, h = udp_splice_mmh_sendto; i < UDP_SPLICE_FRAMES; - i++, h++) { + for (i = 0, h = udp_mmh_sendto; i < UDP_SPLICE_FRAMES; i++, h++) { struct msghdr *mh = &h->msg_hdr; mh->msg_name = &udp_splice_namebuf; mh->msg_namelen = sizeof(udp_splice_namebuf); - mh->msg_iov = &udp_splice_iov_sendto[i]; + mh->msg_iov = &udp_iov_sendto[i]; mh->msg_iovlen = 1; } - for (i = 0, iov = udp_splice_iov_sendto; i < UDP_SPLICE_FRAMES; - i++, iov++) { + for (i = 0, iov = udp_iov_sendto; i < UDP_SPLICE_FRAMES; i++, iov++) iov->iov_base = udp_splice_buf[i]; - } } /** -- 2.35.1
...it became too hard to follow: split it off to udp_sock_fill_data_v{4,6}. While at it, use IN6_ARE_ADDR_EQUAL(a, b), courtesy of netinet/in.h, instead of open-coded memcmp(). Signed-off-by: Stefano Brivio <sbrivio(a)redhat.com> --- udp.c | 364 +++++++++++++++++++++++++++++++--------------------- 1 file changed, 193 insertions(+), 171 deletions(-) diff --git a/udp.c b/udp.c index ce536a6..ebbcda1 100644 --- a/udp.c +++ b/udp.c @@ -655,6 +655,177 @@ static void udp_sock_handler_splice(struct ctx *c, union epoll_ref ref, sendmmsg(s, udp_mmh_sendto, n, MSG_NOSIGNAL); } +/** + * udp_sock_fill_data_v4() - Fill and queue one buffer. In pasta mode, write it + * @c: Execution context + * @n: Index of buffer in udp4_l2_buf pool + * @ref: epoll reference from socket + * @msg_idx: Index within message being prepared (spans multiple buffers) + * @msg_len: Length of current message being prepared for sending + * @now: Current timestamp + */ +static void udp_sock_fill_data_v4(struct ctx *c, int n, union epoll_ref ref, + int *msg_idx, int *msg_bufs, ssize_t *msg_len, + struct timespec *now) +{ + struct msghdr *mh = &udp4_l2_mh_tap[*msg_idx].msg_hdr; + struct udp4_l2_buf_t *b = &udp4_l2_buf[n]; + size_t ip_len, buf_len; + in_port_t src_port; + in_addr_t src; + + ip_len = udp4_l2_mh_sock[n].msg_len + sizeof(b->iph) + sizeof(b->uh); + + b->iph.tot_len = htons(ip_len); + + src = ntohl(b->s_in.sin_addr.s_addr); + src_port = htons(b->s_in.sin_port); + + if (src >> IN_CLASSA_NSHIFT == IN_LOOPBACKNET || + src == INADDR_ANY || src == ntohl(c->addr4_seen)) { + b->iph.saddr = c->gw4; + udp_tap_map[V4][src_port].ts_local = now->tv_sec; + + if (b->s_in.sin_addr.s_addr == c->addr4_seen) + udp_tap_map[V4][src_port].loopback = 0; + else + udp_tap_map[V4][src_port].loopback = 1; + + bitmap_set(udp_act[V4][UDP_ACT_TAP], src_port); + } else if (c->dns4_fwd && + src == ntohl(c->dns4[0]) && ntohs(src_port) == 53) { + b->iph.saddr = c->dns4_fwd; + } else { + b->iph.saddr = 
b->s_in.sin_addr.s_addr; + } + + udp_update_check4(b); + b->uh.source = b->s_in.sin_port; + b->uh.dest = htons(ref.r.p.udp.udp.port); + b->uh.len = htons(udp4_l2_mh_sock[n].msg_len + sizeof(b->uh)); + + if (c->mode == MODE_PASTA) { + if (write(c->fd_tap, &b->eh, sizeof(b->eh) + ip_len) < 0) + debug("tap write: %s", strerror(errno)); + pcap((char *)&b->eh, sizeof(b->eh) + ip_len); + + return; + } + + b->vnet_len = htonl(ip_len + sizeof(struct ethhdr)); + buf_len = sizeof(uint32_t) + sizeof(struct ethhdr) + ip_len; + udp4_l2_iov_tap[n].iov_len = buf_len; + + /* With bigger messages, qemu closes the connection. */ + if (*msg_bufs && *msg_len + buf_len > SHRT_MAX) { + mh->msg_iovlen = *msg_bufs; + + (*msg_idx)++; + udp4_l2_mh_tap[*msg_idx].msg_hdr.msg_iov = &udp4_l2_iov_tap[n]; + *msg_len = *msg_bufs = 0; + } + + *msg_len += buf_len; + (*msg_bufs)++; +} + +/** + * udp_sock_fill_data_v6() - Fill and queue one buffer. In pasta mode, write it + * @c: Execution context + * @n: Index of buffer in udp6_l2_buf pool + * @ref: epoll reference from socket + * @msg_idx: Index within message being prepared (spans multiple buffers) + * @msg_len: Length of current message being prepared for sending + * @now: Current timestamp + */ +static void udp_sock_fill_data_v6(struct ctx *c, int n, union epoll_ref ref, + int *msg_idx, int *msg_bufs, ssize_t *msg_len, + struct timespec *now) +{ + struct msghdr *mh = &udp6_l2_mh_tap[*msg_idx].msg_hdr; + struct udp6_l2_buf_t *b = &udp6_l2_buf[n]; + size_t ip_len, buf_len; + struct in6_addr *src; + in_port_t src_port; + + src = &b->s_in6.sin6_addr; + src_port = ntohs(b->s_in6.sin6_port); + + ip_len = udp6_l2_mh_sock[n].msg_len + sizeof(b->ip6h) + sizeof(b->uh); + + b->ip6h.payload_len = htons(udp6_l2_mh_sock[n].msg_len + sizeof(b->uh)); + + if (IN6_IS_ADDR_LINKLOCAL(src)) { + b->ip6h.daddr = c->addr6_ll_seen; + b->ip6h.saddr = b->s_in6.sin6_addr; + } else if (IN6_IS_ADDR_LOOPBACK(src) || + IN6_ARE_ADDR_EQUAL(src, &c->addr6_seen) || + 
IN6_ARE_ADDR_EQUAL(src, &c->addr6)) { + b->ip6h.daddr = c->addr6_ll_seen; + + if (IN6_IS_ADDR_LINKLOCAL(&c->gw6)) + b->ip6h.saddr = c->gw6; + else + b->ip6h.saddr = c->addr6_ll; + + udp_tap_map[V6][src_port].ts_local = now->tv_sec; + + if (IN6_IS_ADDR_LOOPBACK(src)) + udp_tap_map[V6][src_port].loopback = 1; + else + udp_tap_map[V6][src_port].loopback = 0; + + if (IN6_ARE_ADDR_EQUAL(src, &c->addr6)) + udp_tap_map[V6][src_port].gua = 1; + else + udp_tap_map[V6][src_port].gua = 0; + + bitmap_set(udp_act[V6][UDP_ACT_TAP], src_port); + } else if (!IN6_IS_ADDR_UNSPECIFIED(&c->dns6_fwd) && + IN6_ARE_ADDR_EQUAL(src, &c->dns6_fwd) && src_port == 53) { + b->ip6h.daddr = c->addr6_seen; + b->ip6h.saddr = c->dns6_fwd; + } else { + b->ip6h.daddr = c->addr6_seen; + b->ip6h.saddr = b->s_in6.sin6_addr; + } + + b->uh.source = b->s_in6.sin6_port; + b->uh.dest = htons(ref.r.p.udp.udp.port); + b->uh.len = b->ip6h.payload_len; + + b->ip6h.hop_limit = IPPROTO_UDP; + b->ip6h.version = b->ip6h.nexthdr = b->uh.check = 0; + b->uh.check = csum(&b->ip6h, ip_len, 0); + b->ip6h.version = 6; + b->ip6h.nexthdr = IPPROTO_UDP; + b->ip6h.hop_limit = 255; + + if (c->mode == MODE_PASTA) { + if (write(c->fd_tap, &b->eh, sizeof(b->eh) + ip_len) < 0) + debug("tap write: %s", strerror(errno)); + pcap((char *)&b->eh, sizeof(b->eh) + ip_len); + + return; + } + + b->vnet_len = htonl(ip_len + sizeof(struct ethhdr)); + buf_len = sizeof(uint32_t) + sizeof(struct ethhdr) + ip_len; + udp6_l2_iov_tap[n].iov_len = buf_len; + + /* With bigger messages, qemu closes the connection. 
*/ + if (*msg_bufs && *msg_len + buf_len > SHRT_MAX) { + mh->msg_iovlen = *msg_bufs; + + (*msg_idx)++; + udp6_l2_mh_tap[*msg_idx].msg_hdr.msg_iov = &udp6_l2_iov_tap[n]; + *msg_len = *msg_bufs = 0; + } + + *msg_len += buf_len; + (*msg_bufs)++; +} + /** * udp_sock_handler() - Handle new data from socket * @c: Execution context @@ -668,10 +839,10 @@ static void udp_sock_handler_splice(struct ctx *c, union epoll_ref ref, void udp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events, struct timespec *now) { - int iov_in_msg, msg_i = 0, ret; - ssize_t n, msglen, missing = 0; + ssize_t n, msg_len = 0, missing = 0; + int msg_bufs = 0, msg_i = 0, ret; struct mmsghdr *tap_mmh; - struct msghdr *cur_mh; + struct msghdr *last_mh; unsigned int i; if (events == EPOLLERR) @@ -687,183 +858,34 @@ void udp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events, if (n <= 0) return; - cur_mh = &udp6_l2_mh_tap[msg_i].msg_hdr; - cur_mh->msg_iov = &udp6_l2_iov_tap[0]; - msg_i = msglen = iov_in_msg = 0; + udp6_l2_mh_tap[0].msg_hdr.msg_iov = &udp6_l2_iov_tap[0]; for (i = 0; i < (unsigned)n; i++) { - struct udp6_l2_buf_t *b = &udp6_l2_buf[i]; - size_t ip_len, iov_len; - - ip_len = udp6_l2_mh_sock[i].msg_len + - sizeof(b->ip6h) + sizeof(b->uh); - - b->ip6h.payload_len = htons(udp6_l2_mh_sock[i].msg_len + - sizeof(b->uh)); - - if (IN6_IS_ADDR_LINKLOCAL(&b->s_in6.sin6_addr)) { - b->ip6h.daddr = c->addr6_ll_seen; - b->ip6h.saddr = b->s_in6.sin6_addr; - } else if (IN6_IS_ADDR_LOOPBACK(&b->s_in6.sin6_addr) || - !memcmp(&b->s_in6.sin6_addr, &c->addr6_seen, - sizeof(c->addr6)) || - !memcmp(&b->s_in6.sin6_addr, &c->addr6, - sizeof(c->addr6))) { - in_port_t src = htons(b->s_in6.sin6_port); - - b->ip6h.daddr = c->addr6_ll_seen; - - if (IN6_IS_ADDR_LINKLOCAL(&c->gw6)) - b->ip6h.saddr = c->gw6; - else - b->ip6h.saddr = c->addr6_ll; - - udp_tap_map[V6][src].ts_local = now->tv_sec; - - if (IN6_IS_ADDR_LOOPBACK(&b->s_in6.sin6_addr)) - udp_tap_map[V6][src].loopback = 1; - else - 
udp_tap_map[V6][src].loopback = 0; - - if (!memcmp(&b->s_in6.sin6_addr, &c->addr6, - sizeof(c->addr6))) - udp_tap_map[V6][src].gua = 1; - else - udp_tap_map[V6][src].gua = 0; - - bitmap_set(udp_act[V6][UDP_ACT_TAP], src); - } else if (!IN6_IS_ADDR_UNSPECIFIED(&c->dns6_fwd) && - !memcmp(&b->s_in6.sin6_addr, &c->dns6_fwd, - sizeof(c->dns6_fwd)) && - ntohs(b->s_in6.sin6_port) == 53) { - b->ip6h.daddr = c->addr6_seen; - b->ip6h.saddr = c->dns6_fwd; - } else { - b->ip6h.daddr = c->addr6_seen; - b->ip6h.saddr = b->s_in6.sin6_addr; - } - - b->uh.source = b->s_in6.sin6_port; - b->uh.dest = htons(ref.r.p.udp.udp.port); - b->uh.len = b->ip6h.payload_len; - - b->ip6h.hop_limit = IPPROTO_UDP; - b->ip6h.version = 0; - b->ip6h.nexthdr = 0; - b->uh.check = 0; - b->uh.check = csum(&b->ip6h, ip_len, 0); - b->ip6h.version = 6; - b->ip6h.nexthdr = IPPROTO_UDP; - b->ip6h.hop_limit = 255; - - if (c->mode == MODE_PASTA) { - ip_len += sizeof(struct ethhdr); - if (write(c->fd_tap, &b->eh, ip_len) < 0) - debug("tap write: %s", strerror(errno)); - pcap((char *)&b->eh, ip_len); - continue; - } - - b->vnet_len = htonl(ip_len + sizeof(struct ethhdr)); - iov_len = sizeof(uint32_t) + sizeof(struct ethhdr) + - ip_len; - udp6_l2_iov_tap[i].iov_len = iov_len; - - /* With bigger messages, qemu closes the connection. 
*/ - if (iov_in_msg && msglen + iov_len > SHRT_MAX) { - cur_mh->msg_iovlen = iov_in_msg; - - cur_mh = &udp6_l2_mh_tap[++msg_i].msg_hdr; - msglen = iov_in_msg = 0; - cur_mh->msg_iov = &udp6_l2_iov_tap[i]; - } - - msglen += iov_len; - iov_in_msg++; + udp_sock_fill_data_v6(c, i, ref, + &msg_i, &msg_bufs, &msg_len, now); } + udp6_l2_mh_tap[msg_i].msg_hdr.msg_iovlen = msg_bufs; tap_mmh = udp6_l2_mh_tap; } else { n = recvmmsg(ref.r.s, udp4_l2_mh_sock, UDP_TAP_FRAMES, 0, NULL); if (n <= 0) return; - cur_mh = &udp4_l2_mh_tap[msg_i].msg_hdr; - cur_mh->msg_iov = &udp4_l2_iov_tap[0]; - msg_i = msglen = iov_in_msg = 0; + udp4_l2_mh_tap[0].msg_hdr.msg_iov = &udp4_l2_iov_tap[0]; for (i = 0; i < (unsigned)n; i++) { - struct udp4_l2_buf_t *b = &udp4_l2_buf[i]; - size_t ip_len, iov_len; - in_addr_t s_addr; - - ip_len = udp4_l2_mh_sock[i].msg_len + - sizeof(b->iph) + sizeof(b->uh); - - b->iph.tot_len = htons(ip_len); - - s_addr = ntohl(b->s_in.sin_addr.s_addr); - if (s_addr >> IN_CLASSA_NSHIFT == IN_LOOPBACKNET || - s_addr == INADDR_ANY || - s_addr == ntohl(c->addr4_seen)) { - in_port_t src = htons(b->s_in.sin_port); - - b->iph.saddr = c->gw4; - udp_tap_map[V4][src].ts_local = now->tv_sec; - - if (b->s_in.sin_addr.s_addr == c->addr4_seen) - udp_tap_map[V4][src].loopback = 0; - else - udp_tap_map[V4][src].loopback = 1; - - bitmap_set(udp_act[V4][UDP_ACT_TAP], src); - } else if (c->dns4_fwd && - s_addr == ntohl(c->dns4[0]) && - ntohs(b->s_in.sin_port) == 53) { - b->iph.saddr = c->dns4_fwd; - } else { - b->iph.saddr = b->s_in.sin_addr.s_addr; - } - - udp_update_check4(b); - b->uh.source = b->s_in.sin_port; - b->uh.dest = htons(ref.r.p.udp.udp.port); - b->uh.len = ntohs(udp4_l2_mh_sock[i].msg_len + - sizeof(b->uh)); - - if (c->mode == MODE_PASTA) { - ip_len += sizeof(struct ethhdr); - if (write(c->fd_tap, &b->eh, ip_len) < 0) - debug("tap write: %s", strerror(errno)); - pcap((char *)&b->eh, ip_len); - continue; - } - - b->vnet_len = htonl(ip_len + sizeof(struct ethhdr)); - iov_len = 
sizeof(uint32_t) + sizeof(struct ethhdr) + - ip_len; - udp4_l2_iov_tap[i].iov_len = iov_len; - - /* With bigger messages, qemu closes the connection. */ - if (iov_in_msg && msglen + iov_len > SHRT_MAX) { - cur_mh->msg_iovlen = iov_in_msg; - - cur_mh = &udp4_l2_mh_tap[++msg_i].msg_hdr; - msglen = iov_in_msg = 0; - cur_mh->msg_iov = &udp4_l2_iov_tap[i]; - } - - msglen += iov_len; - iov_in_msg++; + udp_sock_fill_data_v4(c, i, ref, + &msg_i, &msg_bufs, &msg_len, now); } + udp4_l2_mh_tap[msg_i].msg_hdr.msg_iovlen = msg_bufs; tap_mmh = udp4_l2_mh_tap; } if (c->mode == MODE_PASTA) return; - cur_mh->msg_iovlen = iov_in_msg; ret = sendmmsg(c->fd_tap, tap_mmh, msg_i + 1, MSG_NOSIGNAL | MSG_DONTWAIT); if (ret <= 0) @@ -887,25 +909,25 @@ void udp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events, * * re-send everything from here: ^-- ----- ------ */ - cur_mh = &tap_mmh[ret - 1].msg_hdr; - for (i = 0, msglen = 0; i < cur_mh->msg_iovlen; i++) { + last_mh = &tap_mmh[ret - 1].msg_hdr; + for (i = 0, msg_len = 0; i < last_mh->msg_iovlen; i++) { if (missing <= 0) { - msglen += cur_mh->msg_iov[i].iov_len; - missing = msglen - tap_mmh[ret - 1].msg_len; + msg_len += last_mh->msg_iov[i].iov_len; + missing = msg_len - tap_mmh[ret - 1].msg_len; } if (missing > 0) { uint8_t **iov_base; int first_offset; - iov_base = (uint8_t **)&cur_mh->msg_iov[i].iov_base; - first_offset = cur_mh->msg_iov[i].iov_len - missing; + iov_base = (uint8_t **)&last_mh->msg_iov[i].iov_base; + first_offset = last_mh->msg_iov[i].iov_len - missing; *iov_base += first_offset; - cur_mh->msg_iov[i].iov_len = missing; + last_mh->msg_iov[i].iov_len = missing; - cur_mh->msg_iov = &cur_mh->msg_iov[i]; + last_mh->msg_iov = &last_mh->msg_iov[i]; - sendmsg(c->fd_tap, cur_mh, MSG_NOSIGNAL); + sendmsg(c->fd_tap, last_mh, MSG_NOSIGNAL); *iov_base -= first_offset; break; @@ -997,7 +1019,7 @@ int udp_tap_handler(struct ctx *c, int af, void *addr, sa = (struct sockaddr *)&s_in6; sl = sizeof(s_in6); - if (!memcmp(addr, 
&c->gw6, sizeof(c->gw6)) && !c->no_map_gw) { + if (IN6_ARE_ADDR_EQUAL(addr, &c->gw6) && !c->no_map_gw) { if (!udp_tap_map[V6][dst].ts_local || udp_tap_map[V6][dst].loopback) s_in6.sin6_addr = in6addr_loopback; @@ -1005,7 +1027,7 @@ int udp_tap_handler(struct ctx *c, int af, void *addr, s_in6.sin6_addr = c->addr6; else s_in6.sin6_addr = c->addr6_seen; - } else if (!memcmp(addr, &c->dns6_fwd, sizeof(c->dns6_fwd)) && + } else if (IN6_ARE_ADDR_EQUAL(addr, &c->dns6_fwd) && ntohs(s_in6.sin6_port) == 53) { s_in6.sin6_addr = c->dns6[0]; } else if (IN6_IS_ADDR_LINKLOCAL(&s_in6.sin6_addr)) { -- 2.35.1
Signed-off-by: Stefano Brivio <sbrivio(a)redhat.com> --- dhcpv6.c | 2 +- tap.c | 4 ++-- tcp.c | 12 ++++++------ 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/dhcpv6.c b/dhcpv6.c index b79a8e9..375ba79 100644 --- a/dhcpv6.c +++ b/dhcpv6.c @@ -354,7 +354,7 @@ ia_ta: req_addr = (struct in6_addr *)(ia_addr + 1); - if (memcmp(addr, req_addr, sizeof(*addr))) { + if (!IN6_ARE_ADDR_EQUAL(addr, req_addr)) { info("DHCPv6: requested address %s not on link", inet_ntop(AF_INET6, req_addr, buf, sizeof(buf))); diff --git a/tap.c b/tap.c index e1854fb..a1ccfc1 100644 --- a/tap.c +++ b/tap.c @@ -542,8 +542,8 @@ resume: #define L4_MATCH(ip6h, proto, uh, seq) \ (seq->protocol == proto && \ seq->source == uh->source && seq->dest == uh->dest && \ - !memcmp(&seq->saddr, &ip6h->saddr, sizeof(seq->saddr)) && \ - !memcmp(&seq->daddr, &ip6h->daddr, sizeof(seq->daddr))) + IN6_ARE_ADDR_EQUAL(&seq->saddr, &ip6h->saddr) && \ + IN6_ARE_ADDR_EQUAL(&seq->daddr, &ip6h->daddr)) #define L4_SET(ip6h, proto, uh, seq) \ do { \ diff --git a/tcp.c b/tcp.c index 968db97..539d415 100644 --- a/tcp.c +++ b/tcp.c @@ -853,7 +853,7 @@ static int tcp_rtt_dst_low(struct tcp_conn *conn) int i; for (i = 0; i < LOW_RTT_TABLE_SIZE; i++) - if (!memcmp(&conn->a.a6, low_rtt_dst + i, sizeof(conn->a.a6))) + if (IN6_ARE_ADDR_EQUAL(&conn->a.a6, low_rtt_dst + i)) return 1; return 0; @@ -874,7 +874,7 @@ static void tcp_rtt_dst_check(struct tcp_conn *conn, struct tcp_info *tinfo) return; for (i = 0; i < LOW_RTT_TABLE_SIZE; i++) { - if (!memcmp(&conn->a.a6, low_rtt_dst + i, sizeof(conn->a.a6))) + if (IN6_ARE_ADDR_EQUAL(&conn->a.a6, low_rtt_dst + i)) return; if (hole == -1 && IN6_IS_ADDR_UNSPECIFIED(low_rtt_dst + i)) hole = i; @@ -1181,7 +1181,7 @@ static int tcp_hash_match(struct tcp_conn *conn, int af, void *addr, return 1; if (af == AF_INET6 && - !memcmp(&conn->a.a6, addr, sizeof(conn->a.a6)) && + IN6_ARE_ADDR_EQUAL(&conn->a.a6, addr) && conn->tap_port == tap_port && conn->sock_port == sock_port) return 1; @@ 
-2047,7 +2047,7 @@ static void tcp_conn_from_tap(struct ctx *c, int af, void *addr, if (!c->no_map_gw) { if (af == AF_INET && addr4.sin_addr.s_addr == c->gw4) addr4.sin_addr.s_addr = htonl(INADDR_LOOPBACK); - if (af == AF_INET6 && !memcmp(addr, &c->gw6, sizeof(c->gw6))) + if (af == AF_INET6 && IN6_ARE_ADDR_EQUAL(addr, &c->gw6)) addr6.sin6_addr = in6addr_loopback; } @@ -2697,8 +2697,8 @@ static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref, memcpy(&sa6, &sa, sizeof(sa6)); if (IN6_IS_ADDR_LOOPBACK(&sa6.sin6_addr) || - !memcmp(&sa6.sin6_addr, &c->addr6_seen, sizeof(c->gw6)) || - !memcmp(&sa6.sin6_addr, &c->addr6, sizeof(c->gw6))) { + IN6_ARE_ADDR_EQUAL(&sa6.sin6_addr, &c->addr6_seen) || + IN6_ARE_ADDR_EQUAL(&sa6.sin6_addr, &c->addr6)) { struct in6_addr *src; if (IN6_IS_ADDR_LINKLOCAL(&c->gw6)) -- 2.35.1
There's no value in keeping a separate timestamp for activity and for aging of local binds, given that they have the same timeout. Reduce that to a single timestamp, with a flag indicating the local bind. Also use flags instead of separate int fields for loopback and configured unicast address usage as source address. Signed-off-by: Stefano Brivio <sbrivio(a)redhat.com> --- udp.c | 48 +++++++++++++++++++++++------------------------- 1 file changed, 23 insertions(+), 25 deletions(-) diff --git a/udp.c b/udp.c index ebbcda1..ad8a775 100644 --- a/udp.c +++ b/udp.c @@ -125,16 +125,16 @@ * struct udp_tap_port - Port tracking based on tap-facing source port * @sock: Socket bound to source port used as index * @ts: Activity timestamp from tap, used for socket aging - * @ts_local: Timestamp of tap packet to gateway address, aging for local bind - * @loopback: Whether local bind maps to loopback address as source - * @gua: Whether local bind maps to configured unicast address as source + * @flags: Flags for local bind, loopback address/unicast address as source */ struct udp_tap_port { int sock; time_t ts; - time_t ts_local; - int loopback; - int gua; + + uint8_t flags; +#define PORT_LOCAL BIT(0) +#define PORT_LOOPBACK BIT(1) +#define PORT_GUA BIT(2) }; /** @@ -684,12 +684,13 @@ static void udp_sock_fill_data_v4(struct ctx *c, int n, union epoll_ref ref, if (src >> IN_CLASSA_NSHIFT == IN_LOOPBACKNET || src == INADDR_ANY || src == ntohl(c->addr4_seen)) { b->iph.saddr = c->gw4; - udp_tap_map[V4][src_port].ts_local = now->tv_sec; + udp_tap_map[V4][src_port].ts = now->tv_sec; + udp_tap_map[V4][src_port].flags |= PORT_LOCAL; if (b->s_in.sin_addr.s_addr == c->addr4_seen) - udp_tap_map[V4][src_port].loopback = 0; + udp_tap_map[V4][src_port].flags &= ~PORT_LOOPBACK; else - udp_tap_map[V4][src_port].loopback = 1; + udp_tap_map[V4][src_port].flags |= PORT_LOOPBACK; bitmap_set(udp_act[V4][UDP_ACT_TAP], src_port); } else if (c->dns4_fwd && @@ -768,17 +769,18 @@ static void 
udp_sock_fill_data_v6(struct ctx *c, int n, union epoll_ref ref, else b->ip6h.saddr = c->addr6_ll; - udp_tap_map[V6][src_port].ts_local = now->tv_sec; + udp_tap_map[V6][src_port].ts = now->tv_sec; + udp_tap_map[V6][src_port].flags |= PORT_LOCAL; if (IN6_IS_ADDR_LOOPBACK(src)) - udp_tap_map[V6][src_port].loopback = 1; + udp_tap_map[V6][src_port].flags |= PORT_LOOPBACK; else - udp_tap_map[V6][src_port].loopback = 0; + udp_tap_map[V6][src_port].flags &= ~PORT_LOOPBACK; if (IN6_ARE_ADDR_EQUAL(src, &c->addr6)) - udp_tap_map[V6][src_port].gua = 1; + udp_tap_map[V6][src_port].flags |= PORT_GUA; else - udp_tap_map[V6][src_port].gua = 0; + udp_tap_map[V6][src_port].flags &= ~PORT_GUA; bitmap_set(udp_act[V6][UDP_ACT_TAP], src_port); } else if (!IN6_IS_ADDR_UNSPECIFIED(&c->dns6_fwd) && @@ -999,8 +1001,8 @@ int udp_tap_handler(struct ctx *c, int af, void *addr, udp_tap_map[V4][src].ts = now->tv_sec; if (s_in.sin_addr.s_addr == c->gw4 && !c->no_map_gw) { - if (!udp_tap_map[V4][dst].ts_local || - udp_tap_map[V4][dst].loopback) + if (!(udp_tap_map[V4][dst].flags & PORT_LOCAL) || + (udp_tap_map[V4][dst].flags & PORT_LOOPBACK)) s_in.sin_addr.s_addr = htonl(INADDR_LOOPBACK); else s_in.sin_addr.s_addr = c->addr4_seen; @@ -1020,10 +1022,10 @@ int udp_tap_handler(struct ctx *c, int af, void *addr, sl = sizeof(s_in6); if (IN6_ARE_ADDR_EQUAL(addr, &c->gw6) && !c->no_map_gw) { - if (!udp_tap_map[V6][dst].ts_local || - udp_tap_map[V6][dst].loopback) + if (!(udp_tap_map[V6][dst].flags & PORT_LOCAL) || + (udp_tap_map[V6][dst].flags & PORT_LOOPBACK)) s_in6.sin6_addr = in6addr_loopback; - else if (udp_tap_map[V6][dst].gua) + else if (udp_tap_map[V6][dst].flags & PORT_GUA) s_in6.sin6_addr = c->addr6; else s_in6.sin6_addr = c->addr6_seen; @@ -1241,13 +1243,9 @@ static void udp_timer_one(struct ctx *c, int v6, enum udp_act_type type, case UDP_ACT_TAP: tp = &udp_tap_map[v6 ? 
V6 : V4][port]; - if (ts->tv_sec - tp->ts > UDP_CONN_TIMEOUT) + if (ts->tv_sec - tp->ts > UDP_CONN_TIMEOUT) { s = tp->sock; - - if (ts->tv_sec - tp->ts_local > UDP_CONN_TIMEOUT) { - tp->ts_local = 0; - tp->loopback = 0; - tp->gua = 0; + tp->flags = 0; } break; -- 2.35.1
They don't have a measurable performance impact and make things a bit safer. Signed-off-by: Stefano Brivio <sbrivio(a)redhat.com> --- Makefile | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index a2640ff..de3175d 100644 --- a/Makefile +++ b/Makefile @@ -24,6 +24,7 @@ AUDIT_ARCH := $(shell echo $(AUDIT_ARCH) | sed 's/PPC64/PPC/') AUDIT_ARCH := $(shell echo $(AUDIT_ARCH) | sed 's/PPCLE/PPC64LE/') CFLAGS += -Wall -Wextra -pedantic -std=c99 -D_XOPEN_SOURCE=700 -D_GNU_SOURCE +CFLAGS += -D_FORTIFY_SOURCE=2 -O2 -pie -fPIE CFLAGS += -DPAGE_SIZE=$(shell getconf PAGE_SIZE) CFLAGS += -DNETNS_RUN_DIR=\"/run/netns\" CFLAGS += -DPASST_AUDIT_ARCH=AUDIT_ARCH_$(AUDIT_ARCH) @@ -64,6 +65,10 @@ ifeq ($(shell printf "$(C)" | $(CC) -S -xc - -o - >/dev/null 2>&1; echo $$?),0) CFLAGS += -DHAS_GETRANDOM endif +ifeq ($(shell :|$(CC) -fstack-protector-strong -S -xc - >/dev/null 2>&1; echo $$?),0) + CFLAGS += -fstack-protector-strong +endif + prefix ?= /usr/local ifeq ($(TARGET_ARCH),X86_64) @@ -87,7 +92,8 @@ passt: $(filter-out qrap.c,$(wildcard *.c)) \ passt.avx2: CFLAGS += -Ofast -mavx2 -ftree-vectorize -funroll-loops passt.avx2: $(filter-out qrap.c,$(wildcard *.c)) \ $(filter-out qrap.h,$(wildcard *.h)) seccomp.h - $(CC) $(CFLAGS) $(filter-out qrap.c,$(wildcard *.c)) -o passt.avx2 + $(CC) $(filter-out -O2,$(CFLAGS)) $(filter-out qrap.c,$(wildcard *.c)) \ + -o passt.avx2 passt.avx2: passt @@ -227,7 +233,7 @@ clang-tidy: $(wildcard *.c) $(wildcard *.h) -readability-function-cognitive-complexity,\ -altera-struct-pack-align,\ -concurrency-mt-unsafe \ - --warnings-as-errors=* $(wildcard *.c) -- $(CFLAGS) + --warnings-as-errors=* $(wildcard *.c) -- $(filter-out -pie,$(CFLAGS)) ifeq ($(shell $(CC) -v 2>&1 | grep -c "gcc version"),1) TARGET := $(shell ${CC} -v 2>&1 | sed -n 's/Target: \(.*\)/\1/p') -- 2.35.1
Signed-off-by: Stefano Brivio <sbrivio(a)redhat.com> --- test/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/README.md b/test/README.md index cdf233b..b50c430 100644 --- a/test/README.md +++ b/test/README.md @@ -55,7 +55,7 @@ packages. The following additional packages are commonly needed: - alien linux-perf tshark + alien asciinema linux-perf tshark ## Regular test -- 2.35.1
Pass to seccomp.sh a list of additional syscalls valgrind needs as EXTRA_SYSCALLS in a new 'valgrind' make target, and add corresponding support in seccomp.sh itself. In test setup functions, start passt with valgrind, but not for performance tests. Add tests checking that valgrind exits without errors after all the other tests in the group are done. Signed-off-by: Stefano Brivio <sbrivio(a)redhat.com> --- Makefile | 8 +++++++- seccomp.sh | 1 + test/README.md | 2 +- test/lib/setup | 40 +++++++++++++++++++++++++++------------ test/run | 9 +++++++++ test/valgrind.supp | 9 +++++++++ test/valgrind/passt | 22 +++++++++++++++++++++ test/valgrind/passt_in_ns | 22 +++++++++++++++++++++ 8 files changed, 99 insertions(+), 14 deletions(-) create mode 100644 test/valgrind.supp create mode 100644 test/valgrind/passt create mode 100644 test/valgrind/passt_in_ns diff --git a/Makefile b/Makefile index de3175d..e972e88 100644 --- a/Makefile +++ b/Makefile @@ -83,7 +83,7 @@ static: CFLAGS += -static -DGLIBC_NO_STATIC_NSS static: clean all seccomp.h: *.c $(filter-out seccomp.h,$(wildcard *.h)) - @ ./seccomp.sh + @ EXTRA_SYSCALLS=$(EXTRA_SYSCALLS) ./seccomp.sh passt: $(filter-out qrap.c,$(wildcard *.c)) \ $(filter-out qrap.h,$(wildcard *.h)) seccomp.h @@ -108,6 +108,12 @@ qrap: qrap.c passt.h $(CC) $(CFLAGS) \ qrap.c -o qrap +valgrind: EXTRA_SYSCALLS="rt_sigprocmask rt_sigtimedwait rt_sigaction \ + getpid gettid kill clock_gettime mmap munmap open \ + unlink exit_group gettimeofday" +valgrind: CFLAGS:=-g -O0 $(filter-out -O%,$(CFLAGS)) +valgrind: all + .PHONY: clean clean: -${RM} passt passt.avx2 *.o seccomp.h qrap pasta pasta.avx2 pasta.1 \ diff --git a/seccomp.sh b/seccomp.sh index 6ac59a1..74eeb4b 100755 --- a/seccomp.sh +++ b/seccomp.sh @@ -234,6 +234,7 @@ printf '%s\n' "${HEADER}" > "${OUT}" __profiles="$(sed -n 's/[\t ]*\*[\t ]*#syscalls:\([^ ]*\).*/\1/p' *.[ch] | sort -u)" for __p in ${__profiles}; do __calls="$(sed -n 's/[\t ]*\*[\t ]*#syscalls\(:'"${__p}"'\|\)[\t 
]\{1,\}\(.*\)/\2/p' *.[ch])" + __calls="${__calls} ${EXTRA_SYSCALLS:-}" __calls="$(filter ${__calls})" echo "seccomp profile ${__p} allows: ${__calls}" | tr '\n' ' ' | fmt -t diff --git a/test/README.md b/test/README.md index b50c430..623e6e5 100644 --- a/test/README.md +++ b/test/README.md @@ -31,7 +31,7 @@ Example for Debian, and possibly most Debian-based distributions: build-essential git jq strace iperf3 qemu-system-x86 tmux sipcalc bc clang-tidy cppcheck isc-dhcp-common udhcpc psmisc linux-cpupower netcat-openbsd fakeroot lz4 lm-sensors qemu-system-arm qemu-system-ppc - qemu-system-misc qemu-system-x86` + qemu-system-misc qemu-system-x86 valgrind ### Other tools diff --git a/test/lib/setup b/test/lib/setup index a39eb80..70b8d6b 100755 --- a/test/lib/setup +++ b/test/lib/setup @@ -51,8 +51,12 @@ setup_passt() { [ ${DEBUG} -eq 1 ] && __opts="${__opts} -d" [ ${TRACE} -eq 1 ] && __opts="${__opts} --trace" - pane_run PASST "./passt ${__opts} -f -t 10001 -u 10001" - sleep 1 + pane_run PASST "make clean" + pane_wait PASST + pane_run PASST "make valgrind" + pane_wait PASST + pane_run PASST "valgrind --max-stackframe=$((4 * 1024 * 1024)) --trace-children=yes --vgdb=no --error-exitcode=1 --suppressions=test/valgrind.supp ./passt -f -t 10001 -u 10001 -P passt.pid" + sleep 5 pane_run GUEST './qrap 5 kvm -m '${VMEM}' -cpu host -smp '${VCPUS} \ ' -kernel ' "/boot/vmlinuz-$(uname -r)" \ @@ -133,7 +137,7 @@ setup_passt_in_ns() { rm "${__pid_file}" pane_run GUEST "nsenter -t ${__ns_pid} -U -n --preserve-credentials" - pane_run NS "nsenter -t ${__ns_pid} -U -n --preserve-credentials" + pane_run NS "nsenter -t ${__ns_pid} -U -n -p --preserve-credentials" pane_wait GUEST pane_wait NS @@ -151,9 +155,20 @@ setup_passt_in_ns() { [ ${DEBUG} -eq 1 ] && __opts="${__opts} -d" [ ${TRACE} -eq 1 ] && __opts="${__opts} --trace" - #pane_run PASST "valgrind --max-stackframe=3000000 ./passt -f ${__opts} -t 10001,10011,10021,10031 -u 10001,10011,10021,10031" - pane_run PASST "./passt -f 
${__opts} -t 10001,10011,10021,10031 -u 10001,10011,10021,10031" - sleep 1 + if [ ${VALGRIND} -eq 1 ]; then + pane_run PASST "make clean" + pane_wait PASST + pane_run PASST "make valgrind" + pane_wait PASST + pane_run PASST "valgrind --max-stackframe=$((4 * 1024 * 1024)) --trace-children=yes --vgdb=no --error-exitcode=1 --suppressions=test/valgrind.supp ./passt -f ${__opts} -t 10001,10011,10021,10031 -u 10001,10011,10021,10031 -P passt.pid" + else + pane_run PASST "make clean" + pane_wait PASST + pane_run PASST "make" + pane_wait PASST + pane_run PASST "./passt -f ${__opts} -t 10001,10011,10021,10031 -u 10001,10011,10021,10031 -P passt.pid" + fi + sleep 5 pane_run GUEST './qrap 5 kvm -m '${VMEM}' -cpu host -smp '${VCPUS} \ ' -kernel ' "/boot/vmlinuz-$(uname -r)" \ @@ -264,12 +279,11 @@ setup_two_guests() { pane_wait GUEST_2 } -# teardown_passt() - Kill qemu and passt +# teardown_passt() - Kill qemu, remove passt PID file teardown_passt() { - tmux send-keys -t ${PANE_PASST} "C-c" - pane_wait PASST tmux send-keys -t ${PANE_GUEST} "C-c" pane_wait GUEST + rm passt.pid } # teardown_passt() - Exit namespace, kill pasta process @@ -280,14 +294,14 @@ teardown_pasta() { pane_wait NS } -# teardown_passt_in_ns() - Exit namespace, kill qemu, passt and pasta +# teardown_passt_in_ns() - Exit namespace, kill qemu and pasta, remove pid file teardown_passt_in_ns() { tmux send-keys -t ${PANE_GUEST} "C-c" pane_wait GUEST tmux send-keys -t ${PANE_GUEST} "C-d" - tmux send-keys -t ${PANE_PASST} "C-c" - pane_wait PASST + [ ${VALGRIND} -eq 0 ] && tmux send-keys -t ${PANE_PASST} "C-c" + [ ${VALGRIND} -eq 0 ] && pane_wait GUEST tmux send-keys -t ${PANE_PASST} "C-d" tmux send-keys -t ${PANE_NS} "C-d" @@ -295,6 +309,8 @@ teardown_passt_in_ns() { pane_wait GUEST pane_wait NS pane_wait PASST + + rm passt.pid } # teardown_two_guests() - Exit namespaces, kill qemu processes, passt and pasta diff --git a/test/run b/test/run index 385267e..b2819ef 100755 --- a/test/run +++ b/test/run @@ -78,14 
+78,23 @@ run() { test dhcp test tcp test udp + test valgrind teardown passt + VALGRIND=1 setup passt_in_ns test ndp test dhcp test icmp test tcp test udp + test valgrind + teardown passt_in_ns + + VALGRIND=0 + setup passt_in_ns + test ndp + test dhcp test perf teardown passt_in_ns diff --git a/test/valgrind.supp b/test/valgrind.supp new file mode 100644 index 0000000..7544716 --- /dev/null +++ b/test/valgrind.supp @@ -0,0 +1,9 @@ +# tcp_sock_consume() calls recv() with MSG_TRUNC and no buffer to discard data +{ + passt_recv_MSG_TRUNC_into_NULL_buffer + Memcheck:Param + socketcall.recvfrom(buf) + fun:recv + ... + fun:tcp_sock_consume.isra.0 +} diff --git a/test/valgrind/passt b/test/valgrind/passt new file mode 100644 index 0000000..3af943a --- /dev/null +++ b/test/valgrind/passt @@ -0,0 +1,22 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# +# PASST - Plug A Simple Socket Transport +# for qemu/UNIX domain socket mode +# +# PASTA - Pack A Subtle Tap Abstraction +# for network namespace/tap device mode +# +# test/valgrind/passt - Terminate passt and check valgrind exit code +# +# Copyright (c) 2022 Red Hat GmbH +# Author: Stefano Brivio <sbrivio(a)redhat.com> + +onlyfor passt +test valgrind: exit code + +hout PASST_PID cat passt.pid +host kill __PASST_PID__ +sleep 1 + +pout VALGRIND_EXIT echo $? 
+check [ "__VALGRIND_EXIT__" = "0" ] diff --git a/test/valgrind/passt_in_ns b/test/valgrind/passt_in_ns new file mode 100644 index 0000000..bf50c7e --- /dev/null +++ b/test/valgrind/passt_in_ns @@ -0,0 +1,22 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# +# PASST - Plug A Simple Socket Transport +# for qemu/UNIX domain socket mode +# +# PASTA - Pack A Subtle Tap Abstraction +# for network namespace/tap device mode +# +# test/valgrind/passt_in_ns - Terminate passt and check valgrind exit code +# +# Copyright (c) 2022 Red Hat GmbH +# Author: Stefano Brivio <sbrivio(a)redhat.com> + +onlyfor passt_in_ns +test valgrind: exit code + +nsout PASST_PID cat passt.pid +ns kill __PASST_PID__ +sleep 1 + +pout VALGRIND_EXIT echo $? +check [ "__VALGRIND_EXIT__" = "0" ] -- 2.35.1
This should never happen, but there are no formal guarantees: ensure socket numbers are below SOCKET_MAX. Signed-off-by: Stefano Brivio <sbrivio(a)redhat.com> --- passt.h | 4 +++- tcp.c | 17 +++++++++++++++++ tcp_splice.c | 8 ++++++++ udp.c | 7 +++++++ util.c | 7 +++++++ 5 files changed, 42 insertions(+), 1 deletion(-) diff --git a/passt.h b/passt.h index 8344fca..3a62b15 100644 --- a/passt.h +++ b/passt.h @@ -45,7 +45,9 @@ union epoll_ref; union epoll_ref { struct { int32_t proto:8, - s:24; +#define SOCKET_REF_BITS 24 +#define SOCKET_MAX (1 << SOCKET_REF_BITS) + s:SOCKET_REF_BITS; union { union tcp_epoll_ref tcp; union udp_epoll_ref udp; diff --git a/tcp.c b/tcp.c index 539d415..f03c929 100644 --- a/tcp.c +++ b/tcp.c @@ -1971,6 +1971,11 @@ static int tcp_conn_new_sock(struct ctx *c, sa_family_t af) if (s < 0) s = socket(af, SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP); + if (s > SOCKET_MAX) { + close(s); + return -EIO; + } + if (s < 0) return -errno; @@ -2980,6 +2985,12 @@ static int tcp_sock_refill(void *arg) break; } *p4 = socket(AF_INET, SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP); + if (*p4 > SOCKET_MAX) { + close(*p4); + *p4 = -1; + return -EIO; + } + tcp_sock_set_bufsize(a->c, *p4); } @@ -2989,6 +3000,12 @@ static int tcp_sock_refill(void *arg) } *p6 = socket(AF_INET6, SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP); + if (*p6 > SOCKET_MAX) { + close(*p6); + *p6 = -1; + return -EIO; + } + tcp_sock_set_bufsize(a->c, *p6); } diff --git a/tcp_splice.c b/tcp_splice.c index cb8df7b..d374785 100644 --- a/tcp_splice.c +++ b/tcp_splice.c @@ -418,6 +418,14 @@ static int tcp_splice_connect(struct ctx *c, struct tcp_splice_conn *conn, const struct sockaddr *sa; socklen_t sl; + if (sock_conn < 0) + return -errno; + + if (sock_conn > SOCKET_MAX) { + close(sock_conn); + return -EIO; + } + conn->b = sock_conn; if (s < 0) diff --git a/udp.c b/udp.c index ad8a775..e22f3ac 100644 --- a/udp.c +++ b/udp.c @@ -443,8 +443,15 @@ int udp_splice_connect(struct ctx *c, int v6, int bound_sock, s 
= socket(v6 ? AF_INET6 : AF_INET, SOCK_DGRAM | SOCK_NONBLOCK, IPPROTO_UDP); + + if (s > SOCKET_MAX) { + close(s); + return -EIO; + } + if (s < 0) return s; + ref.r.s = s; if (v6) { diff --git a/util.c b/util.c index 2d8952a..ff7d97b 100644 --- a/util.c +++ b/util.c @@ -235,10 +235,17 @@ int sock_l4(struct ctx *c, int af, uint8_t proto, uint16_t port, fd = socket(af, SOCK_STREAM | SOCK_NONBLOCK, proto); else fd = socket(af, SOCK_DGRAM | SOCK_NONBLOCK, proto); + if (fd < 0) { perror("L4 socket"); return -1; } + + if (fd > SOCKET_MAX) { + close(fd); + return -EIO; + } + ref.r.s = fd; if (af == AF_INET) { -- 2.35.1
With a lot of concurrent connections, the bitmap scan approach is not really sustainable. Switch to per-connection timerfd timers, set based on events and on two new flags, ACK_FROM_TAP_DUE and ACK_TO_TAP_DUE. Timers are added to the common epoll list, and implement the existing timeouts. While at it, drop the CONN_ prefix from flag names, otherwise they get quite long, and fix the logic to decide if a connection has a local, possibly unreachable endpoint: we shouldn't go through the rest of tcp_conn_from_tap() if we reset the connection due to a successful bind(2), and we'll get EACCES if the port number is low. Suggested by: Stefan Hajnoczi <stefanha(a)redhat.com> Signed-off-by: Stefano Brivio <sbrivio(a)redhat.com> --- README.md | 4 +- passt.c | 12 +- tap.c | 2 +- tcp.c | 498 +++++++++++++++++++++++++++++------------------------- tcp.h | 8 +- 5 files changed, 283 insertions(+), 241 deletions(-) diff --git a/README.md b/README.md index cd4caa3..8e07fb1 100644 --- a/README.md +++ b/README.md @@ -287,11 +287,9 @@ speeding up local connections, and usually requiring NAT. 
_pasta_: * ✅ all capabilities dropped, other than `CAP_NET_BIND_SERVICE` (if granted) * ✅ with default options, user, mount, IPC, UTS, PID namespaces are detached * ✅ no external dependencies (other than a standard C library) -* ✅ restrictive seccomp profiles (22 syscalls allowed for _passt_, 34 for +* ✅ restrictive seccomp profiles (24 syscalls allowed for _passt_, 36 for _pasta_ on x86_64) * ✅ static checkers in continuous integration (clang-tidy, cppcheck) -* 🛠️ rework of TCP state machine (flags instead of states), TCP timers, and code - de-duplication * 🛠️ clearly defined packet abstraction * 🛠️ ~5 000 LoC target * ⌚ [fuzzing](https://bugs.passt.top/show_bug.cgi?id=9), _packetdrill_ tests diff --git a/passt.c b/passt.c index 6c04266..6550a22 100644 --- a/passt.c +++ b/passt.c @@ -119,12 +119,12 @@ static void post_handler(struct ctx *c, struct timespec *now) #define CALL_PROTO_HANDLER(c, now, lc, uc) \ do { \ extern void \ - lc ## _defer_handler (struct ctx *, struct timespec *) \ + lc ## _defer_handler (struct ctx *c) \ __attribute__ ((weak)); \ \ if (!c->no_ ## lc) { \ if (lc ## _defer_handler) \ - lc ## _defer_handler(c, now); \ + lc ## _defer_handler(c); \ \ if (timespec_diff_ms((now), &c->lc.timer_run) \= uc ## _TIMER_INTERVAL) { \@@ -134,8 +134,11 @@ static void post_handler(struct ctx *c, struct timespec *now) } \ } while (0) + /* NOLINTNEXTLINE(bugprone-branch-clone): intervals can be the same */ CALL_PROTO_HANDLER(c, now, tcp, TCP); + /* NOLINTNEXTLINE(bugprone-branch-clone): intervals can be the same */ CALL_PROTO_HANDLER(c, now, udp, UDP); + /* NOLINTNEXTLINE(bugprone-branch-clone): intervals can be the same */ CALL_PROTO_HANDLER(c, now, icmp, ICMP); #undef CALL_PROTO_HANDLER @@ -380,8 +383,8 @@ int main(int argc, char **argv) clock_gettime(CLOCK_MONOTONIC, &now); - if ((!c.no_udp && udp_sock_init(&c, &now)) || - (!c.no_tcp && tcp_sock_init(&c, &now))) + if ((!c.no_udp && udp_sock_init(&c)) || + (!c.no_tcp && tcp_sock_init(&c))) exit(EXIT_FAILURE); 
proto_update_l2_buf(c.mac_guest, c.mac, &c.addr4); @@ -425,6 +428,7 @@ int main(int argc, char **argv) timer_init(&c, &now); loop: + /* NOLINTNEXTLINE(bugprone-branch-clone): intervals can be the same */ nfds = epoll_wait(c.epollfd, events, EPOLL_EVENTS, TIMER_INTERVAL); if (nfds == -1 && errno != EINTR) { perror("epoll_wait"); diff --git a/tap.c b/tap.c index a1ccfc1..59a87f9 100644 --- a/tap.c +++ b/tap.c @@ -939,7 +939,7 @@ void tap_sock_init(struct ctx *c) * @c: Execution context * @fd: File descriptor where event occurred * @events: epoll events - * @now: Current timestamp + * @now: Current timestamp, can be NULL on EPOLLERR */ void tap_handler(struct ctx *c, int fd, uint32_t events, struct timespec *now) { diff --git a/tcp.c b/tcp.c index f03c929..384e7a6 100644 --- a/tcp.c +++ b/tcp.c @@ -177,32 +177,32 @@ * Aging and timeout * ----------------- * - * Open connections are checked periodically against a number of timeouts. Those - * are: + * Timeouts are implemented by means of timerfd timers, set based on flags: * - * - SYN_TIMEOUT: if no ACK is received from tap/guest during handshake within - * this time, reset the connection - * - * - ACT_TIMEOUT, in the presence of any event: if no activity is detected on - * either side, the connection is reset - * - * - ACK_INTERVAL, or zero-sized window advertised to tap/guest: forcibly check - * if an ACK segment can be sent + * - SYN_TIMEOUT: if no ACK is received from tap/guest during handshake (flag + * ACK_FROM_TAP_DUE without ESTABLISHED event) within this time, reset the + * connection * * - ACK_TIMEOUT: if no ACK segment was received from tap/guest, after sending - * data, re-send data from the socket and reset sequence to what was - * acknowledged. If this persists for longer than LAST_ACK_TIMEOUT, reset the - * connection + * data (flag ACK_FROM_TAP_DUE with ESTABLISHED event), re-send data from the + * socket and reset sequence to what was acknowledged. 
If this persists for + * more than TCP_MAX_RETRANS times in a row, reset the connection * - * - FIN_TIMEOUT, on TAP_FIN_SENT: if no ACK is received for the FIN segment - * within this time, the connection is reset + * - FIN_TIMEOUT: if a FIN segment was sent to tap/guest (flag ACK_FROM_TAP_DUE + * with TAP_FIN_SENT event), and no ACK is received within this time, reset + * the connection * - * - FIN_TIMEOUT, on SOCK_FIN_SENT: if no activity is detected on the socket - * after sending a FIN segment (write shutdown), reset the connection + * - FIN_TIMEOUT: if a FIN segment was acknowledged by tap/guest and a FIN + * segment (write shutdown) was sent via socket (events SOCK_FIN_SENT and + * TAP_FIN_ACKED), but no socket activity is detected from the socket within + * this time, reset the connection * - * - LAST_ACK_TIMEOUT on SOCK_FIN_SENT *and* SOCK_FIN_RCVD: reset the connection - * if no activity was detected on any of the two sides after sending a FIN - * segment + * - ACT_TIMEOUT, in the presence of any event: if no activity is detected on + * either side, the connection is reset + * + * - ACK_INTERVAL elapsed after data segment received from tap without having + * sent an ACK segment, or zero-sized window advertised to tap/guest (flag + * ACK_TO_TAP_DUE): forcibly check if an ACK segment can be sent * * * Summary of data flows (with ESTABLISHED event) @@ -237,11 +237,6 @@ * - on two duplicated ACKs, reset @seq_to_tap to @seq_ack_from_tap, and * resend with steps listed above * - set TCP_WINDOW_CLAMP from TCP header from tap - * - periodically: - * - if @seq_ack_from_tap < @seq_to_tap and the retransmission timer - * (TODO: implement requirements from RFC 6298, currently 3s fixed) from - * @ts_ack_from_tap elapsed, reset @seq_to_tap to @seq_ack_from_tap, and - * resend data with the steps listed above * * - from tap/guest to socket: * - on packet from tap/guest: @@ -287,6 +282,7 @@ #include <sys/random.h> #endif #include <sys/socket.h> +#include <sys/timerfd.h> 
#include <sys/types.h> #include <sys/uio.h> #include <unistd.h> @@ -328,17 +324,13 @@ # define KERNEL_REPORTS_SND_WND(c) (0 && (c)) #endif -#define SYN_TIMEOUT 240000 /* ms */ -#define ACK_TIMEOUT 2000 -#define ACK_INTERVAL 50 -#define ACT_TIMEOUT 7200000 -#define FIN_TIMEOUT 240000 -#define LAST_ACK_TIMEOUT 240000 +#define ACK_INTERVAL 50 /* ms */ +#define SYN_TIMEOUT 10 /* s */ +#define ACK_TIMEOUT 2 +#define FIN_TIMEOUT 60 +#define ACT_TIMEOUT 7200 #define TCP_SOCK_POOL_TSH 16 /* Refill in ns if > x used */ -#define REFILL_INTERVAL 1000 - -#define PORT_DETECT_INTERVAL 1000 #define LOW_RTT_TABLE_SIZE 8 #define LOW_RTT_THRESHOLD 10 /* us */ @@ -407,7 +399,11 @@ struct tcp_conn; */ struct tcp_conn { struct tcp_conn *next; - int sock; + int32_t sock:SOCKET_REF_BITS; +#define TCP_RETRANS_BITS 3 + unsigned int retrans:TCP_RETRANS_BITS; +#define TCP_MAX_RETRANS ((1U << TCP_RETRANS_BITS) - 1) + int timer; int hash_bucket; union { @@ -440,11 +436,13 @@ struct tcp_conn { (SOCK_ACCEPTED | TAP_SYN_RCVD | ESTABLISHED) uint8_t flags; -#define CONN_STALLED BIT(0) -#define CONN_LOCAL BIT(1) -#define CONN_WND_CLAMPED BIT(2) -#define CONN_IN_EPOLL BIT(3) -#define CONN_ACTIVE_CLOSE BIT(4) +#define STALLED BIT(0) +#define LOCAL BIT(1) +#define WND_CLAMPED BIT(2) +#define IN_EPOLL BIT(3) +#define ACTIVE_CLOSE BIT(4) +#define ACK_TO_TAP_DUE BIT(5) +#define ACK_FROM_TAP_DUE BIT(6) uint16_t tap_mss; @@ -463,12 +461,6 @@ struct tcp_conn { uint32_t wnd_to_tap; int snd_buf; - - struct timespec ts_sock_act; - struct timespec ts_tap_act; - struct timespec ts_ack_from_tap; - struct timespec ts_ack_to_tap; - struct timespec tap_data_noack; }; #define CONN_IS_CLOSED(conn) (conn->events == CLOSED) @@ -498,6 +490,7 @@ static const char *tcp_state_str[] __attribute((__unused__)) = { static const char *tcp_flag_str[] __attribute((__unused__)) = { "STALLED", "LOCAL", "WND_CLAMPED", "IN_EPOLL", "ACTIVE_CLOSE", + "ACK_TO_TAP_DUE", "ACK_FROM_TAP_DUE", }; /* Port re-mappings as delta, indexed by 
original destination port */ @@ -686,7 +679,7 @@ static uint32_t tcp_conn_epoll_events(uint8_t events, uint8_t conn_flags) if (events & TAP_FIN_SENT) return EPOLLET; - if (conn_flags & CONN_STALLED) + if (conn_flags & STALLED) return EPOLLIN | EPOLLRDHUP | EPOLLET; return EPOLLIN | EPOLLRDHUP; @@ -715,7 +708,7 @@ static void conn_flag_do(struct ctx *c, struct tcp_conn *conn, */ static int tcp_epoll_ctl(struct ctx *c, struct tcp_conn *conn) { - int m = (conn->flags & CONN_IN_EPOLL) ? EPOLL_CTL_MOD : EPOLL_CTL_ADD; + int m = (conn->flags & IN_EPOLL) ? EPOLL_CTL_MOD : EPOLL_CTL_ADD; union epoll_ref ref = { .r.proto = IPPROTO_TCP, .r.s = conn->sock, .r.p.tcp.tcp.index = conn - tc, .r.p.tcp.tcp.v6 = CONN_V6(conn) }; @@ -731,13 +724,69 @@ static int tcp_epoll_ctl(struct ctx *c, struct tcp_conn *conn) if (epoll_ctl(c->epollfd, m, conn->sock, &ev)) return -errno; - conn->flags |= CONN_IN_EPOLL; /* No need to log this */ + conn->flags |= IN_EPOLL; /* No need to log this */ return 0; } /** - * conn_flag_do() - Set/unset given flag, log, update epoll on CONN_STALLED + * tcp_timer_ctl() - Set timerfd based on flags/events, create timerfd if needed + * @c: Execution context + * @conn: Connection pointer + * + * #syscalls timerfd_create timerfd_settime + */ +static void tcp_timer_ctl(struct ctx *c, struct tcp_conn *conn) +{ + struct itimerspec it = { { 0 }, { 0 } }; + + if (conn->timer == -1) { + union epoll_ref ref = { .r.proto = IPPROTO_TCP, + .r.s = conn->sock, + .r.p.tcp.tcp.timer = 1, + .r.p.tcp.tcp.index = conn - tc }; + struct epoll_event ev = { .data.u64 = ref.u64, + .events = EPOLLIN | EPOLLET }; + + conn->timer = timerfd_create(CLOCK_MONOTONIC, 0); + if (conn->timer == -1) { + debug("TCP: failed to get timer: %s", strerror(errno)); + return; + } + + if (epoll_ctl(c->epollfd, EPOLL_CTL_ADD, conn->timer, &ev)) { + debug("TCP: failed to add timer: %s", strerror(errno)); + close(conn->timer); + conn->timer = -1; + return; + } + } + + if (conn->events == CLOSED) { + 
it.it_value.tv_nsec = 1; + } else if (conn->flags & ACK_TO_TAP_DUE) { + it.it_value.tv_nsec = (long)ACK_INTERVAL * 1000 * 1000; + } else if (conn->flags & ACK_FROM_TAP_DUE) { + if (!(conn->events & ESTABLISHED)) + it.it_value.tv_sec = SYN_TIMEOUT; + else if (conn->events & TAP_FIN_SENT) + it.it_value.tv_sec = FIN_TIMEOUT; + else + it.it_value.tv_sec = ACK_TIMEOUT; + } else if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED)) { + it.it_value.tv_sec = FIN_TIMEOUT; + } else { + it.it_value.tv_sec = ACT_TIMEOUT; + } + + debug("TCP: index %i, timer expires in %u.%03us", conn - tc, + it.it_value.tv_sec, it.it_value.tv_nsec / 1000 / 1000); + + timerfd_settime(conn->timer, 0, &it, NULL); +} + +/** + * conn_flag_do() - Set/unset given flag, log, update epoll on STALLED flag * @c: Execution context * @conn: Connection pointer * @flag: Flag to set, or ~flag to unset @@ -761,8 +810,11 @@ static void conn_flag_do(struct ctx *c, struct tcp_conn *conn, tcp_flag_str[fls(flag)]); } - if (flag == CONN_STALLED || flag == ~CONN_STALLED) + if (flag == STALLED || flag == ~STALLED) tcp_epoll_ctl(c, conn); + + if (flag == ACK_FROM_TAP_DUE || flag == ACK_TO_TAP_DUE) + tcp_timer_ctl(c, conn); } /** @@ -780,7 +832,7 @@ static void conn_event_do(struct ctx *c, struct tcp_conn *conn, return; prev = fls(conn->events); - if (conn->flags & CONN_ACTIVE_CLOSE) + if (conn->flags & ACTIVE_CLOSE) prev += 5; if ((conn->events & ESTABLISHED) && (conn->events != ESTABLISHED)) @@ -791,18 +843,13 @@ static void conn_event_do(struct ctx *c, struct tcp_conn *conn, else conn->events |= event; - if ((event == TAP_FIN_RCVD) && !(conn->events & SOCK_FIN_RCVD)) - conn_flag(c, conn, CONN_ACTIVE_CLOSE); - else - tcp_epoll_ctl(c, conn); - new = fls(conn->events); if ((conn->events & ESTABLISHED) && (conn->events != ESTABLISHED)) { num++; new++; } - if (conn->flags & CONN_ACTIVE_CLOSE) + if (conn->flags & ACTIVE_CLOSE) new += 5; if (prev != new) { @@ -814,6 +861,14 @@ static void conn_event_do(struct ctx *c, struct 
tcp_conn *conn, debug("TCP: index %i, %s", (conn) - tc, num == -1 ? "CLOSED" : tcp_event_str[num]); } + + if ((event == TAP_FIN_RCVD) && !(conn->events & SOCK_FIN_RCVD)) + conn_flag(c, conn, ACTIVE_CLOSE); + else + tcp_epoll_ctl(c, conn); + + if (event == CLOSED || CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED)) + tcp_timer_ctl(c, conn); } #define conn_event(c, conn, event) \ @@ -1388,13 +1443,12 @@ static void tcp_rst_do(struct ctx *c, struct tcp_conn *conn); * * Return: 0 on success, negative error code on failure (tap reset possible) */ -static int tcp_l2_buf_write_one(struct ctx *c, struct iovec *iov, - struct timespec *ts) +static int tcp_l2_buf_write_one(struct ctx *c, struct iovec *iov) { if (write(c->fd_tap, (char *)iov->iov_base + 4, iov->iov_len - 4) < 0) { debug("tap write: %s", strerror(errno)); if (errno != EAGAIN && errno != EWOULDBLOCK) - tap_handler(c, c->fd_tap, EPOLLERR, ts); + tap_handler(c, c->fd_tap, EPOLLERR, NULL); return -errno; } @@ -1431,11 +1485,9 @@ static void tcp_l2_buf_flush_part(struct ctx *c, struct msghdr *mh, size_t sent) * @mh: Message header pointing to buffers, msg_iovlen not set * @buf_used: Pointer to count of used buffers, set to 0 on return * @buf_bytes: Pointer to count of buffer bytes, set to 0 on return - * @ts: Current timestamp */ static void tcp_l2_buf_flush(struct ctx *c, struct msghdr *mh, - unsigned int *buf_used, size_t *buf_bytes, - struct timespec *ts) + unsigned int *buf_used, size_t *buf_bytes) { if (!(mh->msg_iovlen = *buf_used)) return; @@ -1450,7 +1502,7 @@ static void tcp_l2_buf_flush(struct ctx *c, struct msghdr *mh, for (i = 0; i < mh->msg_iovlen; i++) { struct iovec *iov = &mh->msg_iov[i]; - if (tcp_l2_buf_write_one(c, iov, ts)) + if (tcp_l2_buf_write_one(c, iov)) i--; } } @@ -1461,9 +1513,8 @@ static void tcp_l2_buf_flush(struct ctx *c, struct msghdr *mh, /** * tcp_l2_flags_buf_flush() - Send out buffers for segments with no data (flags) * @c: Execution context - * @ts: Current timestamp (not packet 
timestamp) */ -static void tcp_l2_flags_buf_flush(struct ctx *c, struct timespec *ts) +static void tcp_l2_flags_buf_flush(struct ctx *c) { struct msghdr mh = { 0 }; unsigned int *buf_used; @@ -1472,20 +1523,19 @@ static void tcp_l2_flags_buf_flush(struct ctx *c, struct timespec *ts) mh.msg_iov = tcp6_l2_flags_iov; buf_used = &tcp6_l2_flags_buf_used; buf_bytes = &tcp6_l2_flags_buf_bytes; - tcp_l2_buf_flush(c, &mh, buf_used, buf_bytes, ts); + tcp_l2_buf_flush(c, &mh, buf_used, buf_bytes); mh.msg_iov = tcp4_l2_flags_iov; buf_used = &tcp4_l2_flags_buf_used; buf_bytes = &tcp4_l2_flags_buf_bytes; - tcp_l2_buf_flush(c, &mh, buf_used, buf_bytes, ts); + tcp_l2_buf_flush(c, &mh, buf_used, buf_bytes); } /** * tcp_l2_data_buf_flush() - Send out buffers for segments with data * @c: Execution context - * @ts: Current timestamp (not packet timestamp) */ -static void tcp_l2_data_buf_flush(struct ctx *c, struct timespec *ts) +static void tcp_l2_data_buf_flush(struct ctx *c) { struct msghdr mh = { 0 }; unsigned int *buf_used; @@ -1494,23 +1544,22 @@ static void tcp_l2_data_buf_flush(struct ctx *c, struct timespec *ts) mh.msg_iov = tcp6_l2_iov; buf_used = &tcp6_l2_buf_used; buf_bytes = &tcp6_l2_buf_bytes; - tcp_l2_buf_flush(c, &mh, buf_used, buf_bytes, ts); + tcp_l2_buf_flush(c, &mh, buf_used, buf_bytes); mh.msg_iov = tcp4_l2_iov; buf_used = &tcp4_l2_buf_used; buf_bytes = &tcp4_l2_buf_bytes; - tcp_l2_buf_flush(c, &mh, buf_used, buf_bytes, ts); + tcp_l2_buf_flush(c, &mh, buf_used, buf_bytes); } /** * tcp_defer_handler() - Handler for TCP deferred tasks * @c: Execution context - * @now: Current timestamp */ -void tcp_defer_handler(struct ctx *c, struct timespec *now) +void tcp_defer_handler(struct ctx *c) { - tcp_l2_flags_buf_flush(c, now); - tcp_l2_data_buf_flush(c, now); + tcp_l2_flags_buf_flush(c); + tcp_l2_data_buf_flush(c); } /** @@ -1627,7 +1676,7 @@ static int tcp_update_seqack_wnd(struct ctx *c, struct tcp_conn *conn, conn->seq_ack_to_tap = prev_ack_to_tap; #else if ((unsigned 
long)conn->snd_buf < SNDBUF_SMALL || tcp_rtt_dst_low(conn) - || CONN_IS_CLOSING(conn) || conn->flags & CONN_LOCAL || force_seq) { + || CONN_IS_CLOSING(conn) || conn->flags & LOCAL || force_seq) { conn->seq_ack_to_tap = conn->seq_from_tap; } else if (conn->seq_ack_to_tap != conn->seq_from_tap) { if (!tinfo) { @@ -1660,7 +1709,7 @@ static int tcp_update_seqack_wnd(struct ctx *c, struct tcp_conn *conn, } #ifdef HAS_SND_WND - if ((conn->flags & CONN_LOCAL) || tcp_rtt_dst_low(conn)) { + if ((conn->flags & LOCAL) || tcp_rtt_dst_low(conn)) { conn->wnd_to_tap = tinfo->tcpi_snd_wnd; } else { tcp_get_sndbuf(conn); @@ -1670,6 +1719,8 @@ static int tcp_update_seqack_wnd(struct ctx *c, struct tcp_conn *conn, conn->wnd_to_tap = MIN(conn->wnd_to_tap, MAX_WINDOW); + if (!conn->wnd_to_tap) + conn_flag(c, conn, ACK_TO_TAP_DUE); out: return conn->wnd_to_tap != prev_wnd_to_tap || conn->seq_ack_to_tap != prev_ack_to_tap; @@ -1680,12 +1731,10 @@ out: * @c: Execution context * @conn: Connection pointer * @flags: TCP flags: if not set, send segment only if ACK is due - * @now: Current timestamp * * Return: negative error code on connection reset, 0 otherwise */ -static int tcp_send_flag(struct ctx *c, struct tcp_conn *conn, int flags, - struct timespec *now) +static int tcp_send_flag(struct ctx *c, struct tcp_conn *conn, int flags) { uint32_t prev_ack_to_tap = conn->seq_ack_to_tap; uint32_t prev_wnd_to_tap = conn->wnd_to_tap; @@ -1709,7 +1758,7 @@ static int tcp_send_flag(struct ctx *c, struct tcp_conn *conn, int flags, return -ECONNRESET; } - if (!(conn->flags & CONN_LOCAL)) + if (!(conn->flags & LOCAL)) tcp_rtt_dst_check(conn, &tinfo); if (!tcp_update_seqack_wnd(c, conn, flags, &tinfo) && !flags) @@ -1748,8 +1797,7 @@ static int tcp_send_flag(struct ctx *c, struct tcp_conn *conn, int flags, mss -= sizeof(struct ipv6hdr); if (c->low_wmem && - !(conn->flags & CONN_LOCAL) && - !tcp_rtt_dst_low(conn)) + !(conn->flags & LOCAL) && !tcp_rtt_dst_low(conn)) mss = MIN(mss, PAGE_SIZE); else if 
(mss > PAGE_SIZE) mss = ROUND_DOWN(mss, PAGE_SIZE); @@ -1795,11 +1843,11 @@ static int tcp_send_flag(struct ctx *c, struct tcp_conn *conn, int flags, else tcp6_l2_flags_buf_bytes += iov->iov_len; - if (th->ack && now) - conn->ts_ack_to_tap = *now; + if (th->ack) + conn_flag(c, conn, ~ACK_TO_TAP_DUE); - if (th->fin && now) - conn->tap_data_noack = *now; + if (th->fin) + conn_flag(c, conn, ACK_FROM_TAP_DUE); /* RFC 793, 3.1: "[...] and the first data octet is ISN+1." */ if (th->fin || th->syn) @@ -1814,7 +1862,7 @@ static int tcp_send_flag(struct ctx *c, struct tcp_conn *conn, int flags, } if (tcp4_l2_flags_buf_used > ARRAY_SIZE(tcp4_l2_flags_buf) - 2) - tcp_l2_flags_buf_flush(c, now); + tcp_l2_flags_buf_flush(c); } else { if (flags & DUP_ACK) { memcpy(b6 + 1, b6, sizeof(*b6)); @@ -1824,7 +1872,7 @@ static int tcp_send_flag(struct ctx *c, struct tcp_conn *conn, int flags, } if (tcp6_l2_flags_buf_used > ARRAY_SIZE(tcp6_l2_flags_buf) - 2) - tcp_l2_flags_buf_flush(c, now); + tcp_l2_flags_buf_flush(c); } return 0; @@ -1840,7 +1888,7 @@ static void tcp_rst_do(struct ctx *c, struct tcp_conn *conn) if (CONN_IS_CLOSED(conn)) return; - if (!tcp_send_flag(c, conn, RST, NULL)) + if (!tcp_send_flag(c, conn, RST)) tcp_conn_destroy(c, conn); } @@ -1874,7 +1922,7 @@ static void tcp_clamp_window(struct ctx *c, struct tcp_conn *conn, window = MIN(MAX_WINDOW, window); - if (conn->flags & CONN_WND_CLAMPED) { + if (conn->flags & WND_CLAMPED) { if (conn->wnd_from_tap == window) return; @@ -1893,7 +1941,7 @@ static void tcp_clamp_window(struct ctx *c, struct tcp_conn *conn, window = 256; setsockopt(conn->sock, SOL_TCP, TCP_WINDOW_CLAMP, &window, sizeof(window)); - conn_flag(c, conn, CONN_WND_CLAMPED); + conn_flag(c, conn, WND_CLAMPED); } } @@ -2070,6 +2118,7 @@ static void tcp_conn_from_tap(struct ctx *c, int af, void *addr, conn = CONN(c->tcp.conn_count++); conn->sock = s; + conn->timer = -1; conn_event(c, conn, TAP_SYN_RCVD); conn->wnd_to_tap = WINDOW_DEFAULT; @@ -2098,9 +2147,6 @@ 
static void tcp_conn_from_tap(struct ctx *c, int af, void *addr, conn->sock_port = ntohs(th->dest); conn->tap_port = ntohs(th->source); - conn->ts_sock_act = conn->ts_tap_act = *now; - conn->ts_ack_to_tap = conn->ts_ack_from_tap = *now; - conn->seq_init_from_tap = ntohl(th->seq); conn->seq_from_tap = conn->seq_init_from_tap + 1; conn->seq_ack_to_tap = conn->seq_from_tap; @@ -2111,10 +2157,12 @@ static void tcp_conn_from_tap(struct ctx *c, int af, void *addr, tcp_hash_insert(c, conn, af, addr); - if (!bind(s, sa, sl)) + if (!bind(s, sa, sl)) { tcp_rst(c, conn); /* Nobody is listening then */ - if (errno != EADDRNOTAVAIL) - conn_flag(c, conn, CONN_LOCAL); + return; + } + if (errno != EADDRNOTAVAIL && errno != EACCES) + conn_flag(c, conn, LOCAL); if (connect(s, sa, sl)) { if (errno != EINPROGRESS) { @@ -2126,7 +2174,7 @@ static void tcp_conn_from_tap(struct ctx *c, int af, void *addr, } else { tcp_get_sndbuf(conn); - if (tcp_send_flag(c, conn, SYN | ACK, now)) + if (tcp_send_flag(c, conn, SYN | ACK)) return; conn_event(c, conn, TAP_SYN_ACK_SENT); @@ -2169,7 +2217,7 @@ static int tcp_sock_consume(struct tcp_conn *conn, uint32_t ack_seq) * @now: Current timestamp */ static void tcp_data_to_tap(struct ctx *c, struct tcp_conn *conn, ssize_t plen, - int no_csum, uint32_t seq, struct timespec *now) + int no_csum, uint32_t seq) { struct iovec *iov; size_t len; @@ -2183,7 +2231,7 @@ static void tcp_data_to_tap(struct ctx *c, struct tcp_conn *conn, ssize_t plen, iov = tcp4_l2_iov + tcp4_l2_buf_used++; tcp4_l2_buf_bytes += iov->iov_len = len + sizeof(b->vnet_len); if (tcp4_l2_buf_used > ARRAY_SIZE(tcp4_l2_buf) - 1) - tcp_l2_data_buf_flush(c, now); + tcp_l2_data_buf_flush(c); } else if (CONN_V6(conn)) { struct tcp6_l2_buf_t *b = &tcp6_l2_buf[tcp6_l2_buf_used]; @@ -2192,7 +2240,7 @@ static void tcp_data_to_tap(struct ctx *c, struct tcp_conn *conn, ssize_t plen, iov = tcp6_l2_iov + tcp6_l2_buf_used++; tcp6_l2_buf_bytes += iov->iov_len = len + sizeof(b->vnet_len); if 
(tcp6_l2_buf_used > ARRAY_SIZE(tcp6_l2_buf) - 1) - tcp_l2_data_buf_flush(c, now); + tcp_l2_data_buf_flush(c); } } @@ -2200,14 +2248,12 @@ static void tcp_data_to_tap(struct ctx *c, struct tcp_conn *conn, ssize_t plen, * tcp_data_from_sock() - Handle new data from socket, queue to tap, in window * @c: Execution context * @conn: Connection pointer - * @now: Current timestamp * * Return: negative on connection reset, 0 otherwise * * #syscalls recvmsg */ -static int tcp_data_from_sock(struct ctx *c, struct tcp_conn *conn, - struct timespec *now) +static int tcp_data_from_sock(struct ctx *c, struct tcp_conn *conn) { int fill_bufs, send_bufs = 0, last_len, iov_rem = 0; int sendlen, len, plen, v4 = CONN_V4(conn); @@ -2225,8 +2271,8 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_conn *conn, } if (!conn->wnd_from_tap || already_sent >= conn->wnd_from_tap) { - conn_flag(c, conn, CONN_STALLED); - conn->tap_data_noack = *now; + conn_flag(c, conn, STALLED); + conn_flag(c, conn, ACK_FROM_TAP_DUE); return 0; } @@ -2248,7 +2294,7 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_conn *conn, if (( v4 && tcp4_l2_buf_used + fill_bufs > ARRAY_SIZE(tcp4_l2_buf)) || (!v4 && tcp6_l2_buf_used + fill_bufs > ARRAY_SIZE(tcp6_l2_buf))) - tcp_l2_data_buf_flush(c, now); + tcp_l2_data_buf_flush(c); for (i = 0, iov = iov_sock + 1; i < fill_bufs; i++, iov++) { if (v4) @@ -2274,11 +2320,11 @@ recvmsg: sendlen = len - already_sent; if (sendlen <= 0) { - conn_flag(c, conn, CONN_STALLED); + conn_flag(c, conn, STALLED); return 0; } - conn_flag(c, conn, ~CONN_STALLED); + conn_flag(c, conn, ~STALLED); send_bufs = DIV_ROUND_UP(sendlen, conn->tap_mss); last_len = sendlen - (send_bufs - 1) * conn->tap_mss; @@ -2294,11 +2340,11 @@ recvmsg: if (i == send_bufs - 1) plen = last_len; - tcp_data_to_tap(c, conn, plen, no_csum, conn->seq_to_tap, now); + tcp_data_to_tap(c, conn, plen, no_csum, conn->seq_to_tap); conn->seq_to_tap += plen; } - conn->tap_data_noack = conn->ts_ack_to_tap = *now; + 
conn_flag(c, conn, ACK_FROM_TAP_DUE); return 0; @@ -2312,7 +2358,7 @@ err: zero_len: if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) == SOCK_FIN_RCVD) { - if ((ret = tcp_send_flag(c, conn, FIN | ACK, now))) { + if ((ret = tcp_send_flag(c, conn, FIN | ACK))) { tcp_rst(c, conn); return ret; } @@ -2329,13 +2375,11 @@ zero_len: * @conn: Connection pointer * @msg: Array of messages from tap * @count: Count of messages - * @now: Current timestamp * * #syscalls sendmsg */ static void tcp_data_from_tap(struct ctx *c, struct tcp_conn *conn, - struct tap_l4_msg *msg, int count, - struct timespec *now) + struct tap_l4_msg *msg, int count) { int i, iov_i, ack = 0, fin = 0, retr = 0, keep = -1; uint32_t max_ack_seq = conn->seq_ack_from_tap; @@ -2445,16 +2489,18 @@ static void tcp_data_from_tap(struct ctx *c, struct tcp_conn *conn, tcp_clamp_window(c, conn, NULL, 0, max_ack_seq_wnd, 0); if (ack) { - conn->ts_ack_from_tap = *now; - if (max_ack_seq == conn->seq_to_tap) - conn->tap_data_noack = ((struct timespec) { 0, 0 }); + if (max_ack_seq == conn->seq_to_tap) { + conn_flag(c, conn, ~ACK_FROM_TAP_DUE); + conn->retrans = 0; + } + tcp_sock_consume(conn, max_ack_seq); } if (retr) { conn->seq_ack_from_tap = max_ack_seq; conn->seq_to_tap = max_ack_seq; - tcp_data_from_sock(c, conn, now); + tcp_data_from_sock(c, conn); } if (!iov_i) @@ -2470,14 +2516,14 @@ eintr: * Then swiftly looked away and left. 
*/ conn->seq_from_tap = seq_from_tap; - tcp_send_flag(c, conn, ACK, now); + tcp_send_flag(c, conn, ACK); } if (errno == EINTR) goto eintr; if (errno == EAGAIN || errno == EWOULDBLOCK) { - tcp_send_flag(c, conn, ACK_IF_NEEDED, now); + tcp_send_flag(c, conn, ACK_IF_NEEDED); return; } tcp_rst(c, conn); @@ -2487,7 +2533,7 @@ eintr: if (n < (int)(seq_from_tap - conn->seq_from_tap)) { partial_send = 1; conn->seq_from_tap += n; - tcp_send_flag(c, conn, ACK_IF_NEEDED, now); + tcp_send_flag(c, conn, ACK_IF_NEEDED); } else { conn->seq_from_tap += n; } @@ -2496,7 +2542,7 @@ out: if (keep != -1) { if (conn->seq_dup_ack != conn->seq_from_tap) { conn->seq_dup_ack = conn->seq_from_tap; - tcp_send_flag(c, conn, DUP_ACK, now); + tcp_send_flag(c, conn, DUP_ACK); } return; } @@ -2510,7 +2556,7 @@ out: conn_event(c, conn, TAP_FIN_RCVD); } else { - tcp_send_flag(c, conn, ACK_IF_NEEDED, now); + tcp_send_flag(c, conn, ACK_IF_NEEDED); } } @@ -2520,11 +2566,9 @@ out: * @conn: Connection pointer * @th: TCP header of SYN, ACK segment from tap/guest * @len: Packet length of SYN, ACK segment at L4, host order - * @now: Current timestamp */ static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_conn *conn, - struct tcphdr *th, size_t len, - struct timespec *now) + struct tcphdr *th, size_t len) { tcp_clamp_window(c, conn, th, len, 0, 1); conn->tap_mss = tcp_conn_tap_mss(c, conn, th, len); @@ -2538,8 +2582,8 @@ static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_conn *conn, /* The client might have sent data already, which we didn't * dequeue waiting for SYN,ACK from tap -- check now. 
*/ - tcp_data_from_sock(c, conn, now); - tcp_send_flag(c, conn, ACK_IF_NEEDED, now); + tcp_data_from_sock(c, conn); + tcp_send_flag(c, conn, ACK_IF_NEEDED); } /** @@ -2559,6 +2603,7 @@ int tcp_tap_handler(struct ctx *c, int af, void *addr, struct tcphdr *th = (struct tcphdr *)(pkt_buf + msg[0].pkt_buf_offset); uint16_t len = msg[0].l4_len; struct tcp_conn *conn; + int ack_due = 0; conn = tcp_hash_lookup(c, af, addr, htons(th->source), htons(th->dest)); @@ -2574,13 +2619,17 @@ int tcp_tap_handler(struct ctx *c, int af, void *addr, return count; } - conn->ts_tap_act = *now; - conn_flag(c, conn, ~CONN_STALLED); + if (th->ack) { + conn_flag(c, conn, ~ACK_FROM_TAP_DUE); + conn->retrans = 0; + } + + conn_flag(c, conn, ~STALLED); /* Establishing connection from socket */ if (conn->events & SOCK_ACCEPTED) { if (th->syn && th->ack && !th->fin) - tcp_conn_from_sock_finish(c, conn, th, len, now); + tcp_conn_from_sock_finish(c, conn, th, len); else tcp_rst(c, conn); @@ -2600,7 +2649,7 @@ int tcp_tap_handler(struct ctx *c, int af, void *addr, conn->seq_from_tap++; shutdown(conn->sock, SHUT_WR); - tcp_send_flag(c, conn, ACK, now); + tcp_send_flag(c, conn, ACK); conn_event(c, conn, SOCK_FIN_SENT); return count; @@ -2621,11 +2670,6 @@ int tcp_tap_handler(struct ctx *c, int af, void *addr, /* Established connections not accepting data from tap */ if (conn->events & TAP_FIN_RCVD) { - if (th->ack) { - conn->tap_data_noack = ((struct timespec) { 0, 0 }); - conn->ts_ack_from_tap = *now; - } - if (conn->events & SOCK_FIN_RCVD && conn->seq_ack_from_tap == conn->seq_to_tap) tcp_conn_destroy(c, conn); @@ -2634,14 +2678,20 @@ int tcp_tap_handler(struct ctx *c, int af, void *addr, } /* Established connections accepting data from tap */ - tcp_data_from_tap(c, conn, msg, count, now); + tcp_data_from_tap(c, conn, msg, count); + if (conn->seq_ack_to_tap != conn->seq_from_tap) + ack_due = 1; if ((conn->events & TAP_FIN_RCVD) && !(conn->events & SOCK_FIN_SENT)) { shutdown(conn->sock, SHUT_WR); 
conn_event(c, conn, SOCK_FIN_SENT); - tcp_send_flag(c, conn, ACK, now); + tcp_send_flag(c, conn, ACK); + ack_due = 0; } + if (ack_due) + conn_flag(c, conn, ACK_TO_TAP_DUE); + return count; } @@ -2649,10 +2699,8 @@ int tcp_tap_handler(struct ctx *c, int af, void *addr, * tcp_connect_finish() - Handle completion of connect() from EPOLLOUT event * @c: Execution context * @conn: Connection pointer - * @now: Current timestamp */ -static void tcp_connect_finish(struct ctx *c, struct tcp_conn *conn, - struct timespec *now) +static void tcp_connect_finish(struct ctx *c, struct tcp_conn *conn) { socklen_t sl; int so; @@ -2663,10 +2711,11 @@ static void tcp_connect_finish(struct ctx *c, struct tcp_conn *conn, return; } - if (tcp_send_flag(c, conn, SYN | ACK, now)) + if (tcp_send_flag(c, conn, SYN | ACK)) return; conn_event(c, conn, TAP_SYN_ACK_SENT); + conn_flag(c, conn, ACK_FROM_TAP_DUE); } /** @@ -2693,7 +2742,7 @@ static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref, conn = CONN(c->tcp.conn_count++); conn->sock = s; - + conn->timer = -1; conn_event(c, conn, SOCK_ACCEPTED); if (ref.r.p.tcp.tcp.v6) { @@ -2759,16 +2808,70 @@ static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref, conn->wnd_from_tap = WINDOW_DEFAULT; - conn->ts_sock_act = conn->ts_tap_act = *now; - conn->ts_ack_from_tap = conn->ts_ack_to_tap = *now; - - tcp_send_flag(c, conn, SYN, now); + tcp_send_flag(c, conn, SYN); + conn_flag(c, conn, ACK_FROM_TAP_DUE); tcp_get_sndbuf(conn); } /** - * tcp_sock_handler() - Handle new data from socket + * tcp_timer_handler() - timerfd events: close, send ACK, retransmit, or reset + * @c: Execution context + * @ref: epoll reference of timer (not connection) + */ +static void tcp_timer_handler(struct ctx *c, union epoll_ref ref) +{ + struct tcp_conn *conn = CONN(ref.r.p.tcp.tcp.index); + struct epoll_event ev = { 0 }; + + if (CONN_IS_CLOSED(conn)) { + tcp_hash_remove(conn); + tcp_table_compact(c, conn); + if (conn->timer != -1) { + epoll_ctl(c->epollfd, 
EPOLL_CTL_DEL, conn->timer, &ev); + close(conn->timer); + conn->timer = -1; + } + } else if (conn->flags & ACK_TO_TAP_DUE) { + tcp_send_flag(c, conn, ACK_IF_NEEDED); + conn_flag(c, conn, ~ACK_TO_TAP_DUE); + } else if (conn->flags & ACK_FROM_TAP_DUE) { + if (!(conn->events & ESTABLISHED)) { + debug("TCP: index %i, handshake timeout", conn - tc); + tcp_rst(c, conn); + } else if (conn->events & TAP_FIN_SENT) { + debug("TCP: index %i, FIN timeout", conn - tc); + tcp_rst(c, conn); + } else if (conn->retrans == TCP_MAX_RETRANS) { + debug("TCP: index %i, maximum retransmissions exceeded", + conn - tc); + tcp_rst(c, conn); + } else { + debug("TCP: index %i, ACK timeout, retry", conn - tc); + conn->retrans++; + conn->seq_to_tap = conn->seq_ack_from_tap; + tcp_data_from_sock(c, conn); + } + } else { + struct itimerspec new = { { 0 }, { ACT_TIMEOUT, 0 } }; + struct itimerspec old = { { 0 }, { 0 } }; + + /* Activity timeout: if it was already set, reset the + * connection, otherwise, it was a left-over from ACK_TO_TAP_DUE + * or ACK_FROM_TAP_DUE, so just set the long timeout in that + * case. This avoids having to preemptively reset the timer on + * ~ACK_TO_TAP_DUE or ~ACK_FROM_TAP_DUE. 
+ */ + timerfd_settime(conn->timer, 0, &new, &old); + if (old.it_value.tv_sec == ACT_TIMEOUT) { + debug("TCP: index %i, activity timeout", conn - tc); + tcp_rst(c, conn); + } + } +} + +/** + * tcp_sock_handler() - Handle new data from socket, or timerfd event * @c: Execution context * @ref: epoll reference * @events: epoll events bitmap @@ -2779,6 +2882,11 @@ void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events, { struct tcp_conn *conn; + if (ref.r.p.tcp.tcp.timer) { + tcp_timer_handler(c, ref); + return; + } + if (ref.r.p.tcp.tcp.splice) { tcp_sock_handler_splice(c, ref, events); return; @@ -2792,8 +2900,6 @@ void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events, if (!(conn = CONN(ref.r.p.tcp.tcp.index))) return; - conn->ts_sock_act = *now; - if (events & EPOLLERR) { tcp_rst(c, conn); return; @@ -2812,7 +2918,7 @@ void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events, conn_event(c, conn, SOCK_FIN_RCVD); if (events & EPOLLIN) - tcp_data_from_sock(c, conn, now); + tcp_data_from_sock(c, conn); if (events & EPOLLOUT) tcp_update_seqack_wnd(c, conn, 0, NULL); @@ -2832,7 +2938,7 @@ void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events, if (conn->events == TAP_SYN_RCVD) { if (events & EPOLLOUT) - tcp_connect_finish(c, conn, now); + tcp_connect_finish(c, conn); /* Data? 
Check later */ } } @@ -2981,9 +3087,9 @@ static int tcp_sock_refill(void *arg) } for (i = 0; a->c->v4 && i < TCP_SOCK_POOL_SIZE; i++, p4++) { - if (*p4 >= 0) { + if (*p4 >= 0) break; - } + *p4 = socket(AF_INET, SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP); if (*p4 > SOCKET_MAX) { close(*p4); @@ -2995,9 +3101,9 @@ static int tcp_sock_refill(void *arg) } for (i = 0; a->c->v6 && i < TCP_SOCK_POOL_SIZE; i++, p6++) { - if (*p6 >= 0) { + if (*p6 >= 0) break; - } + *p6 = socket(AF_INET6, SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP); if (*p6 > SOCKET_MAX) { @@ -3091,72 +3197,6 @@ int tcp_sock_init(struct ctx *c, struct timespec *now) return 0; } -/** - * tcp_timer_one() - Handler for timed events on one socket - * @c: Execution context - * @conn: Connection pointer - * @ts: Timestamp from caller - */ -static void tcp_timer_one(struct ctx *c, struct tcp_conn *conn, - struct timespec *ts) -{ - int ack_from_tap = timespec_diff_ms(ts, &conn->ts_ack_from_tap); - int ack_to_tap = timespec_diff_ms(ts, &conn->ts_ack_to_tap); - int sock_act = timespec_diff_ms(ts, &conn->ts_sock_act); - int tap_act = timespec_diff_ms(ts, &conn->ts_tap_act); - int tap_data_noack; - - if (!memcmp(&conn->tap_data_noack, &((struct timespec){ 0, 0 }), - sizeof(struct timespec))) - tap_data_noack = 0; - else - tap_data_noack = timespec_diff_ms(ts, &conn->tap_data_noack); - - if (CONN_IS_CLOSED(conn)) { - tcp_hash_remove(conn); - tcp_table_compact(c, conn); - return; - } - - if (!(conn->events & ESTABLISHED)) { - if (ack_from_tap > SYN_TIMEOUT) - tcp_rst(c, conn); - return; - } - - if (tap_act > ACT_TIMEOUT && sock_act > ACT_TIMEOUT) - goto rst; - - if (!conn->wnd_to_tap || ack_to_tap > ACK_INTERVAL) - tcp_send_flag(c, conn, ACK_IF_NEEDED, ts); - - if (tap_data_noack > ACK_TIMEOUT) { - if (conn->seq_ack_from_tap < conn->seq_to_tap) { - if (tap_data_noack > LAST_ACK_TIMEOUT) - goto rst; - - conn->seq_to_tap = conn->seq_ack_from_tap; - tcp_data_from_sock(c, conn, ts); - } - return; - } - - if (conn->events & 
TAP_FIN_SENT && tap_data_noack > FIN_TIMEOUT) - goto rst; - - if (conn->events & SOCK_FIN_SENT && sock_act > FIN_TIMEOUT) - goto rst; - - if (conn->events & SOCK_FIN_SENT && conn->events & SOCK_FIN_RCVD) { - if (sock_act > LAST_ACK_TIMEOUT || tap_act > LAST_ACK_TIMEOUT) - goto rst; - } - - return; -rst: - tcp_rst(c, conn); -} - /** * struct tcp_port_detect_arg - Arguments for tcp_port_detect() * @c: Execution context @@ -3281,7 +3321,6 @@ static int tcp_port_rebind(void *arg) void tcp_timer(struct ctx *c, struct timespec *now) { struct tcp_sock_refill_arg refill_arg = { c, 0 }; - int i; if (c->mode == MODE_PASTA) { if (timespec_diff_ms(now, &c->tcp.port_detect_ts) > @@ -3318,7 +3357,4 @@ void tcp_timer(struct ctx *c, struct timespec *now) NS_CALL(tcp_sock_refill, &refill_arg); } } - - for (i = c->tcp.conn_count - 1; i >= 0; i--) - tcp_timer_one(c, CONN(i), now); } diff --git a/tcp.h b/tcp.h index b4e3fde..3154b4b 100644 --- a/tcp.h +++ b/tcp.h @@ -6,7 +6,9 @@ #ifndef TCP_H #define TCP_H -#define TCP_TIMER_INTERVAL 20 /* ms */ +#define REFILL_INTERVAL 1000 /* ms */ +#define PORT_DETECT_INTERVAL 1000 +#define TCP_TIMER_INTERVAL MIN(REFILL_INTERVAL, PORT_DETECT_INTERVAL) #define TCP_MAX_CONNS (128 * 1024) #define TCP_MAX_SOCKS (TCP_MAX_CONNS + USHRT_MAX * 2) @@ -21,7 +23,7 @@ int tcp_tap_handler(struct ctx *c, int af, void *addr, struct tap_l4_msg *msg, int count, struct timespec *now); int tcp_sock_init(struct ctx *c, struct timespec *now); void tcp_timer(struct ctx *c, struct timespec *now); -void tcp_defer_handler(struct ctx *c, struct timespec *now); +void tcp_defer_handler(struct ctx *c); void tcp_sock_set_bufsize(struct ctx *c, int s); void tcp_update_l2_buf(unsigned char *eth_d, unsigned char *eth_s, @@ -34,6 +36,7 @@ void tcp_remap_to_init(in_port_t port, in_port_t delta); * @listen: Set if this file descriptor is a listening socket * @splice: Set if descriptor is associated to a spliced connection * @v6: Set for IPv6 sockets or connections + * @timer: 
Reference is a timerfd descriptor for connection * @index: Index of connection in table, or port for bound sockets * @u32: Opaque u32 value of reference */ @@ -42,6 +45,7 @@ union tcp_epoll_ref { uint32_t listen:1, splice:1, v6:1, + timer:1, index:20; } tcp; uint32_t u32; -- 2.35.1
We can't take for granted that the hard limit for open files is big enough to allow delaying the closing of sockets to a timer. Store the value of RLIMIT_NOFILE we set at start, and use it to understand if we're approaching the limit with pending, spliced TCP connections. If that's the case, close sockets right away as soon as they're not needed, instead of deferring this task to a timer. Signed-off-by: Stefano Brivio <sbrivio(a)redhat.com> --- passt.c | 2 +- passt.h | 2 ++ tcp.c | 1 + tcp_splice.c | 28 ++++++++++++++++++++++------ tcp_splice.h | 1 + 5 files changed, 27 insertions(+), 7 deletions(-) diff --git a/passt.c b/passt.c index 6550a22..292cf53 100644 --- a/passt.c +++ b/passt.c @@ -371,7 +371,7 @@ int main(int argc, char **argv) perror("getrlimit"); exit(EXIT_FAILURE); } - limit.rlim_cur = limit.rlim_max; + c.nofile = limit.rlim_cur = limit.rlim_max; if (setrlimit(RLIMIT_NOFILE, &limit)) { perror("setrlimit"); exit(EXIT_FAILURE); } diff --git a/passt.h b/passt.h index 3a62b15..9ea8f8d 100644 --- a/passt.h +++ b/passt.h @@ -98,6 +98,7 @@ enum passt_modes { * @quiet: Don't print informational messages * @foreground: Run in foreground, don't log to stderr by default * @stderr: Force logging to stderr + * @nofile: Maximum number of open files (ulimit -n) * @sock_path: Path for UNIX domain socket * @pcap: Path for packet capture file * @pid_file: Path to PID file, empty string if not configured @@ -160,6 +161,7 @@ struct ctx { int quiet; int foreground; int stderr; + int nofile; char sock_path[UNIX_PATH_MAX]; char pcap[PATH_MAX]; char pid_file[PATH_MAX]; diff --git a/tcp.c b/tcp.c index 384e7a6..2a5bf6e 100644 --- a/tcp.c +++ b/tcp.c @@ -1560,6 +1560,7 @@ void tcp_defer_handler(struct ctx *c) { tcp_l2_flags_buf_flush(c); tcp_l2_data_buf_flush(c); + tcp_splice_defer_handler(c); } /** diff --git a/tcp_splice.c b/tcp_splice.c index d374785..b7bdfc2 100644 --- a/tcp_splice.c +++ b/tcp_splice.c @@ -52,6 +52,7 @@ #define TCP_SPLICE_MAX_CONNS (128 * 1024) #define
TCP_SPLICE_PIPE_POOL_SIZE 16 #define REFILL_INTERVAL 1000 /* ms, refill pool of pipes */ +#define TCP_SPLICE_FILE_PRESSURE 30 /* % of c->nofile */ /* From tcp.c */ extern int init_sock_pool4 [TCP_SOCK_POOL_SIZE]; @@ -152,6 +153,7 @@ static void tcp_splice_conn_epoll_events(uint16_t events, *b |= (events & SPLICE_B_OUT_WAIT) ? EPOLLOUT : 0; } +static void tcp_splice_destroy(struct ctx *c, struct tcp_splice_conn *conn); static int tcp_splice_epoll_ctl(struct ctx *c, struct tcp_splice_conn *conn); /** @@ -832,13 +834,9 @@ void tcp_splice_init(struct ctx *c) */ void tcp_splice_timer(struct ctx *c, struct timespec *now) { - int i; - - for (i = c->tcp.splice_conn_count - 1; i >= 0; i--) { - struct tcp_splice_conn *conn; - - conn = CONN(i); + struct tcp_splice_conn *conn; + for (conn = CONN(c->tcp.splice_conn_count - 1); conn >= tc; conn--) { if (conn->flags & SPLICE_CLOSING) { tcp_splice_destroy(c, conn); continue; @@ -865,3 +863,21 @@ void tcp_splice_timer(struct ctx *c, struct timespec *now) if (timespec_diff_ms(now, &c->tcp.refill_ts) > REFILL_INTERVAL) tcp_splice_pipe_refill(c); } + +/** + * tcp_splice_defer_handler() - Close connections without timer on file pressure + * @c: Execution context + */ +void tcp_splice_defer_handler(struct ctx *c) +{ + int max_files = c->nofile / 100 * TCP_SPLICE_FILE_PRESSURE; + struct tcp_splice_conn *conn; + + if (c->tcp.splice_conn_count * 6 < max_files) + return; + + for (conn = CONN(c->tcp.splice_conn_count - 1); conn >= tc; conn--) { + if (conn->flags & SPLICE_CLOSING) + tcp_splice_destroy(c, conn); + } +} diff --git a/tcp_splice.h b/tcp_splice.h index 45ab1ca..b744ba7 100644 --- a/tcp_splice.h +++ b/tcp_splice.h @@ -12,3 +12,4 @@ void tcp_sock_handler_splice(struct ctx *c, union epoll_ref ref, void tcp_splice_destroy(struct ctx *c, struct tcp_splice_conn *conn); void tcp_splice_init(struct ctx *c); void tcp_splice_timer(struct ctx *c, struct timespec *now); +void tcp_splice_defer_handler(struct ctx *c); -- 2.35.1
I didn't have time to investigate the root cause for the virtio_net TX hang yet. Add a quick work-around for the time being. Signed-off-by: Stefano Brivio <sbrivio(a)redhat.com> --- test/perf/passt_tcp | 15 +++++++++++++++ test/perf/passt_udp | 15 +++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/test/perf/passt_tcp b/test/perf/passt_tcp index 99d8fb1..f13fdfc 100644 --- a/test/perf/passt_tcp +++ b/test/perf/passt_tcp @@ -16,6 +16,11 @@ gtools sysctl ip jq nproc seq sleep bc iperf3 tcp_rr tcp_crr # From neper nstools sysctl ip jq nproc seq sleep bc iperf3 tcp_rr tcp_crr htools bc head sed seq +# In this setup, virtio_net TX queue sometimes hangs, still under investigation +def virtio_net_workaround +guest modprobe -r virtio_net; modprobe virtio_net napi_tx=1; dhclient; dhclient -6; sleep 3 +endef + test passt: throughput and latency guest /sbin/sysctl -w net.core.rmem_max=536870912 @@ -52,18 +57,22 @@ tr TCP throughput over IPv6: guest to host bw - bw - +virtio_net_workaround guest ip link set dev __IFNAME__ mtu 1280 iperf3c guest __GW6__%__IFNAME__ 100${i}2 __THREADS__ __OPTS__ -w 4M iperf3s BW ns 100${i}2 __THREADS__ bw __BW__ 1.2 1.5 +virtio_net_workaround guest ip link set dev __IFNAME__ mtu 1500 iperf3c guest __GW6__%__IFNAME__ 100${i}2 __THREADS__ __OPTS__ -w 4M iperf3s BW ns 100${i}2 __THREADS__ bw __BW__ 1.6 1.8 +virtio_net_workaround guest ip link set dev __IFNAME__ mtu 9000 iperf3c guest __GW6__%__IFNAME__ 100${i}2 __THREADS__ __OPTS__ -w 8M iperf3s BW ns 100${i}2 __THREADS__ bw __BW__ 4.0 5.0 +virtio_net_workaround guest ip link set dev __IFNAME__ mtu 65520 iperf3c guest __GW6__%__IFNAME__ 100${i}2 __THREADS__ __OPTS__ -w 16M iperf3s BW ns 100${i}2 __THREADS__ @@ -91,26 +100,32 @@ lat __LAT__ 500 400 tr TCP throughput over IPv4: guest to host +virtio_net_workaround guest ip link set dev __IFNAME__ mtu 256 iperf3c guest __GW__ 100${i}2 __THREADS__ __OPTS__ -w 1M iperf3s BW ns 100${i}2 __THREADS__ bw __BW__ 0.2 0.3 +virtio_net_workaround
guest ip link set dev __IFNAME__ mtu 576 iperf3c guest __GW__ 100${i}2 __THREADS__ __OPTS__ -w 1M iperf3s BW ns 100${i}2 __THREADS__ bw __BW__ 0.5 0.8 +virtio_net_workaround guest ip link set dev __IFNAME__ mtu 1280 iperf3c guest __GW__ 100${i}2 __THREADS__ __OPTS__ -w 4M iperf3s BW ns 100${i}2 __THREADS__ bw __BW__ 1.2 1.5 +virtio_net_workaround guest ip link set dev __IFNAME__ mtu 1500 iperf3c guest __GW__ 100${i}2 __THREADS__ __OPTS__ -w 4M iperf3s BW ns 100${i}2 __THREADS__ bw __BW__ 1.6 1.8 +virtio_net_workaround guest ip link set dev __IFNAME__ mtu 9000 iperf3c guest __GW__ 100${i}2 __THREADS__ __OPTS__ -w 8M iperf3s BW ns 100${i}2 __THREADS__ bw __BW__ 4.0 5.0 +virtio_net_workaround guest ip link set dev __IFNAME__ mtu 65520 iperf3c guest __GW__ 100${i}2 __THREADS__ __OPTS__ -w 16M iperf3s BW ns 100${i}2 __THREADS__ diff --git a/test/perf/passt_udp b/test/perf/passt_udp index ff4c73a..cc5711b 100644 --- a/test/perf/passt_udp +++ b/test/perf/passt_udp @@ -16,6 +16,11 @@ gtools sysctl ip jq nproc sleep iperf3 udp_rr # From neper nstools ip jq sleep iperf3 udp_rr htools bc head sed +# In this setup, virtio_net TX queue sometimes hangs, still under investigation +def virtio_net_workaround +guest modprobe -r virtio_net; modprobe virtio_net napi_tx=1; dhclient; dhclient -6; sleep 3 +endef + test passt: throughput and latency guest /sbin/sysctl -w net.core.rmem_max=16777216 @@ -45,18 +50,22 @@ th MTU 256B 576B 1280B 1500B 9000B 65520B tr UDP throughput over IPv6: guest to host bw - bw - +virtio_net_workaround guest ip link set dev __IFNAME__ mtu 1280 iperf3c guest __GW6__%__IFNAME__ 100${i}2 __THREADS__ __OPTS__ -b 2G iperf3s BW ns 100${i}2 __THREADS__ bw __BW__ 0.8 1.2 +virtio_net_workaround guest ip link set dev __IFNAME__ mtu 1500 iperf3c guest __GW6__%__IFNAME__ 100${i}2 __THREADS__ __OPTS__ -b 3G iperf3s BW ns 100${i}2 __THREADS__ bw __BW__ 1.0 1.5 +virtio_net_workaround guest ip link set dev __IFNAME__ mtu 9000 iperf3c guest __GW6__%__IFNAME__ 100${i}2 
__THREADS__ __OPTS__ -b 5G iperf3s BW ns 100${i}2 __THREADS__ bw __BW__ 4.0 5.0 +virtio_net_workaround guest ip link set dev __IFNAME__ mtu 65520 iperf3c guest __GW6__%__IFNAME__ 100${i}2 __THREADS__ __OPTS__ -b 7G iperf3s BW ns 100${i}2 __THREADS__ @@ -74,26 +83,32 @@ lat __LAT__ 200 150 tr UDP throughput over IPv4: guest to host +virtio_net_workaround guest ip link set dev __IFNAME__ mtu 256 iperf3c guest __GW__ 100${i}2 __THREADS__ __OPTS__ -b 500M iperf3s BW ns 100${i}2 __THREADS__ bw __BW__ 0.0 0.0 +virtio_net_workaround guest ip link set dev __IFNAME__ mtu 576 iperf3c guest __GW__ 100${i}2 __THREADS__ __OPTS__ -b 1G iperf3s BW ns 100${i}2 __THREADS__ bw __BW__ 0.4 0.6 +virtio_net_workaround guest ip link set dev __IFNAME__ mtu 1280 iperf3c guest __GW__ 100${i}2 __THREADS__ __OPTS__ -b 2G iperf3s BW ns 100${i}2 __THREADS__ bw __BW__ 0.8 1.2 +virtio_net_workaround guest ip link set dev __IFNAME__ mtu 1500 iperf3c guest __GW__ 100${i}2 __THREADS__ __OPTS__ -b 3G iperf3s BW ns 100${i}2 __THREADS__ bw __BW__ 1.0 1.5 +virtio_net_workaround guest ip link set dev __IFNAME__ mtu 9000 iperf3c guest __GW__ 100${i}2 __THREADS__ __OPTS__ -b 6G iperf3s BW ns 100${i}2 __THREADS__ bw __BW__ 4.0 5.0 +virtio_net_workaround guest ip link set dev __IFNAME__ mtu 65520 iperf3c guest __GW__ 100${i}2 __THREADS__ __OPTS__ -b 7G iperf3s BW ns 100${i}2 __THREADS__ -- 2.35.1
They look a bit lame: rephrase sentences to avoid them. Signed-off-by: Stefano Brivio <sbrivio(a)redhat.com> --- README.md | 39 +++++++++++++++++++-------------------- 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index 8e07fb1..632ddd1 100644 --- a/README.md +++ b/README.md @@ -221,8 +221,9 @@ is needed as _passt_ runs without the `CAP_NET_RAW` capability: it can't create raw IP sockets on the pod, and therefore needs to map packets at Layer-2 to Layer-4 sockets offered by the host kernel. -The problem and this approach are illustrated in more detail, with diagrams, -[here](https://gitlab.com/abologna/kubevirt-and-kvm/-/blob/master/Networking.md). +See also a +[detailed illustration](https://gitlab.com/abologna/kubevirt-and-kvm/-/blob/master/Ne… +of the problem and what led to this approach. ### pasta @@ -294,8 +295,8 @@ speeding up local connections, and usually requiring NAT. _pasta_: * 🛠️ ~5 000 LoC target * ⌚ [fuzzing](https://bugs.passt.top/show_bug.cgi?id=9), _packetdrill_ tests * ⌚ stricter [synflood protection](https://bugs.passt.top/show_bug.cgi?id=10) -* 💡 [your](https://lists.passt.top/) [ideas](https://bugs.passt.top/) - [here](https://chat.passt.top) +* 💡 [add](https://lists.passt.top/) [your](https://bugs.passt.top/) + [ideas](https://chat.passt.top) ### Configurability * ✅ all addresses, ports, port ranges @@ -304,8 +305,8 @@ speeding up local connections, and usually requiring NAT. _pasta_: * ✅ _pasta_: auto-detection of bound ports * 🛠 run-time configuration of port ranges without autodetection * 🛠 configuration of port ranges for autodetection -* 💡 [your](https://lists.passt.top/) [ideas](https://bugs.passt.top/) - [here](https://chat.passt.top) +* 💡 [add](https://lists.passt.top/) [your](https://bugs.passt.top/) + [ideas](https://chat.passt.top) ### Performance * ✅ maximum two (cache hot) copies on every data path @@ -416,9 +417,9 @@ destination address translated to the loopback address.
## Protocols _passt_ and _pasta_ support TCP, UDP and ICMP/ICMPv6 echo (requests and -replies). More details about the TCP implementation are available -[here](/passt/tree/tcp.c), and for the UDP -implementation [here](/passt/tree/udp.c). +replies). More details about the TCP implementation are described in the +[theory of operation](/passt/tree/tcp.c), and similarly for +[UDP](/passt/tree/udp.c). An IGMP/MLD proxy is currently work in progress. @@ -506,7 +507,7 @@ if (getComputedStyle(document.getElementById('ci'))['visibility'] == "visible") <p><a href="/builds/latest/web/ci.html">Continuous integration test run</a></p> </div> -Test logs [here](/builds/latest/test/). +See also the [test logs](/builds/latest/test/). ## Performance @@ -522,11 +523,10 @@ Test logs [here](/builds/latest/test/). cd passt make - * alternatively, static builds for x86_64 as of the latest commit are also - available for convenience [here](/builds/latest/x86_64/). Convenience, - non-official packages for Debian (and derivatives) and RPM-based - distributions are also available there. These binaries and packages are - simply built with: + * alternatively, [static builds](/builds/latest/x86_64/) for x86_64 as of + the latest commit are also available for convenience. Non-official + packages for Debian (and derivatives) and RPM-based distributions are also + available there. These binaries and packages are simply built with: make pkgs @@ -581,11 +581,10 @@ Test logs [here](/builds/latest/test/). cd passt make - * alternatively, static builds for x86_64 as of the latest commit are also - available for convenience [here](/builds/latest/x86_64/). Convenience, - non-official packages for Debian (and derivatives) and RPM-based - distributions are also available there. These binaries and packages are - simply built with: + * alternatively, [static builds](/builds/latest/x86_64/) for x86_64 as of + the latest commit are also available for convenience.
Non-official + packages for Debian (and derivatives) and RPM-based distributions are also + available there. These binaries and packages are simply built with: make pkgs -- 2.35.1
Signed-off-by: Stefano Brivio <sbrivio(a)redhat.com> --- README.md | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 632ddd1..1ef3a73 100644 --- a/README.md +++ b/README.md @@ -325,19 +325,24 @@ speeding up local connections, and usually requiring NAT. _pasta_: ### Interfaces * ✅ qemu, libvirt support with [`qrap` wrapper](/passt/tree/qrap.c) -* ✅ out-of-tree patches for [qemu](/passt/tree/qemu) and - [libvirt](/passt/tree/libvirt) available -* 🛠 bug-to-bug compatible +* ✅ out-of-tree patches for [qemu](/passt/tree/contrib/qemu) and + [libvirt](/passt/tree/contrib/libvirt) available +* ✅ bug-to-bug compatible [_slirp4netns_ replacement](/passt/tree/slirp4netns.sh) (rootless Podman, RootlessKit) +* ✅ out-of-tree patch for [Podman](/passt/tree/contrib/podman) available +* ✅ out-of-tree patch for + [Kata Containers](/passt/tree/contrib/kata-containers) available * 🛠 native [qemu](https://bugs.passt.top/show_bug.cgi?id=11), [libvirt](https://bugs.passt.top/show_bug.cgi?id=12) support +* 🛠 native Podman integration * ⌚ drop-in replacement for VPNKit (rootless Docker) ### Availability * ✅ convenience unofficial packages for Debian, RPM-based distributions on x86_64 (static builds) -* ✅ testing on non-x86 architectures +* ✅ testing on non-x86_64 architectures (aarch64, armv7l, i386, ppc64, ppc64le, + s390x) * 🛠 official [OpenSUSE packages](https://build.opensuse.org/package/show/home:mnhauke/passt) * ⌚ packages for Debian, Fedora, etc. -- 2.35.1
...by: - storing the chained-hash next connection pointer as numeric reference rather than as pointer - storing the MSS as 14-bit value, and rounding it - using only the effective amount of bits needed to store the hash bucket number - explicitly limiting window scaling factors to 4-bit values (maximum factor is 14, from RFC 7323) - scaling SO_SNDBUF values, and using a 8-bit representation for the duplicate ACK sequence - keeping window values unscaled, as received and sent Signed-off-by: Stefano Brivio <sbrivio(a)redhat.com> --- tcp.c | 303 ++++++++++++++++++++++++++++++++-------------------------- tcp.h | 5 +- 2 files changed, 170 insertions(+), 138 deletions(-) diff --git a/tcp.c b/tcp.c index 2a5bf6e..3f61e6a 100644 --- a/tcp.c +++ b/tcp.c @@ -66,7 +66,7 @@ * ------ * * To avoid the need for dynamic memory allocation, a maximum, reasonable amount - * of connections is defined by MAX_TAP_CONNS below (currently 128k). + * of connections is defined by TCP_MAX_CONNS (currently 128k). * * Data needs to linger on sockets as long as it's not acknowledged by the * guest, and is read using MSG_PEEK into preallocated static buffers sized @@ -216,8 +216,8 @@ * @seq_init_from_tap: initial sequence number from tap/guest * @seq_init_to_tap: initial sequence number from tap/guest * - * @wnd_from_tap: last window size received from tap, scaled - * @wnd_from_tap: last window size advertised from tap, scaled + * @wnd_from_tap: last window size received from tap, never scaled + * @wnd_from_tap: last window size advertised from tap, never scaled * * - from socket to tap/guest: * - on new data from socket: @@ -299,23 +299,26 @@ #include "conf.h" #include "tcp_splice.h" -#define MAX_TAP_CONNS (128 * 1024) - #define TCP_FRAMES_MEM 256 #define TCP_FRAMES \ (c->mode == MODE_PASST ? 
TCP_FRAMES_MEM : 1) +#define TCP_HASH_BUCKET_BITS (TCP_CONN_INDEX_BITS + 1) #define TCP_HASH_TABLE_LOAD 70 /* % */ -#define TCP_HASH_TABLE_SIZE (MAX_TAP_CONNS * 100 / \ +#define TCP_HASH_TABLE_SIZE (TCP_MAX_CONNS * 100 / \ TCP_HASH_TABLE_LOAD) #define MAX_WS 10 #define MAX_WINDOW (1 << (16 + (MAX_WS))) + +/* MSS rounding: see SET_MSS() */ #define MSS_DEFAULT 536 -#define MSS4 (USHRT_MAX - sizeof(uint32_t) - sizeof(struct ethhdr) - \ - sizeof(struct iphdr) - sizeof(struct tcphdr)) -#define MSS6 (USHRT_MAX - sizeof(uint32_t) - sizeof(struct ethhdr) - \ - sizeof(struct ipv6hdr) - sizeof(struct tcphdr)) +#define MSS4 ROUND_DOWN(USHRT_MAX - \ + sizeof(uint32_t) - sizeof(struct ethhdr) - \ + sizeof(struct iphdr) - sizeof(struct tcphdr), 4) +#define MSS6 ROUND_DOWN(USHRT_MAX - \ + sizeof(uint32_t) - sizeof(struct ethhdr) - \ + sizeof(struct ipv6hdr) - sizeof(struct tcphdr), 4) #define WINDOW_DEFAULT 14600 /* RFC 6928 */ #ifdef HAS_SND_WND @@ -363,64 +366,46 @@ #define OPT_SACK 5 #define OPT_TS 8 -struct tcp_conn; - /** * struct tcp_conn - Descriptor for a TCP connection (not spliced) - * @next: Pointer to next item in hash chain, if any + * @next_index: Connection index of next item in hash chain, -1 for none + * @tap_mss: MSS advertised by tap/guest, rounded to 2 ^ TCP_MSS_BITS * @sock: Socket descriptor number + * @events: Connection events, implying connection states + * @timer: timerfd descriptor for timeout events + * @flags: Connection flags representing internal attributes * @hash_bucket: Bucket index in connection lookup hash table + * @retrans: Number of retransmissions occurred due to ACK_TIMEOUT + * @ws_from_tap: Window scaling factor advertised from tap/guest + * @ws_to_tap: Window scaling factor advertised to tap/guest + * @sndbuf: Sending buffer in kernel, rounded to 2 ^ SNDBUF_BITS + * @seq_dup_ack_approx: Last duplicate ACK number sent to tap * @a.a6: IPv6 remote address, can be IPv4-mapped * @a.a4.zero: Zero prefix for IPv4-mapped, see RFC 6890, Table 20 
* @a.a4.one: Ones prefix for IPv4-mapped * @a.a4.a: IPv4 address * @tap_port: Guest-facing tap port * @sock_port: Remote, socket-facing port - * @events: Connection events, implying connection states - * @flags: Connection flags representing internal attributes - * @tap_mss: Maximum segment size advertised by guest + * @wnd_from_tap: Last window size from tap, unscaled (as received) + * @wnd_to_tap: Sending window advertised to tap, unscaled (as sent) * @seq_to_tap: Next sequence for packets to tap * @seq_ack_from_tap: Last ACK number received from tap * @seq_from_tap: Next sequence for packets from tap (not actually sent) * @seq_ack_to_tap: Last ACK number sent to tap - * @seq_dup_ack: Last duplicate ACK number sent to tap * @seq_init_from_tap: Initial sequence number from tap - * @seq_init_from_tap: Initial sequence number to tap - * @ws_tap: Window scaling factor from tap - * @ws: Window scaling factor - * @wnd_from_tap: Last window size received from tap, scaled - * @wnd_to_tap: Socket-side sending window, advertised to tap - * @snd_buf: Socket sending buffer reported by kernel, in bytes - * @ts_sock_act: Last activity timestamp from socket for timeout purposes - * @ts_tap_act: Last activity timestamp from tap for timeout purposes - * @ts_ack_from_tap: Last ACK segment timestamp from tap - * @ts_ack_to_tap: Last ACK segment timestamp to tap - * @tap_data_noack: Last unacked data to tap, set to { 0, 0 } on ACK */ struct tcp_conn { - struct tcp_conn *next; - int32_t sock:SOCKET_REF_BITS; -#define TCP_RETRANS_BITS 3 - unsigned int retrans:TCP_RETRANS_BITS; -#define TCP_MAX_RETRANS ((1U << TCP_RETRANS_BITS) - 1) - int timer; - int hash_bucket; + int32_t next_index :TCP_CONN_INDEX_BITS + 1; + +#define TCP_MSS_BITS 14 + uint16_t tap_mss :TCP_MSS_BITS; +#define MSS_SET(conn, mss) (conn->tap_mss = (mss >> (16 - TCP_MSS_BITS))) +#define MSS_GET(conn) (conn->tap_mss << (16 - TCP_MSS_BITS)) - union { - struct in6_addr a6; - struct { - uint8_t zero[10]; - uint8_t one[2]; - 
struct in_addr a; - } a4; - } a; -#define CONN_V4(conn) IN6_IS_ADDR_V4MAPPED(&conn->a.a6) -#define CONN_V6(conn) (!CONN_V4(conn)) - in_port_t tap_port; - in_port_t sock_port; + int32_t sock :SOCKET_REF_BITS; - uint8_t events; + uint8_t events; #define CLOSED 0 #define SOCK_ACCEPTED BIT(0) /* implies SYN sent to tap */ #define TAP_SYN_RCVD BIT(1) /* implies socket connecting */ @@ -435,7 +420,10 @@ struct tcp_conn { #define CONN_STATE_BITS /* Setting these clears other flags */ \ (SOCK_ACCEPTED | TAP_SYN_RCVD | ESTABLISHED) - uint8_t flags; + + int32_t timer :SOCKET_REF_BITS; + + uint8_t flags; #define STALLED BIT(0) #define LOCAL BIT(1) #define WND_CLAMPED BIT(2) @@ -444,23 +432,48 @@ struct tcp_conn { #define ACK_TO_TAP_DUE BIT(5) #define ACK_FROM_TAP_DUE BIT(6) - uint16_t tap_mss; - uint32_t seq_to_tap; - uint32_t seq_ack_from_tap; - uint32_t seq_from_tap; - uint32_t seq_ack_to_tap; - uint32_t seq_dup_ack; - uint32_t seq_init_from_tap; - uint32_t seq_init_to_tap; + uint32_t hash_bucket :TCP_HASH_BUCKET_BITS; + +#define TCP_RETRANS_BITS 3 + unsigned int retrans :TCP_RETRANS_BITS; +#define TCP_MAX_RETRANS ((1U << TCP_RETRANS_BITS) - 1) - uint16_t ws_tap; - uint16_t ws; +#define TCP_WS_BITS 4 /* RFC 7323 */ + uint8_t ws_from_tap :TCP_WS_BITS; + uint8_t ws_to_tap :TCP_WS_BITS; - uint32_t wnd_from_tap; - uint32_t wnd_to_tap; - int snd_buf; +#define SNDBUF_BITS 24 + uint32_t sndbuf :SNDBUF_BITS; +#define SNDBUF_SET(conn, bytes) (conn->sndbuf = ((bytes) >> (32 - SNDBUF_BITS))) +#define SNDBUF_GET(conn) (conn->sndbuf << (32 - SNDBUF_BITS)) + + uint8_t seq_dup_ack_approx; + + + union { + struct in6_addr a6; + struct { + uint8_t zero[10]; + uint8_t one[2]; + struct in_addr a; + } a4; + } a; +#define CONN_V4(conn) IN6_IS_ADDR_V4MAPPED(&conn->a.a6) +#define CONN_V6(conn) (!CONN_V4(conn)) + + in_port_t tap_port; + in_port_t sock_port; + + uint16_t wnd_from_tap; + uint16_t wnd_to_tap; + + uint32_t seq_to_tap; + uint32_t seq_ack_from_tap; + uint32_t seq_from_tap; + uint32_t 
seq_ack_to_tap; + uint32_t seq_init_from_tap; }; #define CONN_IS_CLOSED(conn) (conn->events == CLOSED) @@ -471,6 +484,12 @@ struct tcp_conn { #define CONN(index) (tc + (index)) +/* We probably don't want to use gcc statement expressions (for portability), so + * use this only after well-defined sequence points (no pre-/post-increments). + */ +#define CONN_OR_NULL(index) \ + (((index) >= 0 && (index) < TCP_MAX_CONNS) ? (tc + (index)) : NULL) + static const char *tcp_event_str[] __attribute((__unused__)) = { "SOCK_ACCEPTED", "TAP_SYN_RCVD", "ESTABLISHED", "TAP_SYN_ACK_SENT", @@ -652,7 +671,7 @@ static unsigned int tcp6_l2_flags_buf_used; static size_t tcp6_l2_flags_buf_bytes; /* TCP connections */ -static struct tcp_conn tc[MAX_TAP_CONNS]; +static struct tcp_conn tc[TCP_MAX_CONNS]; /* Table for lookup from remote address, local port, remote port */ static struct tcp_conn *tc_hash[TCP_HASH_TABLE_SIZE]; @@ -747,12 +766,14 @@ static void tcp_timer_ctl(struct ctx *c, struct tcp_conn *conn) .r.p.tcp.tcp.index = conn - tc }; struct epoll_event ev = { .data.u64 = ref.u64, .events = EPOLLIN | EPOLLET }; + int fd; - conn->timer = timerfd_create(CLOCK_MONOTONIC, 0); - if (conn->timer == -1) { + fd = timerfd_create(CLOCK_MONOTONIC, 0); + if (fd == -1 || fd > SOCKET_MAX) { debug("TCP: failed to get timer: %s", strerror(errno)); return; } + conn->timer = fd; if (epoll_ctl(c->epollfd, EPOLL_CTL_ADD, conn->timer, &ev)) { debug("TCP: failed to add timer: %s", strerror(errno)); @@ -957,7 +978,7 @@ static void tcp_get_sndbuf(struct tcp_conn *conn) sl = sizeof(sndbuf); if (getsockopt(s, SOL_SOCKET, SO_SNDBUF, &sndbuf, &sl)) { - conn->snd_buf = WINDOW_DEFAULT; + SNDBUF_SET(conn, WINDOW_DEFAULT); return; } @@ -967,7 +988,7 @@ static void tcp_get_sndbuf(struct tcp_conn *conn) else if (v > SNDBUF_SMALL) v -= v * (v - SNDBUF_SMALL) / (SNDBUF_BIG - SNDBUF_SMALL) / 2; - conn->snd_buf = MIN(INT_MAX, v); + SNDBUF_SET(conn, MIN(INT_MAX, v)); } /** @@ -1299,12 +1320,12 @@ static void 
tcp_hash_insert(struct ctx *c, struct tcp_conn *conn, int b; b = tcp_hash(c, af, addr, conn->tap_port, conn->sock_port); - conn->next = tc_hash[b]; + conn->next_index = tc_hash[b] ? tc_hash[b] - tc : -1; tc_hash[b] = conn; conn->hash_bucket = b; debug("TCP: hash table insert: index %i, sock %i, bucket: %i, next: %p", - conn - tc, conn->sock, b, conn->next); + conn - tc, conn->sock, b, CONN_OR_NULL(conn->next_index)); } /** @@ -1316,18 +1337,20 @@ static void tcp_hash_remove(struct tcp_conn *conn) struct tcp_conn *entry, *prev = NULL; int b = conn->hash_bucket; - for (entry = tc_hash[b]; entry; prev = entry, entry = entry->next) { + for (entry = tc_hash[b]; entry; + prev = entry, entry = CONN_OR_NULL(entry->next_index)) { if (entry == conn) { if (prev) - prev->next = conn->next; + prev->next_index = conn->next_index; else - tc_hash[b] = conn->next; + tc_hash[b] = CONN_OR_NULL(conn->next_index); break; } } debug("TCP: hash table remove: index %i, sock %i, bucket: %i, new: %p", - conn - tc, conn->sock, b, prev ? prev->next : tc_hash[b]); + conn - tc, conn->sock, b, + prev ? 
CONN_OR_NULL(prev->next_index) : tc_hash[b]); } /** @@ -1340,10 +1363,11 @@ static void tcp_hash_update(struct tcp_conn *old, struct tcp_conn *new) struct tcp_conn *entry, *prev = NULL; int b = old->hash_bucket; - for (entry = tc_hash[b]; entry; prev = entry, entry = entry->next) { + for (entry = tc_hash[b]; entry; + prev = entry, entry = CONN_OR_NULL(entry->next_index)) { if (entry == old) { if (prev) - prev->next = new; + prev->next_index = new - tc; else tc_hash[b] = new; break; @@ -1371,7 +1395,7 @@ static struct tcp_conn *tcp_hash_lookup(struct ctx *c, int af, void *addr, int b = tcp_hash(c, af, addr, tap_port, sock_port); struct tcp_conn *conn; - for (conn = tc_hash[b]; conn; conn = conn->next) { + for (conn = tc_hash[b]; conn; conn = CONN_OR_NULL(conn->next_index)) { if (tcp_hash_match(conn, af, addr, tap_port, sock_port)) return conn; } @@ -1586,21 +1610,11 @@ static size_t tcp_l2_buf_fill_headers(struct ctx *c, struct tcp_conn *conn, b->th.dest = htons(conn->tap_port); \ b->th.seq = htonl(seq); \ b->th.ack_seq = htonl(conn->seq_ack_to_tap); \ - \ - /* First value sent by receiver is not scaled */ \ - if (b->th.syn) { \ - b->th.window = htons(MIN(conn->wnd_to_tap, \ - USHRT_MAX)); \ - } else { \ - b->th.window = htons(MIN(conn->wnd_to_tap >> \ - conn->ws, \ - USHRT_MAX)); \ - } \ + b->th.window = htons(MIN(conn->wnd_to_tap, USHRT_MAX)); \ } while (0) if (CONN_V6(conn)) { struct tcp6_l2_buf_t *b = (struct tcp6_l2_buf_t *)p; - uint32_t flow = conn->seq_init_to_tap; ip_len = plen + sizeof(struct ipv6hdr) + sizeof(struct tcphdr); @@ -1617,9 +1631,9 @@ static size_t tcp_l2_buf_fill_headers(struct ctx *c, struct tcp_conn *conn, tcp_update_check_tcp6(b); - b->ip6h.flow_lbl[0] = (flow >> 16) & 0xf; - b->ip6h.flow_lbl[1] = (flow >> 8) & 0xff; - b->ip6h.flow_lbl[2] = (flow >> 0) & 0xff; + b->ip6h.flow_lbl[0] = (conn->sock >> 16) & 0xf; + b->ip6h.flow_lbl[1] = (conn->sock >> 8) & 0xff; + b->ip6h.flow_lbl[2] = (conn->sock >> 0) & 0xff; eth_len = ip_len + sizeof(struct 
ethhdr); if (c->mode == MODE_PASST) @@ -1663,10 +1677,11 @@ static size_t tcp_l2_buf_fill_headers(struct ctx *c, struct tcp_conn *conn, static int tcp_update_seqack_wnd(struct ctx *c, struct tcp_conn *conn, int force_seq, struct tcp_info *tinfo) { + uint32_t prev_wnd_to_tap = conn->wnd_to_tap << conn->ws_to_tap; uint32_t prev_ack_to_tap = conn->seq_ack_to_tap; - uint32_t prev_wnd_to_tap = conn->wnd_to_tap; socklen_t sl = sizeof(*tinfo); struct tcp_info tinfo_new; + uint32_t new_wnd_to_tap = prev_wnd_to_tap; int s = conn->sock; #ifndef HAS_BYTES_ACKED @@ -1676,7 +1691,7 @@ static int tcp_update_seqack_wnd(struct ctx *c, struct tcp_conn *conn, if (SEQ_LT(conn->seq_ack_to_tap, prev_ack_to_tap)) conn->seq_ack_to_tap = prev_ack_to_tap; #else - if ((unsigned long)conn->snd_buf < SNDBUF_SMALL || tcp_rtt_dst_low(conn) + if ((unsigned)SNDBUF_GET(conn) < SNDBUF_SMALL || tcp_rtt_dst_low(conn) || CONN_IS_CLOSING(conn) || conn->flags & LOCAL || force_seq) { conn->seq_ack_to_tap = conn->seq_from_tap; } else if (conn->seq_ack_to_tap != conn->seq_from_tap) { @@ -1696,12 +1711,13 @@ static int tcp_update_seqack_wnd(struct ctx *c, struct tcp_conn *conn, if (!KERNEL_REPORTS_SND_WND(c)) { tcp_get_sndbuf(conn); - conn->wnd_to_tap = MIN(conn->snd_buf, MAX_WINDOW); + new_wnd_to_tap = MIN(SNDBUF_GET(conn), MAX_WINDOW); + conn->wnd_to_tap = new_wnd_to_tap >> conn->ws_to_tap; goto out; } if (!tinfo) { - if (conn->wnd_to_tap > WINDOW_DEFAULT) + if (prev_wnd_to_tap > WINDOW_DEFAULT) goto out; tinfo = &tinfo_new; @@ -1711,19 +1727,20 @@ static int tcp_update_seqack_wnd(struct ctx *c, struct tcp_conn *conn, #ifdef HAS_SND_WND if ((conn->flags & LOCAL) || tcp_rtt_dst_low(conn)) { - conn->wnd_to_tap = tinfo->tcpi_snd_wnd; + new_wnd_to_tap = tinfo->tcpi_snd_wnd; } else { tcp_get_sndbuf(conn); - conn->wnd_to_tap = MIN((int)tinfo->tcpi_snd_wnd, conn->snd_buf); + new_wnd_to_tap = MIN((int)tinfo->tcpi_snd_wnd, + SNDBUF_GET(conn)); } #endif - conn->wnd_to_tap = MIN(conn->wnd_to_tap, MAX_WINDOW); + 
conn->wnd_to_tap = MIN(new_wnd_to_tap, MAX_WINDOW) >> conn->ws_to_tap; if (!conn->wnd_to_tap) conn_flag(c, conn, ACK_TO_TAP_DUE); out: - return conn->wnd_to_tap != prev_wnd_to_tap || + return new_wnd_to_tap != prev_wnd_to_tap || conn->seq_ack_to_tap != prev_ack_to_tap; } @@ -1813,16 +1830,14 @@ static int tcp_send_flag(struct ctx *c, struct tcp_conn *conn, int flags) c->tcp.kernel_snd_wnd = 1; #endif - conn->ws = MIN(MAX_WS, tinfo.tcpi_snd_wscale); + conn->ws_to_tap = MIN(MAX_WS, tinfo.tcpi_snd_wscale); *data++ = OPT_NOP; *data++ = OPT_WS; *data++ = OPT_WS_LEN; - *data++ = conn->ws; + *data++ = conn->ws_to_tap; th->ack = !!(flags & ACK); - - conn->wnd_to_tap = WINDOW_DEFAULT; } else { th->ack = !!(flags & (ACK | DUP_ACK)) || conn->seq_ack_to_tap != prev_ack_to_tap || @@ -1839,6 +1854,10 @@ static int tcp_send_flag(struct ctx *c, struct tcp_conn *conn, int flags) NULL, conn->seq_to_tap); iov->iov_len = eth_len + sizeof(uint32_t); + /* First value is not scaled: scale now */ + if (flags & SYN) + conn->wnd_to_tap >>= conn->ws_to_tap; + if (CONN_V4(conn)) tcp4_l2_flags_buf_bytes += iov->iov_len; else @@ -1908,7 +1927,7 @@ static void tcp_clamp_window(struct ctx *c, struct tcp_conn *conn, if (init && th) { int ws = tcp_opt_get(th, len, OPT_WS, NULL, NULL); - conn->ws_tap = ws; + conn->ws_from_tap = ws & 0xf; /* RFC 7323, 2.2: first value is not scaled. 
Also, don't clamp * yet, to avoid getting a zero scale just because we set a @@ -1916,30 +1935,34 @@ static void tcp_clamp_window(struct ctx *c, struct tcp_conn *conn, */ conn->wnd_from_tap = ntohs(th->window); } else { + uint32_t prev_scaled = conn->wnd_from_tap << conn->ws_from_tap; + if (th) - window = ntohs(th->window) << conn->ws_tap; + window = ntohs(th->window) << conn->ws_from_tap; else - window <<= conn->ws_tap; + window <<= conn->ws_from_tap; window = MIN(MAX_WINDOW, window); if (conn->flags & WND_CLAMPED) { - if (conn->wnd_from_tap == window) + if (prev_scaled == window) return; /* Discard +/- 1% updates to spare some syscalls. */ - if ((window > conn->wnd_from_tap && - window * 99 / 100 < conn->wnd_from_tap) || - (window < conn->wnd_from_tap && - window * 101 / 100 > conn->wnd_from_tap)) { - conn->wnd_from_tap = window; + if ((window > prev_scaled && + window * 99 / 100 < prev_scaled) || + (window < prev_scaled && + window * 101 / 100 > prev_scaled)) { + conn->wnd_from_tap = window >> + conn->ws_from_tap; return; } } - conn->wnd_from_tap = window; if (window < 256) window = 256; + + conn->wnd_from_tap = window >> conn->ws_from_tap; setsockopt(conn->sock, SOL_TCP, TCP_WINDOW_CLAMP, &window, sizeof(window)); conn_flag(c, conn, WND_CLAMPED); @@ -2090,7 +2113,7 @@ static void tcp_conn_from_tap(struct ctx *c, int af, void *addr, const struct sockaddr *sa; struct tcp_conn *conn; socklen_t sl; - int s; + int s, mss; if (c->tcp.conn_count >= TCP_MAX_CONNS) return; @@ -2120,14 +2143,14 @@ static void tcp_conn_from_tap(struct ctx *c, int af, void *addr, conn = CONN(c->tcp.conn_count++); conn->sock = s; conn->timer = -1; + conn->ws_to_tap = conn->ws_from_tap = 0; conn_event(c, conn, TAP_SYN_RCVD); conn->wnd_to_tap = WINDOW_DEFAULT; - conn->tap_mss = tcp_conn_tap_mss(c, conn, th, len); - - sl = sizeof(conn->tap_mss); - setsockopt(s, SOL_TCP, TCP_MAXSEG, &conn->tap_mss, sl); + mss = tcp_conn_tap_mss(c, conn, th, len); + setsockopt(s, SOL_TCP, TCP_MAXSEG, &mss, 
sizeof(mss)); + MSS_SET(conn, mss); tcp_clamp_window(c, conn, th, len, 0, 1); @@ -2153,7 +2176,6 @@ static void tcp_conn_from_tap(struct ctx *c, int af, void *addr, conn->seq_ack_to_tap = conn->seq_from_tap; conn->seq_to_tap = tcp_seq_init(c, af, addr, th->dest, th->source, now); - conn->seq_init_to_tap = conn->seq_to_tap; conn->seq_ack_from_tap = conn->seq_to_tap + 1; tcp_hash_insert(c, conn, af, addr); @@ -2256,10 +2278,12 @@ static void tcp_data_to_tap(struct ctx *c, struct tcp_conn *conn, ssize_t plen, */ static int tcp_data_from_sock(struct ctx *c, struct tcp_conn *conn) { + uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap; int fill_bufs, send_bufs = 0, last_len, iov_rem = 0; int sendlen, len, plen, v4 = CONN_V4(conn); int s = conn->sock, i, ret = 0; struct msghdr mh_sock = { 0 }; + uint16_t mss = MSS_GET(conn); uint32_t already_sent; struct iovec *iov; @@ -2271,20 +2295,19 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_conn *conn) already_sent = 0; } - if (!conn->wnd_from_tap || already_sent >= conn->wnd_from_tap) { + if (!wnd_scaled || already_sent >= wnd_scaled) { conn_flag(c, conn, STALLED); conn_flag(c, conn, ACK_FROM_TAP_DUE); return 0; } /* Set up buffer descriptors we'll fill completely and partially. 
*/ - fill_bufs = DIV_ROUND_UP(conn->wnd_from_tap - already_sent, - conn->tap_mss); + fill_bufs = DIV_ROUND_UP(wnd_scaled - already_sent, mss); if (fill_bufs > TCP_FRAMES) { fill_bufs = TCP_FRAMES; iov_rem = 0; } else { - iov_rem = (conn->wnd_from_tap - already_sent) % conn->tap_mss; + iov_rem = (wnd_scaled - already_sent) % mss; } mh_sock.msg_iov = iov_sock; @@ -2302,7 +2325,7 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_conn *conn) iov->iov_base = &tcp4_l2_buf[tcp4_l2_buf_used + i].data; else iov->iov_base = &tcp6_l2_buf[tcp6_l2_buf_used + i].data; - iov->iov_len = conn->tap_mss; + iov->iov_len = mss; } if (iov_rem) iov_sock[fill_bufs].iov_len = iov_rem; @@ -2327,14 +2350,14 @@ recvmsg: conn_flag(c, conn, ~STALLED); - send_bufs = DIV_ROUND_UP(sendlen, conn->tap_mss); - last_len = sendlen - (send_bufs - 1) * conn->tap_mss; + send_bufs = DIV_ROUND_UP(sendlen, mss); + last_len = sendlen - (send_bufs - 1) * mss; /* Likely, some new data was acked too. */ tcp_update_seqack_wnd(c, conn, 0, NULL); /* Finally, queue to tap */ - plen = conn->tap_mss; + plen = mss; for (i = 0; i < send_bufs; i++) { int no_csum = i && i != send_bufs - 1 && tcp4_l2_buf_used; @@ -2383,8 +2406,8 @@ static void tcp_data_from_tap(struct ctx *c, struct tcp_conn *conn, struct tap_l4_msg *msg, int count) { int i, iov_i, ack = 0, fin = 0, retr = 0, keep = -1; - uint32_t max_ack_seq = conn->seq_ack_from_tap; uint16_t max_ack_seq_wnd = conn->wnd_from_tap; + uint32_t max_ack_seq = conn->seq_ack_from_tap; uint32_t seq_from_tap = conn->seq_from_tap; struct msghdr mh = { .msg_iov = tcp_iov }; int partial_send = 0; @@ -2541,8 +2564,12 @@ eintr: out: if (keep != -1) { - if (conn->seq_dup_ack != conn->seq_from_tap) { - conn->seq_dup_ack = conn->seq_from_tap; + /* We use an 8-bit approximation here: the associated risk is + * that we skip a duplicate ACK on 8-bit sequence number + * collision. Fast retransmit is a SHOULD in RFC 5681, 3.2. 
+ */ + if (conn->seq_dup_ack_approx != (conn->seq_from_tap & 0xff)) { + conn->seq_dup_ack_approx = conn->seq_from_tap & 0xff; tcp_send_flag(c, conn, DUP_ACK); } return; @@ -2572,7 +2599,7 @@ static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_conn *conn, struct tcphdr *th, size_t len) { tcp_clamp_window(c, conn, th, len, 0, 1); - conn->tap_mss = tcp_conn_tap_mss(c, conn, th, len); + MSS_SET(conn, tcp_conn_tap_mss(c, conn, th, len)); conn->seq_init_from_tap = ntohl(th->seq) + 1; conn->seq_from_tap = conn->seq_init_from_tap; @@ -2744,6 +2771,7 @@ static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref, conn = CONN(c->tcp.conn_count++); conn->sock = s; conn->timer = -1; + conn->ws_to_tap = conn->ws_from_tap = 0; conn_event(c, conn, SOCK_ACCEPTED); if (ref.r.p.tcp.tcp.v6) { @@ -2773,7 +2801,6 @@ static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref, conn->sock_port, conn->tap_port, now); - conn->seq_init_to_tap = conn->seq_to_tap; tcp_hash_insert(c, conn, AF_INET6, &sa6.sin6_addr); } else { @@ -2800,7 +2827,6 @@ static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref, conn->sock_port, conn->tap_port, now); - conn->seq_init_to_tap = conn->seq_to_tap; tcp_hash_insert(c, conn, AF_INET, &s_addr); } @@ -2822,9 +2848,12 @@ static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref, */ static void tcp_timer_handler(struct ctx *c, union epoll_ref ref) { - struct tcp_conn *conn = CONN(ref.r.p.tcp.tcp.index); + struct tcp_conn *conn = CONN_OR_NULL(ref.r.p.tcp.tcp.index); struct epoll_event ev = { 0 }; + if (!conn) + return; + if (CONN_IS_CLOSED(conn)) { tcp_hash_remove(conn); tcp_table_compact(c, conn); @@ -2898,7 +2927,7 @@ void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events, return; } - if (!(conn = CONN(ref.r.p.tcp.tcp.index))) + if (!(conn = CONN_OR_NULL(ref.r.p.tcp.tcp.index))) return; if (events & EPOLLERR) { @@ -3098,7 +3127,8 @@ static int tcp_sock_refill(void *arg) return -EIO; } - 
tcp_sock_set_bufsize(a->c, *p4); + if (*p4 >= 0) + tcp_sock_set_bufsize(a->c, *p4); } for (i = 0; a->c->v6 && i < TCP_SOCK_POOL_SIZE; i++, p6++) { @@ -3113,7 +3143,8 @@ static int tcp_sock_refill(void *arg) return -EIO; } - tcp_sock_set_bufsize(a->c, *p6); + if (*p6 >= 0) + tcp_sock_set_bufsize(a->c, *p6); } return 0; diff --git a/tcp.h b/tcp.h index 3154b4b..109516d 100644 --- a/tcp.h +++ b/tcp.h @@ -6,11 +6,12 @@ #ifndef TCP_H #define TCP_H -#define REFILL_INTERVAL 1000 /* ms */ +#define REFILL_INTERVAL 1000 /* ms */ #define PORT_DETECT_INTERVAL 1000 #define TCP_TIMER_INTERVAL MIN(REFILL_INTERVAL, PORT_DETECT_INTERVAL) -#define TCP_MAX_CONNS (128 * 1024) +#define TCP_CONN_INDEX_BITS 17 /* 128k */ +#define TCP_MAX_CONNS (1 << TCP_CONN_INDEX_BITS) #define TCP_MAX_SOCKS (TCP_MAX_CONNS + USHRT_MAX * 2) #define TCP_SOCK_POOL_SIZE 32 -- 2.35.1
In section 3 ("Packet Format"), "vend" is 64 bytes long, minus the magic that's 60 bytes, not 62. Signed-off-by: Stefano Brivio <sbrivio(a)redhat.com> --- dhcp.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/dhcp.c b/dhcp.c index 197a515..d24ef86 100644 --- a/dhcp.c +++ b/dhcp.c @@ -57,6 +57,8 @@ static struct opt opts[255]; #define DHCPINFORM 8 #define DHCPFORCERENEW 9 +#define OPT_MIN 60 /* RFC 951 */ + /** * dhcp_init() - Initialise DHCP options */ @@ -158,9 +160,9 @@ static int fill(struct msg *m) m->o[offset++] = 255; m->o[offset++] = 0; - if (offset < 62 /* RFC 951 */) { - memset(&m->o[offset], 0, 62 - offset); - offset = 62; + if (offset < OPT_MIN) { + memset(&m->o[offset], 0, OPT_MIN - offset); + offset = OPT_MIN; } return offset; -- 2.35.1
Signed-off-by: Stefano Brivio <sbrivio(a)redhat.com> --- tcp.c | 9 ++++----- tcp_splice.c | 13 ++++++------- 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/tcp.c b/tcp.c index 3f61e6a..e0916e0 100644 --- a/tcp.c +++ b/tcp.c @@ -2031,13 +2031,12 @@ static uint32_t tcp_seq_init(struct ctx *c, int af, void *addr, */ static int tcp_conn_new_sock(struct ctx *c, sa_family_t af) { - int *pool = af == AF_INET6 ? init_sock_pool6 : init_sock_pool4, i, s; + int *p = af == AF_INET6 ? init_sock_pool6 : init_sock_pool4, i, s = -1; - for (i = 0; i < TCP_SOCK_POOL_SIZE; i++, pool++) { - if ((s = *pool) >= 0) { - *pool = -1; + for (i = 0; i < TCP_SOCK_POOL_SIZE; i++, p++) { + SWAP(s, *p); + if (s >= 0) break; - } } if (s < 0) diff --git a/tcp_splice.c b/tcp_splice.c index b7bdfc2..bcafd33 100644 --- a/tcp_splice.c +++ b/tcp_splice.c @@ -501,18 +501,17 @@ static int tcp_splice_new(struct ctx *c, struct tcp_splice_conn *conn, in_port_t port) { struct tcp_splice_connect_ns_arg ns_arg = { c, conn, port, 0 }; - int *sock_pool_p, i, s = -1; + int *p, i, s = -1; if (bitmap_isset(c->tcp.port_to_tap, port)) - sock_pool_p = CONN_V6(conn) ? ns_sock_pool6 : ns_sock_pool4; + p = CONN_V6(conn) ? ns_sock_pool6 : ns_sock_pool4; else - sock_pool_p = CONN_V6(conn) ? init_sock_pool6 : init_sock_pool4; + p = CONN_V6(conn) ? init_sock_pool6 : init_sock_pool4; - for (i = 0; i < TCP_SOCK_POOL_SIZE; i++, sock_pool_p++) { - if ((s = *sock_pool_p) >= 0) { - *sock_pool_p = -1; + for (i = 0; i < TCP_SOCK_POOL_SIZE; i++, p++) { + SWAP(s, *p); + if (s >= 0) break; - } } if (s < 0 && bitmap_isset(c->tcp.port_to_tap, port)) { -- 2.35.1
Signed-off-by: Stefano Brivio <sbrivio(a)redhat.com> --- util.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/util.c b/util.c index ff7d97b..0adc6b9 100644 --- a/util.c +++ b/util.c @@ -544,7 +544,8 @@ int ns_enter(struct ctx *c) * @fd: Open PID file descriptor, closed on exit, -1 to skip writing it * @pid: PID value to write */ -void write_pidfile(int fd, pid_t pid) { +void write_pidfile(int fd, pid_t pid) +{ char pid_buf[12]; int n; -- 2.35.1
Implement a packet abstraction providing boundary and size checks based on packet descriptors: packets stored in a buffer can be queued into a pool (without storage of its own), and data can be retrieved referring to an index in the pool, specifying offset and length. Checks ensure data is not read outside the boundaries of buffer and descriptors, and that packets added to a pool are within the buffer range with valid offset and indices. This implies a wider rework: usage of the "queueing" part of the abstraction mostly affects tap_handler_{passt,pasta}() functions and their callees, while the "fetching" part affects all the guest or tap facing implementations: TCP, UDP, ICMP, ARP, NDP, DHCP and DHCPv6 handlers. Suggested-by: Stefan Hajnoczi <stefanha(a)redhat.com> Signed-off-by: Stefano Brivio <sbrivio(a)redhat.com> --- README.md | 2 +- arp.c | 51 +++--- arp.h | 2 +- dhcp.c | 54 ++++--- dhcp.h | 2 +- dhcpv6.c | 151 ++++++++--------- dhcpv6.h | 3 +- icmp.c | 28 ++-- icmp.h | 4 +- ndp.c | 59 +++---- ndp.h | 3 +- packet.c | 134 +++++++++++++++ packet.h | 77 +++++++++ passt.h | 1 + tap.c | 329 ++++++++++++++++++------------------- tcp.c | 449 ++++++++++++++++++++++++++++----------------------- tcp.h | 16 +- tcp_splice.c | 216 +++++++++++++------------ tcp_splice.h | 2 +- udp.c | 46 +++--- udp.h | 6 +- util.c | 60 ++++--- util.h | 5 +- 23 files changed, 990 insertions(+), 710 deletions(-) create mode 100644 packet.c create mode 100644 packet.h diff --git a/README.md b/README.md index 1ef3a73..621505a 100644 --- a/README.md +++ b/README.md @@ -291,7 +291,7 @@ speeding up local connections, and usually requiring NAT. 
_pasta_: * ✅ restrictive seccomp profiles (24 syscalls allowed for _passt_, 36 for _pasta_ on x86_64) * ✅ static checkers in continuous integration (clang-tidy, cppcheck) -* 🛠️ clearly defined packet abstraction +* ✅️ clearly defined boundary-checked packet abstraction * 🛠️ ~5 000 LoC target * ⌚ [fuzzing](https://bugs.passt.top/show_bug.cgi?id=9), _packetdrill_ tests * ⌚ stricter [synflood protection](https://bugs.passt.top/show_bug.cgi?id=10) diff --git a/arp.c b/arp.c index 3195692..bcce804 100644 --- a/arp.c +++ b/arp.c @@ -30,53 +30,56 @@ #include "tap.h" /** - * arp() - Check if this is an ARP message, reply as needed + * arp() - Check if this is a supported ARP message, reply as needed * @c: Execution context - * @len: Total L2 packet length - * @eh: Packet buffer, Ethernet header + * @p: Packet pool, single packet with Ethernet buffer * - * Return: 0 if it's not an ARP message, 1 if handled, -1 on failure + * Return: 1 if handled, -1 on failure */ -int arp(struct ctx *c, struct ethhdr *eh, size_t len) +int arp(struct ctx *c, struct pool *p) { - struct arphdr *ah = (struct arphdr *)(eh + 1); - struct arpmsg *am = (struct arpmsg *)(ah + 1); unsigned char swap[4]; + struct ethhdr *eh; + struct arphdr *ah; + struct arpmsg *am; + size_t len; - if (eh->h_proto != htons(ETH_P_ARP)) - return 0; + eh = packet_get(p, 0, 0, sizeof(*eh), NULL); + ah = packet_get(p, 0, sizeof(*eh), sizeof(*ah), NULL); + am = packet_get(p, 0, sizeof(*eh) + sizeof(*ah), sizeof(*am), NULL); - if (len < sizeof(*eh) + sizeof(*ah) + sizeof(*am)) + if (!eh || !ah || !am) return -1; - if (ah->ar_hrd != htons(ARPHRD_ETHER) || - ah->ar_pro != htons(ETH_P_IP) || - ah->ar_hln != ETH_ALEN || ah->ar_pln != 4 || - ah->ar_op != htons(ARPOP_REQUEST)) + if (ah->ar_hrd != htons(ARPHRD_ETHER) || + ah->ar_pro != htons(ETH_P_IP) || + ah->ar_hln != ETH_ALEN || + ah->ar_pln != 4 || + ah->ar_op != htons(ARPOP_REQUEST)) return 1; /* Discard announcements (but not 0.0.0.0 "probes"): we might have the * same IP 
address, hide that. */ - if (memcmp(am->sip, (unsigned char[4]){ 0, 0, 0, 0 }, 4) && - !memcmp(am->sip, am->tip, 4)) + if (memcmp(am->sip, (unsigned char[4]){ 0 }, sizeof(am->tip)) && + !memcmp(am->sip, am->tip, sizeof(am->sip))) return 1; /* Don't resolve our own address, either. */ - if (!memcmp(am->tip, &c->addr4, 4)) + if (!memcmp(am->tip, &c->addr4, sizeof(am->tip))) return 1; ah->ar_op = htons(ARPOP_REPLY); - memcpy(am->tha, am->sha, ETH_ALEN); - memcpy(am->sha, c->mac, ETH_ALEN); + memcpy(am->tha, am->sha, sizeof(am->tha)); + memcpy(am->sha, c->mac, sizeof(am->sha)); - memcpy(swap, am->tip, 4); - memcpy(am->tip, am->sip, 4); - memcpy(am->sip, swap, 4); + memcpy(swap, am->tip, sizeof(am->tip)); + memcpy(am->tip, am->sip, sizeof(am->tip)); + memcpy(am->sip, swap, sizeof(am->sip)); len = sizeof(*eh) + sizeof(*ah) + sizeof(*am); - memcpy(eh->h_dest, eh->h_source, ETH_ALEN); - memcpy(eh->h_source, c->mac, ETH_ALEN); + memcpy(eh->h_dest, eh->h_source, sizeof(eh->h_dest)); + memcpy(eh->h_source, c->mac, sizeof(eh->h_source)); if (tap_send(c, eh, len, 0) < 0) perror("ARP: send"); diff --git a/arp.h b/arp.h index a198969..6ef3736 100644 --- a/arp.h +++ b/arp.h @@ -17,4 +17,4 @@ struct arpmsg { unsigned char tip[4]; } __attribute__((__packed__)); -int arp(struct ctx *c, struct ethhdr *eh, size_t len); +int arp(struct ctx *c, struct pool *p); diff --git a/dhcp.c b/dhcp.c index d24ef86..81c0ff9 100644 --- a/dhcp.c +++ b/dhcp.c @@ -22,9 +22,11 @@ #include <stdint.h> #include <unistd.h> #include <string.h> +#include <limits.h> #include "util.h" #include "checksum.h" +#include "packet.h" #include "passt.h" #include "tap.h" #include "dhcp.h" @@ -257,27 +259,32 @@ static void opt_set_dns_search(struct ctx *c, size_t max_len) /** * dhcp() - Check if this is a DHCP message, reply as needed * @c: Execution context - * @len: Total L2 packet length - * @eh: Packet buffer, Ethernet header + * @p: Packet pool, single packet with Ethernet buffer * * Return: 0 if it's not a DHCP 
message, 1 if handled, -1 on failure */ -int dhcp(struct ctx *c, struct ethhdr *eh, size_t len) +int dhcp(struct ctx *c, struct pool *p) { - struct iphdr *iph = (struct iphdr *)(eh + 1); - size_t mlen, olen; + size_t mlen, len, offset = 0; + struct ethhdr *eh; + struct iphdr *iph; struct udphdr *uh; unsigned int i; struct msg *m; - if (len < sizeof(*eh) + sizeof(*iph)) - return 0; + eh = packet_get(p, 0, offset, sizeof(*eh), NULL); + offset += sizeof(*eh); - if (len < sizeof(*eh) + (long)iph->ihl * 4 + sizeof(*uh)) - return 0; + iph = packet_get(p, 0, offset, sizeof(*iph), NULL); + if (!eh || !iph) + return -1; - uh = (struct udphdr *)((char *)iph + (long)(iph->ihl * 4)); - m = (struct msg *)(uh + 1); + offset += iph->ihl * 4UL; + uh = packet_get(p, 0, offset, sizeof(*uh), &mlen); + offset += sizeof(*uh); + + if (!uh) + return -1; if (uh->dest != htons(67)) return 0; @@ -285,18 +292,29 @@ int dhcp(struct ctx *c, struct ethhdr *eh, size_t len) if (c->no_dhcp) return 1; - mlen = len - sizeof(*eh) - (long)iph->ihl * 4 - sizeof(*uh); - if (mlen != ntohs(uh->len) - sizeof(*uh) || - mlen < offsetof(struct msg, o) || + m = packet_get(p, 0, offset, offsetof(struct msg, o), NULL); + if (!m || + mlen != ntohs(uh->len) - sizeof(*uh) || + mlen < offsetof(struct msg, o) || m->op != BOOTREQUEST) return -1; - olen = mlen - offsetof(struct msg, o); - for (i = 0; i + 2 < olen; i += m->o[i + 1] + 2) { - if (m->o[i + 1] + i + 2 >= olen) + offset += offsetof(struct msg, o); + + while (offset + 2 < mlen) { + uint8_t *olen, *type, *val; + + type = packet_get(p, 0, offset, 1, NULL); + olen = packet_get(p, 0, offset + 1, 1, NULL); + if (!type || !olen) + return -1; + + val = packet_get(p, 0, offset + 2, *olen, NULL); + if (!val) return -1; - memcpy(&opts[m->o[i]].c, &m->o[i + 2], m->o[i + 1]); + memcpy(&opts[*type].c, val, *olen); + offset += *olen + 2; } if (opts[53].c[0] == DHCPDISCOVER) { diff --git a/dhcp.h b/dhcp.h index 91697b5..7c72fd2 100644 --- a/dhcp.h +++ b/dhcp.h @@ -3,5 +3,5 
@@ * Author: Stefano Brivio <sbrivio(a)redhat.com> */ -int dhcp(struct ctx *c, struct ethhdr *eh, size_t len); +int dhcp(struct ctx *c, struct pool *p); void dhcp_init(void); diff --git a/dhcpv6.c b/dhcpv6.c index 375ba79..5c9ea88 100644 --- a/dhcpv6.c +++ b/dhcpv6.c @@ -24,7 +24,9 @@ #include <unistd.h> #include <string.h> #include <time.h> +#include <limits.h> +#include "packet.h" #include "util.h" #include "passt.h" #include "tap.h" @@ -69,6 +71,8 @@ struct opt_hdr { #endif #define OPT_SIZE(x) OPT_SIZE_CONV(sizeof(struct opt_##x) - \ sizeof(struct opt_hdr)) +#define OPT_VSIZE(x) (sizeof(struct opt_##x) - \ + sizeof(struct opt_hdr)) /** * struct opt_client_id - DHCPv6 Client Identifier option @@ -265,10 +269,10 @@ static const struct opt_status_code sc_not_on_link = { /** * struct resp_not_on_link_t - NotOnLink error (mandated by RFC 8415, 18.3.2.) - * @uh: UDP header - * @hdr: DHCP message header - * @server_id: Server Identifier option - * @var: Payload: IA_NA from client, status code, client ID + * @uh: UDP header + * @hdr: DHCP message header + * @server_id: Server Identifier option + * @var: Payload: IA_NA from client, status code, client ID */ static struct resp_not_on_link_t { struct udphdr uh; @@ -287,26 +291,30 @@ static struct resp_not_on_link_t { /** * dhcpv6_opt() - Get option from DHCPv6 message - * @o: First option header to check - * @type: Option type to look up, network order - * @len: Remaining length, host order, modified on return + * @p: Packet pool, single packet with UDP header + * @offset: Offset to look at, 0: end of header, set to option start + * @type: Option type to look up, network order * * Return: pointer to option header, or NULL on malformed or missing option */ -static struct opt_hdr *dhcpv6_opt(struct opt_hdr *o, uint16_t type, size_t *len) +static struct opt_hdr *dhcpv6_opt(struct pool *p, size_t *offset, uint16_t type) { - while (*len >= sizeof(struct opt_hdr)) { - unsigned int opt_len = ntohs(o->l) + sizeof(struct opt_hdr); 
+ struct opt_hdr *o; + size_t left; - if (opt_len > *len) - return NULL; + if (!*offset) + *offset = sizeof(struct udphdr) + sizeof(struct msg_hdr); + + while ((o = packet_get_try(p, 0, *offset, sizeof(*o), &left))) { + unsigned int opt_len = ntohs(o->l) + sizeof(*o); - *len -= opt_len; + if (ntohs(o->l) > left) + return NULL; if (o->t == type) return o; - o = (struct opt_hdr *)((uint8_t *)o + opt_len); + *offset += opt_len; } return NULL; @@ -314,61 +322,45 @@ static struct opt_hdr *dhcpv6_opt(struct opt_hdr *o, uint16_t type, size_t *len) /** * dhcpv6_ia_notonlink() - Check if any IA contains non-appropriate addresses - * @o: First option header to check for IAs - * @rem_len: Remaining message length, host order - * @addr: Address we want to lease to the client + * @o: First option header to check for IAs + * @rem_len: Remaining message length, host order + * @addr: Address we want to lease to the client * * Return: pointer to non-appropriate IA_NA or IA_TA, if any, NULL otherwise */ -static struct opt_hdr *dhcpv6_ia_notonlink(struct opt_hdr *o, size_t rem_len, - struct in6_addr *addr) +static struct opt_hdr *dhcpv6_ia_notonlink(struct pool *p, struct in6_addr *la) { - struct opt_hdr *ia, *ia_addr; char buf[INET6_ADDRSTRLEN]; struct in6_addr *req_addr; - size_t len; + struct opt_hdr *ia, *h; + size_t offset; int ia_type; ia_type = OPT_IA_NA; ia_ta: - len = rem_len; - ia = o; - - while ((ia = dhcpv6_opt(ia, ia_type, &len))) { - size_t ia_len = ntohs(ia->l); - - if (ia_type == OPT_IA_NA) { - struct opt_ia_na *subopt = (struct opt_ia_na *)ia + 1; - - ia_addr = (struct opt_hdr *)subopt; - } else if (ia_type == OPT_IA_TA) { - struct opt_ia_ta *subopt = (struct opt_ia_ta *)ia + 1; - - ia_addr = (struct opt_hdr *)subopt; - } + offset = 0; + while ((ia = dhcpv6_opt(p, &offset, ia_type))) { + if (ntohs(ia->l) < OPT_VSIZE(ia_na)) + return NULL; - ia_len -= sizeof(struct opt_ia_na) - sizeof(struct opt_hdr); + offset += sizeof(struct opt_ia_na); - while ((ia_addr = 
dhcpv6_opt(ia_addr, OPT_IAAADR, &ia_len))) { - struct opt_ia_addr *next; + while ((h = dhcpv6_opt(p, &offset, OPT_IAAADR))) { + struct opt_ia_addr *opt_addr = (struct opt_ia_addr *)h; - req_addr = (struct in6_addr *)(ia_addr + 1); + if (ntohs(h->l) != OPT_VSIZE(ia_addr)) + return NULL; - if (!IN6_ARE_ADDR_EQUAL(addr, req_addr)) { + req_addr = &opt_addr->addr; + if (!IN6_ARE_ADDR_EQUAL(la, req_addr)) { info("DHCPv6: requested address %s not on link", inet_ntop(AF_INET6, req_addr, buf, sizeof(buf))); return ia; } - next = (struct opt_ia_addr *)ia_addr + 1; - ia_addr = (struct opt_hdr *)next; + offset += sizeof(struct opt_ia_addr); } - - if (!ia_addr) - break; - - ia = ia_addr; } if (ia_type == OPT_IA_NA) { @@ -449,59 +441,58 @@ search: /** * dhcpv6() - Check if this is a DHCPv6 message, reply as needed * @c: Execution context - * @eh: Packet buffer, Ethernet header - * @len: Total L2 packet length + * @p: Packet pool, single packet starting from UDP header + * @saddr: Source IPv6 address of original message + * @daddr: Destination IPv6 address of original message * * Return: 0 if it's not a DHCPv6 message, 1 if handled, -1 on failure */ -int dhcpv6(struct ctx *c, struct ethhdr *eh, size_t len) +int dhcpv6(struct ctx *c, struct pool *p, + const struct in6_addr *saddr, const struct in6_addr *daddr) { - struct ipv6hdr *ip6h = (struct ipv6hdr *)(eh + 1); struct opt_hdr *ia, *bad_ia, *client_id, *server_id; struct in6_addr *src; struct msg_hdr *mh; struct udphdr *uh; - uint8_t proto; - size_t mlen; - size_t n; + size_t mlen, n; - uh = (struct udphdr *)ipv6_l4hdr(ip6h, &proto); - if (!uh || proto != IPPROTO_UDP || uh->dest != htons(547)) + uh = packet_get(p, 0, 0, sizeof(*uh), &mlen); + if (!uh) + return -1; + + if (uh->dest != htons(547)) return 0; if (c->no_dhcpv6) return 1; - if (!IN6_IS_ADDR_MULTICAST(&ip6h->daddr)) + if (!IN6_IS_ADDR_MULTICAST(daddr)) return -1; - mlen = len - ((intptr_t)uh - (intptr_t)eh) - sizeof(*uh); - - if (mlen != ntohs(uh->len) - sizeof(*uh) || 
- mlen < sizeof(struct msg_hdr)) + if (mlen + sizeof(*uh) != ntohs(uh->len) || mlen < sizeof(*mh)) return -1; - c->addr6_ll_seen = ip6h->saddr; + c->addr6_ll_seen = *saddr; if (IN6_IS_ADDR_LINKLOCAL(&c->gw6)) src = &c->gw6; else src = &c->addr6_ll; - mh = (struct msg_hdr *)(uh + 1); - mlen -= sizeof(struct msg_hdr); + mh = packet_get(p, 0, sizeof(*uh), sizeof(*mh), NULL); + if (!mh) + return -1; - n = mlen; - client_id = dhcpv6_opt((struct opt_hdr *)(mh + 1), OPT_CLIENTID, &n); - if (!client_id || ntohs(client_id->l) > ntohs(OPT_SIZE(client_id))) + client_id = dhcpv6_opt(p, &(size_t){ 0 }, OPT_CLIENTID); + if (!client_id || ntohs(client_id->l) > OPT_VSIZE(client_id)) return -1; - n = mlen; - server_id = dhcpv6_opt((struct opt_hdr *)(mh + 1), OPT_SERVERID, &n); + server_id = dhcpv6_opt(p, &(size_t){ 0 }, OPT_SERVERID); + if (server_id && ntohs(server_id->l) != OPT_VSIZE(server_id)) + return -1; - n = mlen; - ia = dhcpv6_opt((struct opt_hdr *)(mh + 1), OPT_IA_NA, &n); - if (ia && ntohs(ia->l) < ntohs(OPT_SIZE(ia_na))) + ia = dhcpv6_opt(p, &(size_t){ 0 }, OPT_IA_NA); + if (ia && ntohs(ia->l) < MIN(OPT_VSIZE(ia_na), OPT_VSIZE(ia_ta))) return -1; resp.hdr.type = TYPE_REPLY; @@ -516,18 +507,17 @@ int dhcpv6(struct ctx *c, struct ethhdr *eh, size_t len) if (mh->type == TYPE_CONFIRM && server_id) return -1; - if ((bad_ia = dhcpv6_ia_notonlink((struct opt_hdr *)(mh + 1), - mlen, &c->addr6))) { + if ((bad_ia = dhcpv6_ia_notonlink(p, &c->addr6))) { info("DHCPv6: received CONFIRM with inappropriate IA," " sending NotOnLink status in REPLY"); - n = ntohs(bad_ia->l) + sizeof(struct opt_hdr); - bad_ia->l = htons(n - sizeof(struct opt_hdr) + + bad_ia->l = htons(OPT_VSIZE(ia_na) + sizeof(sc_not_on_link)); + n = sizeof(struct opt_ia_na); memcpy(resp_not_on_link.var, bad_ia, n); - memcpy(resp_not_on_link.var + n, &sc_not_on_link, - sizeof(sc_not_on_link)); + memcpy(resp_not_on_link.var + n, + &sc_not_on_link, sizeof(sc_not_on_link)); n += sizeof(sc_not_on_link); 
memcpy(resp_not_on_link.var + n, client_id, @@ -552,8 +542,7 @@ int dhcpv6(struct ctx *c, struct ethhdr *eh, size_t len) memcmp(&resp.server_id, server_id, sizeof(resp.server_id))) return -1; - n = mlen; - if (ia || dhcpv6_opt((struct opt_hdr *)(mh + 1), OPT_IA_TA, &n)) + if (ia || dhcpv6_opt(p, &(size_t){ 0 }, OPT_IA_TA)) return -1; info("DHCPv6: received INFORMATION_REQUEST, sending REPLY"); diff --git a/dhcpv6.h b/dhcpv6.h index 36b6a57..73d28d3 100644 --- a/dhcpv6.h +++ b/dhcpv6.h @@ -3,5 +3,6 @@ * Author: Stefano Brivio <sbrivio(a)redhat.com> */ -int dhcpv6(struct ctx *c, struct ethhdr *eh, size_t len); +int dhcpv6(struct ctx *c, struct pool *p, + struct in6_addr *saddr, struct in6_addr *daddr); void dhcpv6_init(struct ctx *c); diff --git a/icmp.c b/icmp.c index 67859e0..80feb00 100644 --- a/icmp.c +++ b/icmp.c @@ -31,9 +31,11 @@ #include <linux/icmpv6.h> +#include "packet.h" #include "util.h" #include "passt.h" #include "tap.h" +#include "packet.h" #include "icmp.h" #define ICMP_ECHO_TIMEOUT 60 /* s, timeout for ICMP socket activity */ @@ -134,18 +136,14 @@ void icmp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events, * icmp_tap_handler() - Handle packets from tap * @c: Execution context * @af: Address family, AF_INET or AF_INET6 - * @ - * @msg: Input message - * @count: Message count (always 1 for ICMP) + * @p: Packet pool, single packet with ICMP/ICMPv6 header * @now: Current timestamp * * Return: count of consumed packets (always 1, even if malformed) */ -int icmp_tap_handler(struct ctx *c, int af, void *addr, - struct tap_l4_msg *msg, int count, struct timespec *now) +int icmp_tap_handler(struct ctx *c, int af, void *addr, struct pool *p, + struct timespec *now) { - (void)count; - if (af == AF_INET) { union icmp_epoll_ref iref = { .icmp.v6 = 0 }; struct sockaddr_in sa = { @@ -155,9 +153,8 @@ int icmp_tap_handler(struct ctx *c, int af, void *addr, struct icmphdr *ih; int id, s; - ih = (struct icmphdr *)(pkt_buf + msg[0].pkt_buf_offset); - - 
if (msg[0].l4_len < sizeof(*ih) || ih->type != ICMP_ECHO) + ih = packet_get(p, 0, 0, sizeof(*ih), NULL); + if (!ih) return 1; sa.sin_port = ih->un.echo.id; @@ -175,7 +172,7 @@ int icmp_tap_handler(struct ctx *c, int af, void *addr, bitmap_set(icmp_act[V4], id); sa.sin_addr = *(struct in_addr *)addr; - sendto(s, ih, msg[0].l4_len, MSG_NOSIGNAL, + sendto(s, ih, sizeof(*ih), MSG_NOSIGNAL, (struct sockaddr *)&sa, sizeof(sa)); } else if (af == AF_INET6) { union icmp_epoll_ref iref = { .icmp.v6 = 1 }; @@ -186,10 +183,11 @@ int icmp_tap_handler(struct ctx *c, int af, void *addr, struct icmp6hdr *ih; int id, s; - ih = (struct icmp6hdr *)(pkt_buf + msg[0].pkt_buf_offset); + ih = packet_get(p, 0, 0, sizeof(struct icmp6hdr), NULL); + if (!ih) + return 1; - if (msg[0].l4_len < sizeof(*ih) || - (ih->icmp6_type != 128 && ih->icmp6_type != 129)) + if (ih->icmp6_type != 128 && ih->icmp6_type != 129) return 1; sa.sin6_port = ih->icmp6_identifier; @@ -207,7 +205,7 @@ int icmp_tap_handler(struct ctx *c, int af, void *addr, bitmap_set(icmp_act[V6], id); sa.sin6_addr = *(struct in6_addr *)addr; - sendto(s, ih, msg[0].l4_len, MSG_NOSIGNAL, + sendto(s, ih, sizeof(*ih), MSG_NOSIGNAL, (struct sockaddr *)&sa, sizeof(sa)); } diff --git a/icmp.h b/icmp.h index 89b5f55..2152a66 100644 --- a/icmp.h +++ b/icmp.h @@ -12,8 +12,8 @@ struct ctx; void icmp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events, struct timespec *now); -int icmp_tap_handler(struct ctx *c, int af, void *addr, - struct tap_l4_msg *msg, int count, struct timespec *now); +int icmp_tap_handler(struct ctx *c, int af, void *addr, struct pool *p, + struct timespec *now); void icmp_timer(struct ctx *c, struct timespec *ts); /** diff --git a/ndp.c b/ndp.c index 6b1c1a8..b40a0c4 100644 --- a/ndp.c +++ b/ndp.c @@ -39,28 +39,23 @@ /** * ndp() - Check for NDP solicitations, reply as needed * @c: Execution context - * @len: Total L2 packet length - * @eh: Packet buffer, Ethernet header + * @ih: ICMPv6 header + * @eh_source: 
Source Ethernet address + * @saddr Source IPv6 address * * Return: 0 if not handled here, 1 if handled, -1 on failure */ -int ndp(struct ctx *c, struct ethhdr *eh, size_t len) +int ndp(struct ctx *c, struct icmp6hdr *ih, unsigned char *eh_source, + struct in6_addr *saddr) { - struct ethhdr *ehr; - struct ipv6hdr *ip6h = (struct ipv6hdr *)(eh + 1), *ip6hr; - struct icmp6hdr *ih, *ihr; char buf[BUFSIZ] = { 0 }; - uint8_t proto, *p; - - if (len < sizeof(*ehr) + sizeof(*ip6h) + sizeof(*ih)) - return 0; - - ih = (struct icmp6hdr *)ipv6_l4hdr(ip6h, &proto); - if (!ih) - return -1; + struct ipv6hdr *ip6hr; + struct icmp6hdr *ihr; + struct ethhdr *ehr; + unsigned char *p; + size_t len; - if (proto != IPPROTO_ICMPV6 || - ih->icmp6_type < RS || ih->icmp6_type > NA) + if (ih->icmp6_type < RS || ih->icmp6_type > NA) return 0; if (c->no_ndp) @@ -71,11 +66,7 @@ int ndp(struct ctx *c, struct ethhdr *eh, size_t len) ihr = (struct icmp6hdr *)(ip6hr + 1); if (ih->icmp6_type == NS) { - if (len < sizeof(*ehr) + sizeof(*ip6h) + sizeof(*ih) + - sizeof(struct in6_addr)) - return -1; - - if (IN6_IS_ADDR_UNSPECIFIED(&ip6h->saddr)) + if (IN6_IS_ADDR_UNSPECIFIED(saddr)) return 1; info("NDP: received NS, sending NA"); @@ -132,10 +123,10 @@ int ndp(struct ctx *c, struct ethhdr *eh, size_t len) for (n = 0; !IN6_IS_ADDR_UNSPECIFIED(&c->dns6[n]); n++); if (n) { - *p++ = 25; /* RDNSS */ - *p++ = 1 + 2 * n; /* length */ - p += 2; /* reserved */ - *(uint32_t *)p = htonl(60); /* lifetime */ + *p++ = 25; /* RDNSS */ + *p++ = 1 + 2 * n; /* length */ + p += 2; /* reserved */ + *(uint32_t *)p = htonl(60); /* lifetime */ p += 4; for (i = 0; i < n; i++) { @@ -148,10 +139,10 @@ int ndp(struct ctx *c, struct ethhdr *eh, size_t len) } if (!c->no_dhcp_dns_search && dns_s_len) { - *p++ = 31; /* DNSSL */ - *p++ = (len + 8 - 1) / 8 + 1; /* length */ - p += 2; /* reserved */ - *(uint32_t *)p = htonl(60); /* lifetime */ + *p++ = 31; /* DNSSL */ + *p++ = (dns_s_len + 8 - 1) / 8 + 1; /* length */ + p += 2; /* 
reserved */ + *(uint32_t *)p = htonl(60); /* lifetime */ p += 4; for (i = 0; i < n; i++) { @@ -185,12 +176,12 @@ dns_done: len = (uintptr_t)p - (uintptr_t)ihr - sizeof(*ihr); - if (IN6_IS_ADDR_LINKLOCAL(&ip6h->saddr)) - c->addr6_ll_seen = ip6h->saddr; + if (IN6_IS_ADDR_LINKLOCAL(saddr)) + c->addr6_ll_seen = *saddr; else - c->addr6_seen = ip6h->saddr; + c->addr6_seen = *saddr; - ip6hr->daddr = ip6h->saddr; + ip6hr->daddr = *saddr; if (IN6_IS_ADDR_LINKLOCAL(&c->gw6)) ip6hr->saddr = c->gw6; else @@ -207,7 +198,7 @@ dns_done: ip6hr->hop_limit = 255; len += sizeof(*ehr) + sizeof(*ip6hr) + sizeof(*ihr); - memcpy(ehr->h_dest, eh->h_source, ETH_ALEN); + memcpy(ehr->h_dest, eh_source, ETH_ALEN); memcpy(ehr->h_source, c->mac, ETH_ALEN); ehr->h_proto = htons(ETH_P_IPV6); diff --git a/ndp.h b/ndp.h index 918fb66..a26673e 100644 --- a/ndp.h +++ b/ndp.h @@ -3,4 +3,5 @@ * Author: Stefano Brivio <sbrivio(a)redhat.com> */ -int ndp(struct ctx *c, struct ethhdr *eh, size_t len); +int ndp(struct ctx *c, struct icmp6hdr *ih, unsigned char *eh_source, + struct in6_addr *saddr); diff --git a/packet.c b/packet.c new file mode 100644 index 0000000..876a342 --- /dev/null +++ b/packet.c @@ -0,0 +1,134 @@ +// SPDX-License-Identifier: AGPL-3.0-or-later + +/* PASST - Plug A Simple Socket Transport + * for qemu/UNIX domain socket mode + * + * PASTA - Pack A Subtle Tap Abstraction + * for network namespace/tap device mode + * + * packet.c - Packet abstraction: add packets to pool, flush, get packet data + * + * Copyright (c) 2020-2021 Red Hat GmbH + * Author: Stefano Brivio <sbrivio(a)redhat.com> + */ + +#include <limits.h> +#include <stddef.h> +#include <stdint.h> + +#include <netinet/ip6.h> + +#include "packet.h" +#include "util.h" + +/** + * packet_add_do() - Add data as packet descriptor to given pool + * @p: Existing pool + * @len: Length of new descriptor + * @start: Start of data + * @func: For tracing: name of calling function, NULL means no trace() + * @line: For tracing: caller line of 
function call + */ +void packet_add_do(struct pool *p, size_t len, const char *start, + const char *func, const int line) +{ + size_t index = p->count; + + if (index >= p->size) { + trace("add packet index %lu to pool with size %lu, %s:%i", + index, p->size, func, line); + return; + } + + if (start < p->buf) { + trace("add packet start %p before buffer start %p, %s:%i", + start, p->buf, func, line); + return; + } + + if (start + len > p->buf + p->buf_size) { + trace("add packet start %p, length: %lu, buffer end %p, %s:%i", + start, len, p->buf + p->buf_size, func, line); + return; + } + + if (len > UINT16_MAX) { + trace("add packet length %lu, %s:%i", len, func, line); + return; + } + + if ((intptr_t)start - (intptr_t)p->buf > UINT32_MAX) { + trace("add packet start %p, buffer start %p, %s:%i", + start, p->buf, func, line); + return; + } + + p->pkt[index].offset = start - p->buf; + p->pkt[index].len = len; + + p->count++; +} + +/** + * packet_get_do() - Get data range from packet descriptor from given pool + * @p: Packet pool + * @index: Index of packet descriptor in pool + * @offset: Offset of data range in packet descriptor + * @len: Length of desired data range + * @left: Length of available data after range, set on return, can be NULL + * @func: For tracing: name of calling function, NULL means no trace() + * @line: For tracing: caller line of function call + * + * Return: pointer to start of data range, NULL on invalid range or descriptor + */ +void *packet_get_do(struct pool *p, size_t index, size_t offset, size_t len, + size_t *left, const char *func, const int line) +{ + if (index >= p->size || index >= p->count) { + if (func) { + trace("packet %lu from pool size: %lu, count: %lu, " + "%s:%i", index, p->size, p->count, func, line); + } + return NULL; + } + + if (len > UINT16_MAX || len + offset > UINT32_MAX) { + if (func) { + trace("packet data length %lu, offset %lu, %s:%i", + len, offset, func, line); + } + return NULL; + } + + if (p->pkt[index].offset + len + 
offset > p->buf_size) { + if (func) { + trace("packet offset plus length %lu from size %lu, " + "%s:%i", p->pkt[index].offset + len + offset, + p->buf_size, func, line); + } + return NULL; + } + + if (len + offset > p->pkt[index].len) { + if (func) { + trace("data length %lu, offset %lu from length %lu, " + "%s:%i", len, offset, p->pkt[index].len, + func, line); + } + return NULL; + } + + if (left) + *left = p->pkt[index].len - offset - len; + + return p->buf + p->pkt[index].offset + offset; +} + +/** + * pool_flush() - Flush a packet pool + * @p: Pointer to packet pool + */ +void pool_flush(struct pool *p) +{ + p->count = 0; +} diff --git a/packet.h b/packet.h new file mode 100644 index 0000000..ec5f3c6 --- /dev/null +++ b/packet.h @@ -0,0 +1,77 @@ +/* SPDX-License-Identifier: AGPL-3.0-or-later + * Copyright (c) 2022 Red Hat GmbH + * Author: Stefano Brivio <sbrivio(a)redhat.com> + */ + +#ifndef PACKET_H +#define PACKET_H + +/** + * struct desc - Generic offset-based descriptor within buffer + * @offset: Offset of descriptor relative to buffer start, 32-bit limit + * @len: Length of descriptor, host order, 16-bit limit + */ +struct desc { + uint32_t offset; + uint16_t len; +}; + +/** + * struct pool - Generic pool of packets stored in a buffer + * @buf: Buffer storing packet descriptors + * @buf_size: Total size of buffer + * @size: Number of usable descriptors for the pool + * @count: Number of used descriptors for the pool + * @pkt: Descriptors: never actually used with UINT_MAX, see macros below + */ +struct pool { + char *buf; + size_t buf_size; + size_t size; + size_t count; + struct desc pkt[UINT_MAX]; +}; + +void packet_add_do(struct pool *p, size_t len, const char *start, + const char *func, const int line); +void *packet_get_do(struct pool *p, size_t index, size_t offset, size_t len, + size_t *left, const char *func, const int line); +void pool_flush(struct pool *p); + +#define packet_add(p, len, start) \ + packet_add_do(p, len, start, __func__, __LINE__)
 + +#define packet_get(p, index, offset, len, left) \ + packet_get_do(p, index, offset, len, left, __func__, __LINE__) + +#define packet_get_try(p, index, offset, len, left) \ + packet_get_do(p, index, offset, len, left, NULL, 0) + +#define PACKET_POOL_DECL(_name, _size, _buf) \ +struct _name ## _t { \ + char *buf; \ + size_t buf_size; \ + size_t size; \ + size_t count; \ + struct desc pkt[_size]; \ +} + +#define PACKET_POOL_INIT_NOCAST(_size, _buf, _buf_size) \ +{ \ + .buf_size = _buf_size, \ + .buf = _buf, \ + .size = _size, \ +} + +#define PACKET_POOL(name, size, buf, buf_size) \ + PACKET_POOL_DECL(name, size, buf) name = \ + PACKET_POOL_INIT_NOCAST(size, buf, buf_size) + +#define PACKET_POOL_INIT(name, size, buf, buf_size) \ + (struct name ## _t) PACKET_POOL_INIT_NOCAST(size, buf, buf_size) + +#define PACKET_POOL_P(name, size, buf, buf_size) \ + PACKET_POOL(name ## _pool, size, buf, buf_size); \ + struct pool *name = (struct pool *)&name ## _pool + +#endif /* PACKET_H */ diff --git a/passt.h b/passt.h index 9ea8f8d..cd28973 100644 --- a/passt.h +++ b/passt.h @@ -28,6 +28,7 @@ struct tap_l4_msg { union epoll_ref; +#include "packet.h" #include "icmp.h" #include "tcp.h" #include "udp.h" diff --git a/tap.c b/tap.c index 59a87f9..5fcd7ca 100644 --- a/tap.c +++ b/tap.c @@ -51,10 +51,11 @@ #include "pcap.h" #include "netlink.h" #include "pasta.h" +#include "packet.h" /* IPv4 (plus ARP) and IPv6 message batches from tap/guest to IP handlers */ -static struct tap_msg seq4[TAP_MSGS]; -static struct tap_msg seq6[TAP_MSGS]; +static PACKET_POOL_P(pool_tap4, TAP_MSGS, pkt_buf, sizeof(pkt_buf)); +static PACKET_POOL_P(pool_tap6, TAP_MSGS, pkt_buf, sizeof(pkt_buf)); /** * tap_send() - Send frame, with qemu socket header if needed @@ -202,6 +203,8 @@ void tap_ip_send(struct ctx *c, struct in6_addr *src, uint8_t proto, } } +PACKET_POOL_DECL(pool_l4, UIO_MAXIOV, pkt_buf); + /** * struct l4_seq4_t - Message sequence for one protocol handler call, IPv4 * @msgs: Count of messages in
sequence @@ -212,8 +215,7 @@ void tap_ip_send(struct ctx *c, struct in6_addr *src, uint8_t proto, * @daddr: Destination address * @msg: Array of messages that can be handled in a single call */ -static struct tap_l4_seq4 { - uint16_t msgs; +static struct tap4_l4_t { uint8_t protocol; uint16_t source; @@ -222,8 +224,8 @@ static struct tap_l4_seq4 { uint32_t saddr; uint32_t daddr; - struct tap_l4_msg msg[UIO_MAXIOV]; -} l4_seq4[UIO_MAXIOV /* Arbitrary: TAP_MSGS in theory, so limit in users */]; + struct pool_l4_t p; +} tap4_l4[UIO_MAXIOV /* Arbitrary: TAP_MSGS in theory, so limit in users */]; /** * struct l4_seq6_t - Message sequence for one protocol handler call, IPv6 @@ -235,8 +237,7 @@ static struct tap_l4_seq4 { * @daddr: Destination address * @msg: Array of messages that can be handled in a single call */ -static struct tap_l4_seq6 { - uint16_t msgs; +static struct tap6_l4_t { uint8_t protocol; uint16_t source; @@ -245,8 +246,8 @@ static struct tap_l4_seq6 { struct in6_addr saddr; struct in6_addr daddr; - struct tap_l4_msg msg[UIO_MAXIOV]; -} l4_seq6[UIO_MAXIOV /* Arbitrary: TAP_MSGS in theory, so limit in users */]; + struct pool_l4_t p; +} tap6_l4[UIO_MAXIOV /* Arbitrary: TAP_MSGS in theory, so limit in users */]; /** * tap_packet_debug() - Print debug message for packet(s) from guest/tap @@ -258,8 +259,8 @@ static struct tap_l4_seq6 { * @count: Count of packets in this sequence */ static void tap_packet_debug(struct iphdr *iph, struct ipv6hdr *ip6h, - struct tap_l4_seq4 *seq4, uint8_t proto6, - struct tap_l4_seq6 *seq6, int count) + struct tap4_l4_t *seq4, uint8_t proto6, + struct tap6_l4_t *seq6, int count) { char buf6s[INET6_ADDRSTRLEN], buf6d[INET6_ADDRSTRLEN]; char buf4s[INET_ADDRSTRLEN], buf4d[INET_ADDRSTRLEN]; @@ -283,14 +284,15 @@ static void tap_packet_debug(struct iphdr *iph, struct ipv6hdr *ip6h, } if (proto == IPPROTO_TCP || proto == IPPROTO_UDP) { - trace("protocol %i from tap: %s:%i -> %s:%i (%i packet%s)", - proto, seq4 ? 
buf4s : buf6s, + trace("tap: protocol %i, %s%s%s:%i -> %s%s%s:%i (%i packet%s)", + proto, + seq4 ? "" : "[", seq4 ? buf4s : buf6s, seq4 ? "" : "]", ntohs(seq4 ? seq4->source : seq6->source), - seq4 ? buf4d : buf6d, + seq4 ? "" : "[", seq4 ? buf4d : buf6d, seq4 ? "" : "]", ntohs(seq4 ? seq4->dest : seq6->dest), count, count == 1 ? "" : "s"); } else { - trace("protocol %i from tap: %s -> %s (%i packet%s)", + trace("tap: protocol %i, %s -> %s (%i packet%s)", proto, iph ? buf4s : buf6s, iph ? buf4d : buf6d, count, count == 1 ? "" : "s"); } @@ -299,78 +301,83 @@ static void tap_packet_debug(struct iphdr *iph, struct ipv6hdr *ip6h, /** * tap4_handler() - IPv4 and ARP packet handler for tap file descriptor * @c: Execution context - * @msg: Array of messages with IPv4 or ARP protocol - * @count: Count of messages + * @in: Ingress packet pool, packets with Ethernet headers * @now: Current timestamp * * Return: count of packets consumed by handlers */ -static int tap4_handler(struct ctx *c, struct tap_msg *msg, size_t count, - struct timespec *now) +static int tap4_handler(struct ctx *c, struct pool *in, struct timespec *now) { unsigned int i, j, seq_count; - struct tap_l4_msg *l4_msg; - struct tap_l4_seq4 *seq; - size_t len, l4_len; - struct ethhdr *eh; - struct iphdr *iph; - struct udphdr *uh; - char *l4h; + struct tap4_l4_t *seq; - if (!c->v4) - return count; + if (!c->v4 || !in->count) + return in->count; i = 0; resume: - for (seq_count = 0, seq = NULL; i < count; i++) { - eh = (struct ethhdr *)(pkt_buf + msg[i].pkt_buf_offset); - len = msg[i].len; + for (seq_count = 0, seq = NULL; i < in->count; i++) { + size_t l2_len, l3_len, hlen, l4_len; + struct ethhdr *eh; + struct iphdr *iph; + struct udphdr *uh; + char *l4h; - if (len < sizeof(*eh)) - continue; + packet_get(in, i, 0, 0, &l2_len); - if (ntohs(eh->h_proto) == ETH_P_ARP && arp(c, eh, len)) + eh = packet_get(in, i, 0, sizeof(*eh), &l3_len); + if (!eh) continue; + if (ntohs(eh->h_proto) == ETH_P_ARP) { + 
PACKET_POOL_P(pkt, 1, in->buf, sizeof(pkt_buf)); - if (len < sizeof(*eh) + sizeof(*iph)) + packet_add(pkt, l2_len, (char *)eh); + arp(c, pkt); continue; + } - iph = (struct iphdr *)(eh + 1); - if ((size_t)iph->ihl * 4 + sizeof(*eh) > len) + iph = packet_get(in, i, sizeof(*eh), sizeof(*iph), NULL); + if (!iph) continue; - if ((size_t)iph->ihl * 4 < (int)sizeof(*iph)) + + hlen = iph->ihl * 4UL; + if (hlen < sizeof(*iph) || htons(iph->tot_len) != l3_len || + hlen > l3_len) continue; + l4_len = l3_len - hlen; + if (iph->saddr && c->addr4_seen != iph->saddr) { c->addr4_seen = iph->saddr; proto_update_l2_buf(NULL, NULL, &c->addr4_seen); } - l4h = (char *)iph + (size_t)iph->ihl * 4; - l4_len = len - ((intptr_t)l4h - (intptr_t)eh); + l4h = packet_get(in, i, sizeof(*eh) + hlen, l4_len, NULL); + if (!l4h) + continue; if (iph->protocol == IPPROTO_ICMP) { - struct tap_l4_msg icmp_msg = { l4h - pkt_buf, - l4_len }; + PACKET_POOL_P(pkt, 1, in->buf, sizeof(pkt_buf)); - if (l4_len < sizeof(struct icmphdr)) + if (c->no_icmp) continue; - tap_packet_debug(iph, NULL, NULL, 0, NULL, 1); - if (!c->no_icmp) { - icmp_tap_handler(c, AF_INET, &iph->daddr, - &icmp_msg, 1, now); - } + packet_add(pkt, l4_len, l4h); + icmp_tap_handler(c, AF_INET, &iph->daddr, pkt, now); continue; } - if (l4_len < sizeof(*uh)) + uh = packet_get(in, i, sizeof(*eh) + hlen, sizeof(*uh), NULL); + if (!uh) continue; - uh = (struct udphdr *)l4h; + if (iph->protocol == IPPROTO_UDP) { + PACKET_POOL_P(pkt, 1, in->buf, sizeof(pkt_buf)); - if (iph->protocol == IPPROTO_UDP && dhcp(c, eh, len)) - continue; + packet_add(pkt, l2_len, (char *)eh); + if (dhcp(c, pkt)) + continue; + } if (iph->protocol != IPPROTO_TCP && iph->protocol != IPPROTO_UDP) { @@ -392,147 +399,145 @@ resume: seq->daddr = iph->daddr; \ } while (0) - if (seq && L4_MATCH(iph, uh, seq) && seq->msgs < UIO_MAXIOV) + if (seq && L4_MATCH(iph, uh, seq) && seq->p.count < UIO_MAXIOV) goto append; - for (seq = l4_seq4 + seq_count - 1; seq >= l4_seq4; seq--) { + for 
(seq = tap4_l4 + seq_count - 1; seq >= tap4_l4; seq--) { if (L4_MATCH(iph, uh, seq)) { - if (seq->msgs >= UIO_MAXIOV) + if (seq->p.count >= UIO_MAXIOV) seq = NULL; break; } } - if (!seq || seq < l4_seq4) { - seq = l4_seq4 + seq_count++; + if (!seq || seq < tap4_l4) { + seq = tap4_l4 + seq_count++; L4_SET(iph, uh, seq); - seq->msgs = 0; + pool_flush((struct pool *)&seq->p); } #undef L4_MATCH #undef L4_SET append: - l4_msg = &seq->msg[seq->msgs++]; - - l4_msg->pkt_buf_offset = l4h - pkt_buf; - l4_msg->l4_len = l4_len; + packet_add((struct pool *)&seq->p, l4_len, l4h); if (seq_count == UIO_MAXIOV) break; /* Resume after flushing if i < count */ } - for (j = 0, seq = l4_seq4; j < seq_count; j++, seq++) { - int n = seq->msgs; - - l4_msg = seq->msg; + for (j = 0, seq = tap4_l4; j < seq_count; j++, seq++) { + struct pool *p = (struct pool *)&seq->p; + uint32_t *da = &seq->daddr; + size_t n = p->count; tap_packet_debug(NULL, NULL, seq, 0, NULL, n); if (seq->protocol == IPPROTO_TCP) { if (c->no_tcp) continue; - while ((n -= tcp_tap_handler(c, AF_INET, &seq->daddr, - l4_msg, n, now))); + while ((n -= tcp_tap_handler(c, AF_INET, da, p, now))); } else if (seq->protocol == IPPROTO_UDP) { if (c->no_udp) continue; - while ((n -= udp_tap_handler(c, AF_INET, &seq->daddr, - l4_msg, n, now))); + while ((n -= udp_tap_handler(c, AF_INET, da, p, now))); } } - if (i < count) + if (i < in->count) goto resume; - return count; + return in->count; } /** * tap6_handler() - IPv6 packet handler for tap file descriptor * @c: Execution context - * @msg: Array of messages with IPv6 protocol - * @count: Count of messages + * @in: Ingress packet pool, packets with Ethernet headers * @now: Current timestamp * * Return: count of packets consumed by handlers */ -static int tap6_handler(struct ctx *c, struct tap_msg *msg, size_t count, - struct timespec *now) +static int tap6_handler(struct ctx *c, struct pool *in, struct timespec *now) { unsigned int i, j, seq_count = 0; - struct tap_l4_msg *l4_msg; - 
struct tap_l4_seq6 *seq; - struct ipv6hdr *ip6h; - size_t len, l4_len; - struct ethhdr *eh; - struct udphdr *uh; - uint8_t proto; - char *l4h; + struct tap6_l4_t *seq; - if (!c->v6) - return count; + if (!c->v6 || !in->count) + return in->count; i = 0; resume: - for (seq_count = 0, seq = NULL; i < count; i++) { - eh = (struct ethhdr *)(pkt_buf + msg[i].pkt_buf_offset); - len = msg[i].len; + for (seq_count = 0, seq = NULL; i < in->count; i++) { + size_t l4_len, plen, check; + struct in6_addr *saddr, *daddr; + struct ipv6hdr *ip6h; + struct ethhdr *eh; + struct udphdr *uh; + uint8_t proto; + char *l4h; + + eh = packet_get(in, i, 0, sizeof(*eh), NULL); + if (!eh) + continue; - if (len < sizeof(*eh)) + ip6h = packet_get(in, i, sizeof(*eh), sizeof(*ip6h), &check); + if (!ip6h) continue; - if (len < sizeof(*eh) + sizeof(*ip6h)) - return 1; + saddr = &ip6h->saddr; + daddr = &ip6h->daddr; - ip6h = (struct ipv6hdr *)(eh + 1); + plen = ntohs(ip6h->payload_len); + if (plen != check) + continue; - if (IN6_IS_ADDR_LINKLOCAL(&ip6h->saddr)) { - c->addr6_ll_seen = ip6h->saddr; + if (!(l4h = ipv6_l4hdr(in, i, sizeof(*eh), &proto, &l4_len))) + continue; + + if (IN6_IS_ADDR_LINKLOCAL(saddr)) { + c->addr6_ll_seen = *saddr; if (IN6_IS_ADDR_UNSPECIFIED(&c->addr6_seen)) { - c->addr6_seen = ip6h->saddr; + c->addr6_seen = *saddr; } } else { - c->addr6_seen = ip6h->saddr; + c->addr6_seen = *saddr; } - if (ntohs(ip6h->payload_len) > - len - sizeof(*eh) - sizeof(*ip6h)) - continue; - - if (!(l4h = ipv6_l4hdr(ip6h, &proto))) - continue; - - l4_len = len - ((intptr_t)l4h - (intptr_t)eh); - if (proto == IPPROTO_ICMPV6) { - struct tap_l4_msg icmpv6_msg = { l4h - pkt_buf, - l4_len }; + PACKET_POOL_P(pkt, 1, in->buf, sizeof(pkt_buf)); + + if (c->no_icmp) + continue; if (l4_len < sizeof(struct icmp6hdr)) continue; - if (ndp(c, eh, len)) + if (ndp(c, (struct icmp6hdr *)l4h, eh->h_source, saddr)) continue; tap_packet_debug(NULL, ip6h, NULL, proto, NULL, 1); - if (!c->no_icmp) { - icmp_tap_handler(c, 
AF_INET6, &ip6h->daddr, - &icmpv6_msg, 1, now); - } + + packet_add(pkt, l4_len, l4h); + icmp_tap_handler(c, AF_INET6, daddr, pkt, now); continue; } if (l4_len < sizeof(*uh)) continue; - uh = (struct udphdr *)l4h; - if (proto == IPPROTO_UDP && dhcpv6(c, eh, len)) - continue; + if (proto == IPPROTO_UDP) { + PACKET_POOL_P(pkt, 1, in->buf, sizeof(pkt_buf)); + + packet_add(pkt, l4_len, l4h); - ip6h->saddr = c->addr6; + if (dhcpv6(c, pkt, saddr, daddr)) + continue; + } + + *saddr = c->addr6; if (proto != IPPROTO_TCP && proto != IPPROTO_UDP) { tap_packet_debug(NULL, ip6h, NULL, proto, NULL, 1); @@ -542,73 +547,68 @@ resume: #define L4_MATCH(ip6h, proto, uh, seq) \ (seq->protocol == proto && \ seq->source == uh->source && seq->dest == uh->dest && \ - IN6_ARE_ADDR_EQUAL(&seq->saddr, &ip6h->saddr) && \ - IN6_ARE_ADDR_EQUAL(&seq->daddr, &ip6h->daddr)) + IN6_ARE_ADDR_EQUAL(&seq->saddr, saddr) && \ + IN6_ARE_ADDR_EQUAL(&seq->daddr, daddr)) #define L4_SET(ip6h, proto, uh, seq) \ do { \ seq->protocol = proto; \ seq->source = uh->source; \ seq->dest = uh->dest; \ - seq->saddr = ip6h->saddr; \ - seq->daddr = ip6h->daddr; \ + seq->saddr = *saddr; \ + seq->daddr = *daddr; \ } while (0) if (seq && L4_MATCH(ip6h, proto, uh, seq) && - seq->msgs < UIO_MAXIOV) + seq->p.count < UIO_MAXIOV) goto append; - for (seq = l4_seq6 + seq_count - 1; seq >= l4_seq6; seq--) { + for (seq = tap6_l4 + seq_count - 1; seq >= tap6_l4; seq--) { if (L4_MATCH(ip6h, proto, uh, seq)) { - if (seq->msgs >= UIO_MAXIOV) + if (seq->p.count >= UIO_MAXIOV) seq = NULL; break; } } - if (!seq || seq < l4_seq6) { - seq = l4_seq6 + seq_count++; + if (!seq || seq < tap6_l4) { + seq = tap6_l4 + seq_count++; L4_SET(ip6h, proto, uh, seq); - seq->msgs = 0; + pool_flush((struct pool *)&seq->p); } #undef L4_MATCH #undef L4_SET append: - l4_msg = &seq->msg[seq->msgs++]; - - l4_msg->pkt_buf_offset = l4h - pkt_buf; - l4_msg->l4_len = l4_len; + packet_add((struct pool *)&seq->p, l4_len, l4h); if (seq_count == UIO_MAXIOV) break; /* 
Resume after flushing if i < count */ } - for (j = 0, seq = l4_seq6; j < seq_count; j++, seq++) { - int n = seq->msgs; - - l4_msg = seq->msg; + for (j = 0, seq = tap6_l4; j < seq_count; j++, seq++) { + struct pool *p = (struct pool *)&seq->p; + struct in6_addr *da = &seq->daddr; + size_t n = p->count; tap_packet_debug(NULL, NULL, NULL, seq->protocol, seq, n); if (seq->protocol == IPPROTO_TCP) { if (c->no_tcp) continue; - while ((n -= tcp_tap_handler(c, AF_INET6, &seq->daddr, - l4_msg, n, now))); + while ((n -= tcp_tap_handler(c, AF_INET6, da, p, now))); } else if (seq->protocol == IPPROTO_UDP) { if (c->no_udp) continue; - while ((n -= udp_tap_handler(c, AF_INET6, &seq->daddr, - l4_msg, n, now))); + while ((n -= udp_tap_handler(c, AF_INET6, da, p, now))); } } - if (i < count) + if (i < in->count) goto resume; - return count; + return in->count; } /** @@ -620,14 +620,16 @@ append: */ static int tap_handler_passt(struct ctx *c, struct timespec *now) { - int seq4_i, seq6_i; struct ethhdr *eh; ssize_t n, rem; char *p; redo: p = pkt_buf; - seq4_i = seq6_i = rem = 0; + rem = 0; + + pool_flush(pool_tap4); + pool_flush(pool_tap6); n = recv(c->fd_tap, p, TAP_BUF_FILL, MSG_DONTWAIT); if (n < 0) { @@ -673,12 +675,10 @@ redo: switch (ntohs(eh->h_proto)) { case ETH_P_ARP: case ETH_P_IP: - seq4[seq4_i].pkt_buf_offset = p - pkt_buf; - seq4[seq4_i++].len = len; + packet_add(pool_tap4, len, p); break; case ETH_P_IPV6: - seq6[seq6_i].pkt_buf_offset = p - pkt_buf; - seq6[seq6_i++].len = len; + packet_add(pool_tap6, len, p); break; default: break; @@ -689,11 +689,8 @@ next: n -= len; } - if (seq4_i) - tap4_handler(c, seq4, seq4_i, now); - - if (seq6_i) - tap6_handler(c, seq6, seq6_i, now); + tap4_handler(c, pool_tap4, now); + tap6_handler(c, pool_tap6, now); /* We can't use EPOLLET otherwise. 
*/ if (rem) @@ -712,8 +709,10 @@ next: static int tap_handler_pasta(struct ctx *c, struct timespec *now) { ssize_t n = 0, len; - int ret, seq4_i = 0, seq6_i = 0; + int ret; + pool_flush(pool_tap4); + pool_flush(pool_tap6); restart: while ((len = read(c->fd_tap, pkt_buf + n, TAP_BUF_BYTES - n)) > 0) { struct ethhdr *eh = (struct ethhdr *)(pkt_buf + n); @@ -733,12 +732,10 @@ restart: switch (ntohs(eh->h_proto)) { case ETH_P_ARP: case ETH_P_IP: - seq4[seq4_i].pkt_buf_offset = n; - seq4[seq4_i++].len = len; + packet_add(pool_tap4, len, pkt_buf + n); break; case ETH_P_IPV6: - seq6[seq6_i].pkt_buf_offset = n; - seq6[seq6_i++].len = len; + packet_add(pool_tap6, len, pkt_buf + n); break; default: break; @@ -752,11 +749,8 @@ restart: ret = errno; - if (seq4_i) - tap4_handler(c, seq4, seq4_i, now); - - if (seq6_i) - tap6_handler(c, seq6, seq6_i, now); + tap4_handler(c, pool_tap4, now); + tap6_handler(c, pool_tap6, now); if (len > 0 || ret == EAGAIN) return 0; @@ -920,6 +914,15 @@ static void tap_sock_tun_init(struct ctx *c) */ void tap_sock_init(struct ctx *c) { + int i; + + for (i = 0; i < UIO_MAXIOV; i++) { + tap4_l4[i].p = PACKET_POOL_INIT(pool_l4, UIO_MAXIOV, pkt_buf, + sizeof(pkt_buf)); + tap6_l4[i].p = PACKET_POOL_INIT(pool_l4, UIO_MAXIOV, pkt_buf, + sizeof(pkt_buf)); + } + if (c->fd_tap != -1) { epoll_ctl(c->epollfd, EPOLL_CTL_DEL, c->fd_tap, NULL); close(c->fd_tap); diff --git a/tcp.c b/tcp.c index e0916e0..a54afe4 100644 --- a/tcp.c +++ b/tcp.c @@ -303,6 +303,9 @@ #define TCP_FRAMES \ (c->mode == MODE_PASST ? 
TCP_FRAMES_MEM : 1) +#define TCP_FILE_PRESSURE 30 /* % of c->nofile */ +#define TCP_CONN_PRESSURE 30 /* % of c->tcp.conn_count */ + #define TCP_HASH_BUCKET_BITS (TCP_CONN_INDEX_BITS + 1) #define TCP_HASH_TABLE_LOAD 70 /* % */ #define TCP_HASH_TABLE_SIZE (TCP_MAX_CONNS * 100 / \ @@ -440,6 +443,7 @@ struct tcp_conn { #define TCP_MAX_RETRANS ((1U << TCP_RETRANS_BITS) - 1) #define TCP_WS_BITS 4 /* RFC 7323 */ +#define TCP_WS_MAX 14 uint8_t ws_from_tap :TCP_WS_BITS; uint8_t ws_to_tap :TCP_WS_BITS; @@ -476,7 +480,6 @@ struct tcp_conn { uint32_t seq_init_from_tap; }; -#define CONN_IS_CLOSED(conn) (conn->events == CLOSED) #define CONN_IS_CLOSING(conn) \ ((conn->events & ESTABLISHED) && \ (conn->events & (SOCK_FIN_RCVD | TAP_FIN_RCVD))) @@ -699,7 +702,7 @@ static uint32_t tcp_conn_epoll_events(uint8_t events, uint8_t conn_flags) return EPOLLET; if (conn_flags & STALLED) - return EPOLLIN | EPOLLRDHUP | EPOLLET; + return EPOLLIN | EPOLLOUT | EPOLLRDHUP | EPOLLET; return EPOLLIN | EPOLLRDHUP; } @@ -733,8 +736,11 @@ static int tcp_epoll_ctl(struct ctx *c, struct tcp_conn *conn) .r.p.tcp.tcp.v6 = CONN_V6(conn) }; struct epoll_event ev = { .data.u64 = ref.u64 }; - if (CONN_IS_CLOSED(conn)) { - epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->sock, &ev); + if (conn->events == CLOSED) { + if (conn->flags & IN_EPOLL) + epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->sock, &ev); + if (conn->timer != -1) + epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->timer, &ev); return 0; } @@ -745,6 +751,18 @@ static int tcp_epoll_ctl(struct ctx *c, struct tcp_conn *conn) conn->flags |= IN_EPOLL; /* No need to log this */ + if (conn->timer != -1) { + union epoll_ref ref_t = { .r.proto = IPPROTO_TCP, + .r.s = conn->sock, + .r.p.tcp.tcp.timer = 1, + .r.p.tcp.tcp.index = conn - tc }; + struct epoll_event ev_t = { .data.u64 = ref_t.u64, + .events = EPOLLIN | EPOLLET }; + + if (epoll_ctl(c->epollfd, EPOLL_CTL_MOD, conn->timer, &ev_t)) + return -errno; + } + return 0; } @@ -759,6 +777,9 @@ static void 
tcp_timer_ctl(struct ctx *c, struct tcp_conn *conn) { struct itimerspec it = { { 0 }, { 0 } }; + if (conn->events == CLOSED) + return; + if (conn->timer == -1) { union epoll_ref ref = { .r.proto = IPPROTO_TCP, .r.s = conn->sock, @@ -783,15 +804,11 @@ static void tcp_timer_ctl(struct ctx *c, struct tcp_conn *conn) } } - if (conn->events == CLOSED) { - it.it_value.tv_nsec = 1; - } else if (conn->flags & ACK_TO_TAP_DUE) { + if (conn->flags & ACK_TO_TAP_DUE) { it.it_value.tv_nsec = (long)ACK_INTERVAL * 1000 * 1000; } else if (conn->flags & ACK_FROM_TAP_DUE) { if (!(conn->events & ESTABLISHED)) it.it_value.tv_sec = SYN_TIMEOUT; - else if (conn->events & TAP_FIN_SENT) - it.it_value.tv_sec = FIN_TIMEOUT; else it.it_value.tv_sec = ACK_TIMEOUT; } else if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED)) { @@ -834,7 +851,9 @@ static void conn_flag_do(struct ctx *c, struct tcp_conn *conn, if (flag == STALLED || flag == ~STALLED) tcp_epoll_ctl(c, conn); - if (flag == ACK_FROM_TAP_DUE || flag == ACK_TO_TAP_DUE) + if (flag == ACK_FROM_TAP_DUE || flag == ACK_TO_TAP_DUE || + (flag == ~ACK_FROM_TAP_DUE && (conn->flags & ACK_TO_TAP_DUE)) || + (flag == ~ACK_TO_TAP_DUE && (conn->flags & ACK_FROM_TAP_DUE))) tcp_timer_ctl(c, conn); } @@ -888,7 +907,7 @@ static void conn_event_do(struct ctx *c, struct tcp_conn *conn, else tcp_epoll_ctl(c, conn); - if (event == CLOSED || CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED)) + if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED)) tcp_timer_ctl(c, conn); } @@ -1182,36 +1201,32 @@ static void tcp_sock6_iov_init(void) /** * tcp_opt_get() - Get option, and value if any, from TCP header - * @th: Pointer to TCP header - * @len: Length of buffer, including TCP header + * @opts: Pointer to start of TCP options in header + * @len: Length of buffer, excluding TCP header -- NOT checked here! 
* @type_find: Option type to look for * @optlen_set: Optional, filled with option length if passed * @value_set: Optional, set to start of option value if passed * * Return: option value, meaningful for up to 4 bytes, -1 if not found */ -static int tcp_opt_get(struct tcphdr *th, size_t len, uint8_t type_find, +static int tcp_opt_get(char *opts, size_t len, uint8_t type_find, uint8_t *optlen_set, char **value_set) { uint8_t type, optlen; - char *p; - - if (len > (size_t)th->doff * 4) - len = (size_t)th->doff * 4; - len -= sizeof(*th); - p = (char *)(th + 1); + if (!len) + return -1; - for (; len >= 2; p += optlen, len -= optlen) { - switch (*p) { + for (; len >= 2; opts += optlen, len -= optlen) { + switch (*opts) { case OPT_EOL: return -1; case OPT_NOP: optlen = 1; break; default: - type = *(p++); - optlen = *(p++) - 2; + type = *(opts++); + optlen = *(opts++) - 2; len -= 2; if (type != type_find) @@ -1220,17 +1235,17 @@ static int tcp_opt_get(struct tcphdr *th, size_t len, uint8_t type_find, if (optlen_set) *optlen_set = optlen; if (value_set) - *value_set = p; + *value_set = opts; switch (optlen) { case 0: return 0; case 1: - return *p; + return *opts; case 2: - return ntohs(*(uint16_t *)p); + return ntohs(*(uint16_t *)opts); default: - return ntohl(*(uint32_t *)p); + return ntohl(*(uint32_t *)opts); } } } @@ -1415,12 +1430,12 @@ static void tcp_table_compact(struct ctx *c, struct tcp_conn *hole) if ((hole - tc) == --c->tcp.conn_count) { debug("TCP: hash table compaction: index %i (%p) was max index", hole - tc, hole); + memset(hole, 0, sizeof(*hole)); return; } from = CONN(c->tcp.conn_count); memcpy(hole, from, sizeof(*hole)); - from->flags = from->events = 0; to = hole; tcp_hash_update(from, to); @@ -1430,25 +1445,23 @@ static void tcp_table_compact(struct ctx *c, struct tcp_conn *hole) debug("TCP: hash table compaction: old index %i, new index %i, " "sock %i, from: %p, to: %p", from - tc, to - tc, from->sock, from, to); + + memset(from, 0, sizeof(*from)); } 
/** - * tcp_conn_destroy() - Close connection, drop from epoll file descriptor + * tcp_conn_destroy() - Close sockets, trigger hash table removal and compaction * @c: Execution context * @conn: Connection pointer */ static void tcp_conn_destroy(struct ctx *c, struct tcp_conn *conn) { - if (CONN_IS_CLOSED(conn)) - return; - - conn_event(c, conn, CLOSED); - conn->flags = 0; close(conn->sock); + if (conn->timer != -1) + close(conn->timer); - /* Removal from hash table and connection table compaction deferred to - * timer. - */ + tcp_hash_remove(conn); + tcp_table_compact(c, conn); } static void tcp_rst_do(struct ctx *c, struct tcp_conn *conn); @@ -1582,9 +1595,23 @@ static void tcp_l2_data_buf_flush(struct ctx *c) */ void tcp_defer_handler(struct ctx *c) { + int max_conns = c->tcp.conn_count / 100 * TCP_CONN_PRESSURE; + int max_files = c->nofile / 100 * TCP_FILE_PRESSURE; + struct tcp_conn *conn; + tcp_l2_flags_buf_flush(c); tcp_l2_data_buf_flush(c); + tcp_splice_defer_handler(c); + + if (c->tcp.conn_count < MIN(max_files, max_conns)) + return; + + for (conn = CONN(c->tcp.conn_count - 1); conn >= tc; conn--) { + if (conn->events == CLOSED) + tcp_conn_destroy(c, conn); + } + } /** @@ -1605,13 +1632,19 @@ static size_t tcp_l2_buf_fill_headers(struct ctx *c, struct tcp_conn *conn, size_t ip_len, eth_len; #define SET_TCP_HEADER_COMMON_V4_V6(b, conn, seq) \ - do { \ - b->th.source = htons(conn->sock_port); \ - b->th.dest = htons(conn->tap_port); \ - b->th.seq = htonl(seq); \ - b->th.ack_seq = htonl(conn->seq_ack_to_tap); \ - b->th.window = htons(MIN(conn->wnd_to_tap, USHRT_MAX)); \ - } while (0) +do { \ + b->th.source = htons(conn->sock_port); \ + b->th.dest = htons(conn->tap_port); \ + b->th.seq = htonl(seq); \ + b->th.ack_seq = htonl(conn->seq_ack_to_tap); \ + if (conn->events & ESTABLISHED) { \ + b->th.window = htons(conn->wnd_to_tap); \ + } else { \ + unsigned wnd = conn->wnd_to_tap << conn->ws_to_tap; \ + \ + b->th.window = htons(MIN(wnd, USHRT_MAX)); \ + } \ +} while 
(0) if (CONN_V6(conn)) { struct tcp6_l2_buf_t *b = (struct tcp6_l2_buf_t *)p; @@ -1692,7 +1725,7 @@ static int tcp_update_seqack_wnd(struct ctx *c, struct tcp_conn *conn, conn->seq_ack_to_tap = prev_ack_to_tap; #else if ((unsigned)SNDBUF_GET(conn) < SNDBUF_SMALL || tcp_rtt_dst_low(conn) - || CONN_IS_CLOSING(conn) || conn->flags & LOCAL || force_seq) { + || CONN_IS_CLOSING(conn) || (conn->flags & LOCAL) || force_seq) { conn->seq_ack_to_tap = conn->seq_from_tap; } else if (conn->seq_ack_to_tap != conn->seq_from_tap) { if (!tinfo) { @@ -1717,12 +1750,13 @@ static int tcp_update_seqack_wnd(struct ctx *c, struct tcp_conn *conn, } if (!tinfo) { - if (prev_wnd_to_tap > WINDOW_DEFAULT) + if (prev_wnd_to_tap > WINDOW_DEFAULT) { goto out; - +} tinfo = &tinfo_new; - if (getsockopt(s, SOL_TCP, TCP_INFO, tinfo, &sl)) + if (getsockopt(s, SOL_TCP, TCP_INFO, tinfo, &sl)) { goto out; +} } #ifdef HAS_SND_WND @@ -1735,7 +1769,11 @@ static int tcp_update_seqack_wnd(struct ctx *c, struct tcp_conn *conn, } #endif - conn->wnd_to_tap = MIN(new_wnd_to_tap, MAX_WINDOW) >> conn->ws_to_tap; + new_wnd_to_tap = MIN(new_wnd_to_tap, MAX_WINDOW); + if (!(conn->events & ESTABLISHED)) + new_wnd_to_tap = MAX(new_wnd_to_tap, WINDOW_DEFAULT); + + conn->wnd_to_tap = new_wnd_to_tap >> conn->ws_to_tap; if (!conn->wnd_to_tap) conn_flag(c, conn, ACK_TO_TAP_DUE); @@ -1772,10 +1810,15 @@ static int tcp_send_flag(struct ctx *c, struct tcp_conn *conn, int flags) return 0; if (getsockopt(s, SOL_TCP, TCP_INFO, &tinfo, &sl)) { - tcp_conn_destroy(c, conn); + conn_event(c, conn, CLOSED); return -ECONNRESET; } +#ifdef HAS_SND_WND + if (!c->tcp.kernel_snd_wnd && tinfo.tcpi_snd_wnd) + c->tcp.kernel_snd_wnd = 1; +#endif + if (!(conn->flags & LOCAL)) tcp_rtt_dst_check(conn, &tinfo); @@ -1825,11 +1868,6 @@ static int tcp_send_flag(struct ctx *c, struct tcp_conn *conn, int flags) data += OPT_MSS_LEN - 2; th->doff += OPT_MSS_LEN / 4; -#ifdef HAS_SND_WND - if (!c->tcp.kernel_snd_wnd && tinfo.tcpi_snd_wnd) - 
c->tcp.kernel_snd_wnd = 1; -#endif - conn->ws_to_tap = MIN(MAX_WS, tinfo.tcpi_snd_wscale); *data++ = OPT_NOP; @@ -1854,10 +1892,6 @@ static int tcp_send_flag(struct ctx *c, struct tcp_conn *conn, int flags) NULL, conn->seq_to_tap); iov->iov_len = eth_len + sizeof(uint32_t); - /* First value is not scaled: scale now */ - if (flags & SYN) - conn->wnd_to_tap >>= conn->ws_to_tap; - if (CONN_V4(conn)) tcp4_l2_flags_buf_bytes += iov->iov_len; else @@ -1905,68 +1939,55 @@ static int tcp_send_flag(struct ctx *c, struct tcp_conn *conn, int flags) */ static void tcp_rst_do(struct ctx *c, struct tcp_conn *conn) { - if (CONN_IS_CLOSED(conn)) + if (conn->events == CLOSED) return; if (!tcp_send_flag(c, conn, RST)) - tcp_conn_destroy(c, conn); + conn_event(c, conn, CLOSED); } /** - * tcp_clamp_window() - Set window and scaling from option, clamp on socket + * tcp_get_tap_ws() - Get Window Scaling option for connection from tap/guest * @conn: Connection pointer - * @th: TCP header, from tap, can be NULL if window is passed - * @len: Buffer length, at L4, can be 0 if no header is passed - * @window: Window value, host order, unscaled, if no header is passed - * @init: Set if this is the very first segment from tap - */ -static void tcp_clamp_window(struct ctx *c, struct tcp_conn *conn, - struct tcphdr *th, int len, unsigned int window, - int init) + * @opts: Pointer to start of TCP options + * @optlen: Bytes in options: caller MUST ensure available length + */ +static void tcp_get_tap_ws(struct tcp_conn *conn, char *opts, size_t optlen) { - if (init && th) { - int ws = tcp_opt_get(th, len, OPT_WS, NULL, NULL); + int ws = tcp_opt_get(opts, optlen, OPT_WS, NULL, NULL); - conn->ws_from_tap = ws & 0xf; + if (ws >= 0 && ws <= TCP_WS_MAX) + conn->ws_from_tap = ws; + else + conn->ws_from_tap = 0; +} - /* RFC 7323, 2.2: first value is not scaled. Also, don't clamp - * yet, to avoid getting a zero scale just because we set a - * small window now. 
- */ - conn->wnd_from_tap = ntohs(th->window); - } else { - uint32_t prev_scaled = conn->wnd_from_tap << conn->ws_from_tap; +/** + * tcp_clamp_window() - Set new window for connection, clamp on socket + * @c: Execution context + * @conn: Connection pointer + * @window: Window value, host order, unscaled + */ +static void tcp_clamp_window(struct ctx *c, struct tcp_conn *conn, unsigned wnd) +{ + uint32_t prev_scaled = conn->wnd_from_tap << conn->ws_from_tap; - if (th) - window = ntohs(th->window) << conn->ws_from_tap; - else - window <<= conn->ws_from_tap; - - window = MIN(MAX_WINDOW, window); - - if (conn->flags & WND_CLAMPED) { - if (prev_scaled == window) - return; - - /* Discard +/- 1% updates to spare some syscalls. */ - if ((window > prev_scaled && - window * 99 / 100 < prev_scaled) || - (window < prev_scaled && - window * 101 / 100 > prev_scaled)) { - conn->wnd_from_tap = window >> - conn->ws_from_tap; - return; - } - } + wnd <<= conn->ws_from_tap; + wnd = MIN(MAX_WINDOW, wnd); - if (window < 256) - window = 256; + if (conn->flags & WND_CLAMPED) { + if (prev_scaled == wnd) + return; - conn->wnd_from_tap = window >> conn->ws_from_tap; - setsockopt(conn->sock, SOL_TCP, TCP_WINDOW_CLAMP, - &window, sizeof(window)); - conn_flag(c, conn, WND_CLAMPED); + /* Discard +/- 1% updates to spare some syscalls. 
*/ + if ((wnd > prev_scaled && wnd * 99 / 100 < prev_scaled) || + (wnd < prev_scaled && wnd * 101 / 100 > prev_scaled)) + return; } + + conn->wnd_from_tap = wnd >> conn->ws_from_tap; + setsockopt(conn->sock, SOL_TCP, TCP_WINDOW_CLAMP, &wnd, sizeof(wnd)); + conn_flag(c, conn, WND_CLAMPED); } /** @@ -2059,18 +2080,18 @@ static int tcp_conn_new_sock(struct ctx *c, sa_family_t af) * tcp_conn_tap_mss() - Get and clamp MSS value advertised by tap/guest * @c: Execution context * @conn: Connection pointer - * @th: TCP header send by tap/guest - * @len: L4 packet length, host order + * @opts: Pointer to start of TCP options + * @optlen: Bytes in options: caller MUST ensure available length * * Return: clamped MSS value */ static uint16_t tcp_conn_tap_mss(struct ctx *c, struct tcp_conn *conn, - struct tcphdr *th, size_t len) + char *opts, size_t optlen) { unsigned int mss; int ret; - if ((ret = tcp_opt_get(th, len, OPT_MSS, NULL, NULL)) < 0) + if ((ret = tcp_opt_get(opts, optlen, OPT_MSS, NULL, NULL)) < 0) mss = MSS_DEFAULT; else mss = ret; @@ -2091,12 +2112,13 @@ static uint16_t tcp_conn_tap_mss(struct ctx *c, struct tcp_conn *conn, * @c: Execution context * @af: Address family, AF_INET or AF_INET6 * @addr: Remote address, pointer to sin_addr or sin6_addr - * @th: TCP header from tap - * @len: Packet length at L4 + * @th: TCP header from tap: caller MUST ensure it's there + * @opts: Pointer to start of options + * @optlen: Bytes in options: caller MUST ensure available length * @now: Current timestamp */ static void tcp_conn_from_tap(struct ctx *c, int af, void *addr, - struct tcphdr *th, size_t len, + struct tcphdr *th, char *opts, size_t optlen, struct timespec *now) { struct sockaddr_in addr4 = { @@ -2142,16 +2164,21 @@ static void tcp_conn_from_tap(struct ctx *c, int af, void *addr, conn = CONN(c->tcp.conn_count++); conn->sock = s; conn->timer = -1; - conn->ws_to_tap = conn->ws_from_tap = 0; conn_event(c, conn, TAP_SYN_RCVD); conn->wnd_to_tap = WINDOW_DEFAULT; - mss = 
tcp_conn_tap_mss(c, conn, th, len); + mss = tcp_conn_tap_mss(c, conn, opts, optlen); setsockopt(s, SOL_TCP, TCP_MAXSEG, &mss, sizeof(mss)); MSS_SET(conn, mss); - tcp_clamp_window(c, conn, th, len, 0, 1); + tcp_get_tap_ws(conn, opts, optlen); + + /* RFC 7323, 2.2: first value is not scaled. Also, don't clamp yet, to + * avoid getting a zero scale just because we set a small window now. + */ + if (!(conn->wnd_from_tap = (htons(th->window) >> conn->ws_from_tap))) + conn->wnd_from_tap = 1; if (af == AF_INET) { sa = (struct sockaddr *)&addr4; @@ -2393,53 +2420,52 @@ zero_len: } /** - * tcp_data_from_tap() - tap data for established connection + * tcp_data_from_tap() - tap/guest data for established connection * @c: Execution context * @conn: Connection pointer - * @msg: Array of messages from tap - * @count: Count of messages + * @p: Pool of TCP packets, with TCP headers * * #syscalls sendmsg */ static void tcp_data_from_tap(struct ctx *c, struct tcp_conn *conn, - struct tap_l4_msg *msg, int count) + struct pool *p) { - int i, iov_i, ack = 0, fin = 0, retr = 0, keep = -1; + int i, iov_i, ack = 0, fin = 0, retr = 0, keep = -1, partial_send = 0; uint16_t max_ack_seq_wnd = conn->wnd_from_tap; uint32_t max_ack_seq = conn->seq_ack_from_tap; uint32_t seq_from_tap = conn->seq_from_tap; struct msghdr mh = { .msg_iov = tcp_iov }; - int partial_send = 0; - uint16_t len; + size_t len; ssize_t n; - for (i = 0, iov_i = 0; i < count; i++) { + for (i = 0, iov_i = 0; i < (int)p->count; i++) { uint32_t seq, seq_offset, ack_seq; struct tcphdr *th; char *data; size_t off; - th = (struct tcphdr *)(pkt_buf + msg[i].pkt_buf_offset); - len = msg[i].l4_len; - - if (len < sizeof(*th)) { + packet_get(p, i, 0, 0, &len); + th = packet_get(p, i, 0, sizeof(*th), NULL); + if (!th) { tcp_rst(c, conn); return; } - off = (size_t)th->doff * 4; + off = th->doff * 4UL; if (off < sizeof(*th) || off > len) { tcp_rst(c, conn); return; } if (th->rst) { - tcp_conn_destroy(c, conn); + conn_event(c, conn, 
CLOSED); return; } len -= off; - data = (char *)th + off; + data = packet_get(p, i, off, len, NULL); + if (!data) + continue; seq = ntohl(th->seq); ack_seq = ntohl(th->ack_seq); @@ -2509,7 +2535,7 @@ static void tcp_data_from_tap(struct ctx *c, struct tcp_conn *conn, i = keep - 1; } - tcp_clamp_window(c, conn, NULL, 0, max_ack_seq_wnd, 0); + tcp_clamp_window(c, conn, max_ack_seq_wnd); if (ack) { if (max_ack_seq == conn->seq_to_tap) { @@ -2591,14 +2617,22 @@ out: * tcp_conn_from_sock_finish() - Complete connection setup after connect() * @c: Execution context * @conn: Connection pointer - * @th: TCP header of SYN, ACK segment from tap/guest - * @len: Packet length of SYN, ACK segment at L4, host order + * @th: TCP header of SYN, ACK segment: caller MUST ensure it's there + * @opts: Pointer to start of options + * @optlen: Bytes in options: caller MUST ensure available length */ static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_conn *conn, - struct tcphdr *th, size_t len) + struct tcphdr *th, + char *opts, size_t optlen) { - tcp_clamp_window(c, conn, th, len, 0, 1); - MSS_SET(conn, tcp_conn_tap_mss(c, conn, th, len)); + tcp_clamp_window(c, conn, ntohs(th->window)); + tcp_get_tap_ws(conn, opts, optlen); + + /* First value is not scaled */ + if (!(conn->wnd_from_tap >>= conn->ws_from_tap)) + conn->wnd_from_tap = 1; + + MSS_SET(conn, tcp_conn_tap_mss(c, conn, opts, optlen)); conn->seq_init_from_tap = ntohl(th->seq) + 1; conn->seq_from_tap = conn->seq_init_from_tap; @@ -2618,32 +2652,42 @@ static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_conn *conn, * @c: Execution context * @af: Address family, AF_INET or AF_INET6 * @addr: Destination address - * @msg: Input messages - * @count: Message count + * @p: Pool of TCP packets, with TCP headers * @now: Current timestamp * * Return: count of consumed packets */ -int tcp_tap_handler(struct ctx *c, int af, void *addr, - struct tap_l4_msg *msg, int count, struct timespec *now) +int 
tcp_tap_handler(struct ctx *c, int af, void *addr, struct pool *p, + struct timespec *now) { - struct tcphdr *th = (struct tcphdr *)(pkt_buf + msg[0].pkt_buf_offset); - uint16_t len = msg[0].l4_len; struct tcp_conn *conn; + size_t optlen, len; + struct tcphdr *th; int ack_due = 0; + char *opts; + + packet_get(p, 0, 0, 0, &len); + th = packet_get(p, 0, 0, sizeof(*th), NULL); + if (!th) + return 1; + + optlen = th->doff * 4UL - sizeof(*th); + opts = packet_get(p, 0, sizeof(*th), optlen, NULL); conn = tcp_hash_lookup(c, af, addr, htons(th->source), htons(th->dest)); /* New connection from tap */ if (!conn) { if (th->syn && !th->ack) - tcp_conn_from_tap(c, af, addr, th, len, now); + tcp_conn_from_tap(c, af, addr, th, opts, optlen, now); return 1; } + trace("TCP: packet length %lu from tap for index %lu", len, conn - tc); + if (th->rst) { - tcp_conn_destroy(c, conn); - return count; + conn_event(c, conn, CLOSED); + return p->count; } if (th->ack) { @@ -2656,7 +2700,7 @@ int tcp_tap_handler(struct ctx *c, int af, void *addr, /* Establishing connection from socket */ if (conn->events & SOCK_ACCEPTED) { if (th->syn && th->ack && !th->fin) - tcp_conn_from_sock_finish(c, conn, th, len); + tcp_conn_from_sock_finish(c, conn, th, opts, optlen); else tcp_rst(c, conn); @@ -2667,7 +2711,7 @@ int tcp_tap_handler(struct ctx *c, int af, void *addr, if (conn->events & TAP_SYN_RCVD) { if (!(conn->events & TAP_SYN_ACK_SENT)) { tcp_rst(c, conn); - return count; + return p->count; } conn_event(c, conn, ESTABLISHED); @@ -2679,17 +2723,19 @@ int tcp_tap_handler(struct ctx *c, int af, void *addr, tcp_send_flag(c, conn, ACK); conn_event(c, conn, SOCK_FIN_SENT); - return count; + return p->count; } if (!th->ack) { tcp_rst(c, conn); - return count; + return p->count; } - tcp_clamp_window(c, conn, th, len, 0, 0); + tcp_clamp_window(c, conn, ntohs(th->window)); + + tcp_data_from_sock(c, conn); - if (count == 1) + if (p->count == 1) return 1; } @@ -2699,13 +2745,13 @@ int tcp_tap_handler(struct 
ctx *c, int af, void *addr, if (conn->events & TAP_FIN_RCVD) { if (conn->events & SOCK_FIN_RCVD && conn->seq_ack_from_tap == conn->seq_to_tap) - tcp_conn_destroy(c, conn); + conn_event(c, conn, CLOSED); return 1; } /* Established connections accepting data from tap */ - tcp_data_from_tap(c, conn, msg, count); + tcp_data_from_tap(c, conn, p); if (conn->seq_ack_to_tap != conn->seq_from_tap) ack_due = 1; @@ -2719,7 +2765,7 @@ int tcp_tap_handler(struct ctx *c, int af, void *addr, if (ack_due) conn_flag(c, conn, ACK_TO_TAP_DUE); - return count; + return p->count; } /** @@ -2848,27 +2894,18 @@ static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref, static void tcp_timer_handler(struct ctx *c, union epoll_ref ref) { struct tcp_conn *conn = CONN_OR_NULL(ref.r.p.tcp.tcp.index); - struct epoll_event ev = { 0 }; if (!conn) return; - if (CONN_IS_CLOSED(conn)) { - tcp_hash_remove(conn); - tcp_table_compact(c, conn); - if (conn->timer != -1) { - epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->timer, &ev); - close(conn->timer); - conn->timer = -1; - } - } else if (conn->flags & ACK_TO_TAP_DUE) { + if (conn->flags & ACK_TO_TAP_DUE) { tcp_send_flag(c, conn, ACK_IF_NEEDED); conn_flag(c, conn, ~ACK_TO_TAP_DUE); } else if (conn->flags & ACK_FROM_TAP_DUE) { if (!(conn->events & ESTABLISHED)) { debug("TCP: index %i, handshake timeout", conn - tc); tcp_rst(c, conn); - } else if (conn->events & TAP_FIN_SENT) { + } else if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED)) { debug("TCP: index %i, FIN timeout", conn - tc); tcp_rst(c, conn); } else if (conn->retrans == TCP_MAX_RETRANS) { @@ -2880,6 +2917,7 @@ static void tcp_timer_handler(struct ctx *c, union epoll_ref ref) conn->retrans++; conn->seq_to_tap = conn->seq_ack_from_tap; tcp_data_from_sock(c, conn); + tcp_timer_ctl(c, conn); } } else { struct itimerspec new = { { 0 }, { ACT_TIMEOUT, 0 } }; @@ -2929,19 +2967,22 @@ void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events, if (!(conn = 
CONN_OR_NULL(ref.r.p.tcp.tcp.index))) return; + if (conn->events == CLOSED) + return; + if (events & EPOLLERR) { tcp_rst(c, conn); return; } if ((conn->events & TAP_FIN_SENT) && (events & EPOLLHUP)) { - tcp_conn_destroy(c, conn); + conn_event(c, conn, CLOSED); return; } if (conn->events & ESTABLISHED) { if (CONN_HAS(conn, SOCK_FIN_SENT | TAP_FIN_ACKED)) - tcp_conn_destroy(c, conn); + conn_event(c, conn, CLOSED); if (events & (EPOLLRDHUP | EPOLLHUP)) conn_event(c, conn, SOCK_FIN_RCVD); @@ -3155,7 +3196,7 @@ static int tcp_sock_refill(void *arg) * * Return: 0 on success, -1 on failure */ -int tcp_sock_init(struct ctx *c, struct timespec *now) +int tcp_sock_init(struct ctx *c) { struct tcp_sock_refill_arg refill_arg = { c, 0 }; int i, port; @@ -3211,7 +3252,6 @@ int tcp_sock_init(struct ctx *c, struct timespec *now) memset(tcp_sock_init_ext, 0xff, sizeof(tcp_sock_init_ext)); memset(tcp_sock_ns, 0xff, sizeof(tcp_sock_ns)); - c->tcp.refill_ts = *now; tcp_sock_refill(&refill_arg); if (c->mode == MODE_PASTA) { @@ -3222,7 +3262,7 @@ int tcp_sock_init(struct ctx *c, struct timespec *now) refill_arg.ns = 1; NS_CALL(tcp_sock_refill, &refill_arg); - c->tcp.port_detect_ts = *now; + tcp_splice_timer(c); } return 0; @@ -3345,47 +3385,48 @@ static int tcp_port_rebind(void *arg) } /** - * tcp_timer() - Scan activity bitmap for sockets waiting for timed events + * tcp_timer() - Periodic tasks: port detection, closed connections, pool refill * @c: Execution context - * @now: Timestamp from caller + * @ts: Unused */ -void tcp_timer(struct ctx *c, struct timespec *now) +void tcp_timer(struct ctx *c, struct timespec *ts) { struct tcp_sock_refill_arg refill_arg = { c, 0 }; + struct tcp_conn *conn; - if (c->mode == MODE_PASTA) { - if (timespec_diff_ms(now, &c->tcp.port_detect_ts) > - PORT_DETECT_INTERVAL) { - struct tcp_port_detect_arg detect_arg = { c, 0 }; - struct tcp_port_rebind_arg rebind_arg = { c, 0 }; - - if (c->tcp.init_detect_ports) { - detect_arg.detect_in_ns = 0; - 
tcp_port_detect(&detect_arg); - rebind_arg.bind_in_ns = 1; - NS_CALL(tcp_port_rebind, &rebind_arg); - } + (void)ts; - if (c->tcp.ns_detect_ports) { - detect_arg.detect_in_ns = 1; - NS_CALL(tcp_port_detect, &detect_arg); - rebind_arg.bind_in_ns = 0; - tcp_port_rebind(&rebind_arg); - } + if (c->mode == MODE_PASTA) { + struct tcp_port_detect_arg detect_arg = { c, 0 }; + struct tcp_port_rebind_arg rebind_arg = { c, 0 }; + + if (c->tcp.init_detect_ports) { + detect_arg.detect_in_ns = 0; + tcp_port_detect(&detect_arg); + rebind_arg.bind_in_ns = 1; + NS_CALL(tcp_port_rebind, &rebind_arg); + } - c->tcp.port_detect_ts = *now; + if (c->tcp.ns_detect_ports) { + detect_arg.detect_in_ns = 1; + NS_CALL(tcp_port_detect, &detect_arg); + rebind_arg.bind_in_ns = 0; + tcp_port_rebind(&rebind_arg); } + } - tcp_splice_timer(c, now); + for (conn = CONN(c->tcp.conn_count - 1); conn >= tc; conn--) { + if (conn->events == CLOSED) + tcp_conn_destroy(c, conn); } - if (timespec_diff_ms(now, &c->tcp.refill_ts) > REFILL_INTERVAL) { - tcp_sock_refill(&refill_arg); - if (c->mode == MODE_PASTA) { - refill_arg.ns = 1; - if ((c->v4 && ns_sock_pool4[TCP_SOCK_POOL_TSH] < 0) || - (c->v6 && ns_sock_pool6[TCP_SOCK_POOL_TSH] < 0)) - NS_CALL(tcp_sock_refill, &refill_arg); - } + tcp_sock_refill(&refill_arg); + if (c->mode == MODE_PASTA) { + refill_arg.ns = 1; + if ((c->v4 && ns_sock_pool4[TCP_SOCK_POOL_TSH] < 0) || + (c->v6 && ns_sock_pool6[TCP_SOCK_POOL_TSH] < 0)) + NS_CALL(tcp_sock_refill, &refill_arg); + + tcp_splice_timer(c); } } diff --git a/tcp.h b/tcp.h index 109516d..cf52f32 100644 --- a/tcp.h +++ b/tcp.h @@ -6,9 +6,7 @@ #ifndef TCP_H #define TCP_H -#define REFILL_INTERVAL 1000 /* ms */ -#define PORT_DETECT_INTERVAL 1000 -#define TCP_TIMER_INTERVAL MIN(REFILL_INTERVAL, PORT_DETECT_INTERVAL) +#define TCP_TIMER_INTERVAL 1000 /* ms */ #define TCP_CONN_INDEX_BITS 17 /* 128k */ #define TCP_MAX_CONNS (1 << TCP_CONN_INDEX_BITS) @@ -20,10 +18,10 @@ struct ctx; void tcp_sock_handler(struct ctx *c, union 
epoll_ref ref, uint32_t events, struct timespec *now); -int tcp_tap_handler(struct ctx *c, int af, void *addr, - struct tap_l4_msg *msg, int count, struct timespec *now); -int tcp_sock_init(struct ctx *c, struct timespec *now); -void tcp_timer(struct ctx *c, struct timespec *now); +int tcp_tap_handler(struct ctx *c, int af, void *addr, struct pool *p, + struct timespec *now); +int tcp_sock_init(struct ctx *c); +void tcp_timer(struct ctx *c, struct timespec *ts); void tcp_defer_handler(struct ctx *c); void tcp_sock_set_bufsize(struct ctx *c, int s); @@ -64,8 +62,6 @@ union tcp_epoll_ref { * @timer_run: Timestamp of most recent timer run * @kernel_snd_wnd: Kernel reports sending window (with commit 8f7baad7f035) * @pipe_size: Size of pipes for spliced connections - * @refill_ts: Time of last refill operation for pools of sockets/pipes - * @port_detect_ts: Time of last TCP port detection/rebind, if enabled */ struct tcp_ctx { uint64_t hash_secret[2]; @@ -80,8 +76,6 @@ struct tcp_ctx { int kernel_snd_wnd; #endif size_t pipe_size; - struct timespec refill_ts; - struct timespec port_detect_ts; }; #endif /* TCP_H */ diff --git a/tcp_splice.c b/tcp_splice.c index bcafd33..0095740 100644 --- a/tcp_splice.c +++ b/tcp_splice.c @@ -51,7 +51,7 @@ #define MAX_PIPE_SIZE (2UL * 1024 * 1024) #define TCP_SPLICE_MAX_CONNS (128 * 1024) #define TCP_SPLICE_PIPE_POOL_SIZE 16 -#define REFILL_INTERVAL 1000 /* ms, refill pool of pipes */ +#define TCP_SPLICE_CONN_PRESSURE 30 /* % of splice_conn_count */ #define TCP_SPLICE_FILE_PRESSURE 30 /* % of c->nofile */ /* From tcp.c */ @@ -83,24 +83,24 @@ struct tcp_splice_conn { int pipe_b_a[2]; uint8_t events; -#define SPLICE_CLOSED 0 -#define SPLICE_CONNECT BIT(0) -#define SPLICE_ESTABLISHED BIT(1) -#define SPLICE_A_OUT_WAIT BIT(2) -#define SPLICE_B_OUT_WAIT BIT(3) -#define SPLICE_A_FIN_RCVD BIT(4) -#define SPLICE_B_FIN_RCVD BIT(5) -#define SPLICE_A_FIN_SENT BIT(6) -#define SPLICE_B_FIN_SENT BIT(7) +#define CLOSED 0 +#define CONNECT BIT(0) +#define 
ESTABLISHED BIT(1) +#define A_OUT_WAIT BIT(2) +#define B_OUT_WAIT BIT(3) +#define A_FIN_RCVD BIT(4) +#define B_FIN_RCVD BIT(5) +#define A_FIN_SENT BIT(6) +#define B_FIN_SENT BIT(7) uint8_t flags; -#define SPLICE_V6 BIT(0) -#define SPLICE_IN_EPOLL BIT(1) -#define SPLICE_RCVLOWAT_SET_A BIT(2) -#define SPLICE_RCVLOWAT_SET_B BIT(3) -#define SPLICE_RCVLOWAT_ACT_A BIT(4) -#define SPLICE_RCVLOWAT_ACT_B BIT(5) -#define SPLICE_CLOSING BIT(6) +#define SOCK_V6 BIT(0) +#define IN_EPOLL BIT(1) +#define RCVLOWAT_SET_A BIT(2) +#define RCVLOWAT_SET_B BIT(3) +#define RCVLOWAT_ACT_A BIT(4) +#define RCVLOWAT_ACT_B BIT(5) +#define CLOSING BIT(6) uint64_t a_read; uint64_t a_written; @@ -108,7 +108,7 @@ struct tcp_splice_conn { uint64_t b_written; }; -#define CONN_V6(x) (x->flags & SPLICE_V6) +#define CONN_V6(x) (x->flags & SOCK_V6) #define CONN_V4(x) (!CONN_V6(x)) #define CONN_HAS(conn, set) ((conn->events & (set)) == (set)) #define CONN(index) (tc + (index)) @@ -118,15 +118,13 @@ static struct tcp_splice_conn tc[TCP_SPLICE_MAX_CONNS]; /* Display strings for connection events */ static const char *tcp_splice_event_str[] __attribute((__unused__)) = { - "SPLICE_CONNECT", "SPLICE_ESTABLISHED", - "SPLICE_A_OUT_WAIT", "SPLICE_B_OUT_WAIT", - "SPLICE_A_FIN_RCVD", "SPLICE_B_FIN_RCVD", - "SPLICE_A_FIN_SENT", "SPLICE_B_FIN_SENT", + "CONNECT", "ESTABLISHED", "A_OUT_WAIT", "B_OUT_WAIT", + "A_FIN_RCVD", "B_FIN_RCVD", "A_FIN_SENT", "B_FIN_SENT", }; /* Display strings for connection flags */ static const char *tcp_splice_flag_str[] __attribute((__unused__)) = { - "V6", "IN_EPOLL", "RCVLOWAT_SET_A", "RCVLOWAT_SET_B", + "SOCK_V6", "IN_EPOLL", "RCVLOWAT_SET_A", "RCVLOWAT_SET_B", "RCVLOWAT_ACT_A", "RCVLOWAT_ACT_B", "CLOSING", }; @@ -141,23 +139,27 @@ static void tcp_splice_conn_epoll_events(uint16_t events, { *a = *b = 0; - if (events & SPLICE_CLOSED) + if (events & CLOSED) return; - if (events & SPLICE_ESTABLISHED) - *a = *b = EPOLLIN | EPOLLRDHUP; - else if (events & SPLICE_CONNECT) + if (events & 
ESTABLISHED) { + if (!(events & B_FIN_SENT)) + *a = EPOLLIN | EPOLLRDHUP; + if (!(events & A_FIN_SENT)) + *b = EPOLLIN | EPOLLRDHUP; + } else if (events & CONNECT) { *b = EPOLLOUT; + } - *a |= (events & SPLICE_A_OUT_WAIT) ? EPOLLOUT : 0; - *b |= (events & SPLICE_B_OUT_WAIT) ? EPOLLOUT : 0; + *a |= (events & A_OUT_WAIT) ? EPOLLOUT : 0; + *b |= (events & B_OUT_WAIT) ? EPOLLOUT : 0; } static void tcp_splice_destroy(struct ctx *c, struct tcp_splice_conn *conn); static int tcp_splice_epoll_ctl(struct ctx *c, struct tcp_splice_conn *conn); /** - * conn_flag_do() - Set/unset given flag, log, update epoll on SPLICE_CLOSING + * conn_flag_do() - Set/unset given flag, log, update epoll on CLOSING flag * @c: Execution context * @conn: Connection pointer * @flag: Flag to set, or ~flag to unset @@ -181,7 +183,7 @@ static void conn_flag_do(struct ctx *c, struct tcp_splice_conn *conn, tcp_splice_flag_str[fls(flag)]); } - if (flag == SPLICE_CLOSING) + if (flag == CLOSING) tcp_splice_epoll_ctl(c, conn); } @@ -201,7 +203,7 @@ static void conn_flag_do(struct ctx *c, struct tcp_splice_conn *conn, */ static int tcp_splice_epoll_ctl(struct ctx *c, struct tcp_splice_conn *conn) { - int m = (conn->flags & SPLICE_IN_EPOLL) ? EPOLL_CTL_MOD : EPOLL_CTL_ADD; + int m = (conn->flags & IN_EPOLL) ? 
EPOLL_CTL_MOD : EPOLL_CTL_ADD; union epoll_ref ref_a = { .r.proto = IPPROTO_TCP, .r.s = conn->a, .r.p.tcp.tcp.splice = 1, .r.p.tcp.tcp.index = conn - tc, @@ -214,15 +216,8 @@ static int tcp_splice_epoll_ctl(struct ctx *c, struct tcp_splice_conn *conn) struct epoll_event ev_b = { .data.u64 = ref_b.u64 }; uint32_t events_a, events_b; - if (conn->flags & SPLICE_CLOSING) { - if (conn->flags & SPLICE_IN_EPOLL) - epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->a, &ev_a); - - if (conn->events & SPLICE_CONNECT) - epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->b, &ev_b); - - return 0; - } + if (conn->flags & CLOSING) + goto delete; tcp_splice_conn_epoll_events(conn->events, &events_a, &events_b); ev_a.events = events_a; @@ -230,13 +225,13 @@ static int tcp_splice_epoll_ctl(struct ctx *c, struct tcp_splice_conn *conn) if (epoll_ctl(c->epollfd, m, conn->a, &ev_a) || epoll_ctl(c->epollfd, m, conn->b, &ev_b)) - goto err; + goto delete; - conn->flags |= SPLICE_IN_EPOLL; /* No need to log this */ + conn->flags |= IN_EPOLL; /* No need to log this */ return 0; -err: +delete: epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->a, &ev_a); epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->b, &ev_b); return -errno; @@ -251,12 +246,6 @@ err: static void conn_event_do(struct ctx *c, struct tcp_splice_conn *conn, unsigned long event) { - if (event == SPLICE_CLOSED) { - conn->events = SPLICE_CLOSED; - debug("TCP (spliced): index %i, CLOSED", conn - tc); - return; - } - if (event & (event - 1)) { if (!(conn->events & ~event)) return; @@ -274,7 +263,7 @@ static void conn_event_do(struct ctx *c, struct tcp_splice_conn *conn, } if (tcp_splice_epoll_ctl(c, conn)) - conn_flag(c, conn, SPLICE_CLOSING); + conn_flag(c, conn, CLOSING); } #define conn_event(c, conn, event) \ @@ -304,22 +293,25 @@ static void tcp_table_splice_compact(struct ctx *c, memcpy(hole, move, sizeof(*hole)); move->a = move->b = -1; - move->flags = move->events = 0; move->a_read = move->a_written = move->b_read = move->b_written = 0; + 
move->pipe_a_b[0] = move->pipe_a_b[1] = -1; + move->pipe_b_a[0] = move->pipe_b_a[1] = -1; + move->flags = move->events = 0; debug("TCP (spliced): index %i moved to %i", move - tc, hole - tc); + tcp_splice_epoll_ctl(c, hole); if (tcp_splice_epoll_ctl(c, hole)) - conn_flag(c, hole, SPLICE_CLOSING); + conn_flag(c, hole, CLOSING); } /** - * tcp_splice_destroy() - Close spliced connection and pipes, drop from epoll + * tcp_splice_destroy() - Close spliced connection and pipes, clear * @c: Execution context * @conn: Connection pointer */ static void tcp_splice_destroy(struct ctx *c, struct tcp_splice_conn *conn) { - if (conn->events & SPLICE_ESTABLISHED) { + if (conn->events & ESTABLISHED) { /* Flushing might need to block: don't recycle them. */ if (conn->pipe_a_b[0] != -1) { close(conn->pipe_a_b[0]); @@ -333,18 +325,19 @@ static void tcp_splice_destroy(struct ctx *c, struct tcp_splice_conn *conn) } } - if (conn->events & SPLICE_CONNECT) { + if (conn->events & CONNECT) { close(conn->b); conn->b = -1; } - conn_event(c, conn, SPLICE_CLOSED); - close(conn->a); conn->a = -1; - conn->flags = 0; conn->a_read = conn->a_written = conn->b_read = conn->b_written = 0; + conn->events = CLOSED; + conn->flags = 0; + debug("TCP (spliced): index %i, CLOSED", conn - tc); + tcp_table_splice_compact(c, conn); } @@ -364,7 +357,7 @@ static int tcp_splice_connect_finish(struct ctx *c, conn->pipe_a_b[1] = conn->pipe_b_a[1] = -1; for (i = 0; i < TCP_SPLICE_PIPE_POOL_SIZE; i++) { - if (splice_pipe_pool[i][0][0] > 0) { + if (splice_pipe_pool[i][0][0] >= 0) { SWAP(conn->pipe_a_b[0], splice_pipe_pool[i][0][0]); SWAP(conn->pipe_a_b[1], splice_pipe_pool[i][0][1]); @@ -377,7 +370,7 @@ static int tcp_splice_connect_finish(struct ctx *c, if (conn->pipe_a_b[0] < 0) { if (pipe2(conn->pipe_a_b, O_NONBLOCK) || pipe2(conn->pipe_b_a, O_NONBLOCK)) { - conn_flag(c, conn, SPLICE_CLOSING); + conn_flag(c, conn, CLOSING); return -EIO; } @@ -385,8 +378,8 @@ static int tcp_splice_connect_finish(struct ctx *c, 
fcntl(conn->pipe_b_a[0], F_SETPIPE_SZ, c->tcp.pipe_size); } - if (!(conn->events & SPLICE_ESTABLISHED)) - conn_event(c, conn, SPLICE_ESTABLISHED); + if (!(conn->events & ESTABLISHED)) + conn_event(c, conn, ESTABLISHED); return 0; } @@ -450,9 +443,9 @@ static int tcp_splice_connect(struct ctx *c, struct tcp_splice_conn *conn, close(sock_conn); return ret; } - conn_event(c, conn, SPLICE_CONNECT); + conn_event(c, conn, CONNECT); } else { - conn_event(c, conn, SPLICE_ESTABLISHED); + conn_event(c, conn, ESTABLISHED); return tcp_splice_connect_finish(c, conn); } @@ -575,20 +568,23 @@ void tcp_sock_handler_splice(struct ctx *c, union epoll_ref ref, conn = CONN(c->tcp.splice_conn_count++); conn->a = s; - conn->flags = ref.r.p.tcp.tcp.v6 ? SPLICE_V6 : 0; + conn->flags = ref.r.p.tcp.tcp.v6 ? SOCK_V6 : 0; if (tcp_splice_new(c, conn, ref.r.p.tcp.tcp.index)) - conn_flag(c, conn, SPLICE_CLOSING); + conn_flag(c, conn, CLOSING); return; } conn = CONN(ref.r.p.tcp.tcp.index); - if (events & EPOLLERR || events & EPOLLHUP) + if (conn->events == CLOSED) + return; + + if (events & EPOLLERR) goto close; - if (conn->events == SPLICE_CONNECT) { + if (conn->events == CONNECT) { if (!(events & EPOLLOUT)) goto close; if (tcp_splice_connect_finish(c, conn)) @@ -597,9 +593,9 @@ void tcp_sock_handler_splice(struct ctx *c, union epoll_ref ref, if (events & EPOLLOUT) { if (ref.r.s == conn->a) - conn_event(c, conn, ~SPLICE_A_OUT_WAIT); + conn_event(c, conn, ~A_OUT_WAIT); else - conn_event(c, conn, ~SPLICE_B_OUT_WAIT); + conn_event(c, conn, ~B_OUT_WAIT); tcp_splice_dir(conn, ref.r.s, 1, &from, &to, &pipes); } else { @@ -608,9 +604,16 @@ void tcp_sock_handler_splice(struct ctx *c, union epoll_ref ref, if (events & EPOLLRDHUP) { if (ref.r.s == conn->a) - conn_event(c, conn, SPLICE_A_FIN_RCVD); + conn_event(c, conn, A_FIN_RCVD); + else + conn_event(c, conn, B_FIN_RCVD); + } + + if (events & EPOLLHUP) { + if (ref.r.s == conn->a) + conn_event(c, conn, A_FIN_SENT); /* Fake, but implied */ else - 
conn_event(c, conn, SPLICE_B_FIN_RCVD); + conn_event(c, conn, B_FIN_SENT); } swap: @@ -620,13 +623,13 @@ swap: if (from == conn->a) { seq_read = &conn->a_read; seq_write = &conn->a_written; - lowat_set_flag = SPLICE_RCVLOWAT_SET_A; - lowat_act_flag = SPLICE_RCVLOWAT_ACT_A; + lowat_set_flag = RCVLOWAT_SET_A; + lowat_act_flag = RCVLOWAT_ACT_A; } else { seq_read = &conn->b_read; seq_write = &conn->b_written; - lowat_set_flag = SPLICE_RCVLOWAT_SET_B; - lowat_act_flag = SPLICE_RCVLOWAT_ACT_B; + lowat_set_flag = RCVLOWAT_SET_B; + lowat_act_flag = RCVLOWAT_ACT_B; } while (1) { @@ -636,6 +639,7 @@ swap: retry: readlen = splice(from, NULL, pipes[1], NULL, c->tcp.pipe_size, SPLICE_F_MOVE | SPLICE_F_NONBLOCK); + trace("TCP (spliced): %li from read-side call", readlen); if (readlen < 0) { if (errno == EINTR) goto retry; @@ -660,6 +664,8 @@ retry: eintr: written = splice(pipes[0], NULL, to, NULL, to_write, SPLICE_F_MOVE | more | SPLICE_F_NONBLOCK); + trace("TCP (spliced): %li from write-side call (passed %lu)", + written, to_write); /* Most common case: skip updating counters. 
*/ if (readlen > 0 && readlen == written) { @@ -697,9 +703,9 @@ eintr: goto retry; if (to == conn->a) - conn_event(c, conn, SPLICE_A_OUT_WAIT); + conn_event(c, conn, A_OUT_WAIT); else - conn_event(c, conn, SPLICE_B_OUT_WAIT); + conn_event(c, conn, B_OUT_WAIT); break; } @@ -715,23 +721,21 @@ eintr: break; } - if ( (conn->events & SPLICE_A_FIN_RCVD) && - !(conn->events & SPLICE_B_FIN_SENT)) { - if (*seq_read == *seq_write) { + if ((conn->events & A_FIN_RCVD) && !(conn->events & B_FIN_SENT)) { + if (*seq_read == *seq_write && eof) { shutdown(conn->b, SHUT_WR); - conn_event(c, conn, SPLICE_B_FIN_SENT); + conn_event(c, conn, B_FIN_SENT); } } - if ( (conn->events & SPLICE_B_FIN_RCVD) && - !(conn->events & SPLICE_A_FIN_SENT)) { - if (*seq_read == *seq_write) { + if ((conn->events & B_FIN_RCVD) && !(conn->events & A_FIN_SENT)) { + if (*seq_read == *seq_write && eof) { shutdown(conn->a, SHUT_WR); - conn_event(c, conn, SPLICE_A_FIN_SENT); + conn_event(c, conn, A_FIN_SENT); } } - if (CONN_HAS(conn, SPLICE_A_FIN_SENT | SPLICE_B_FIN_SENT)) + if (CONN_HAS(conn, A_FIN_SENT | B_FIN_SENT)) goto close; if ((events & (EPOLLIN | EPOLLOUT)) == (EPOLLIN | EPOLLOUT)) { @@ -746,10 +750,13 @@ eintr: goto swap; } + if (events & EPOLLHUP) + goto close; + return; close: - conn_flag(c, conn, SPLICE_CLOSING); + conn_flag(c, conn, CLOSING); } /** @@ -829,38 +836,36 @@ void tcp_splice_init(struct ctx *c) /** * tcp_splice_timer() - Timer for spliced connections * @c: Execution context - * @now: Current timestamp */ -void tcp_splice_timer(struct ctx *c, struct timespec *now) +void tcp_splice_timer(struct ctx *c) { struct tcp_splice_conn *conn; for (conn = CONN(c->tcp.splice_conn_count - 1); conn >= tc; conn--) { - if (conn->flags & SPLICE_CLOSING) { + if (conn->flags & CLOSING) { tcp_splice_destroy(c, conn); - continue; + return; } - if ( (conn->flags & SPLICE_RCVLOWAT_SET_A) && - !(conn->flags & SPLICE_RCVLOWAT_ACT_A)) { + if ( (conn->flags & RCVLOWAT_SET_A) && + !(conn->flags & RCVLOWAT_ACT_A)) { 
setsockopt(conn->a, SOL_SOCKET, SO_RCVLOWAT, &((int){ 1 }), sizeof(int)); - conn_flag(c, conn, ~SPLICE_RCVLOWAT_SET_A); + conn_flag(c, conn, ~RCVLOWAT_SET_A); } - if ( (conn->flags & SPLICE_RCVLOWAT_SET_B) && - !(conn->flags & SPLICE_RCVLOWAT_ACT_B)) { + if ( (conn->flags & RCVLOWAT_SET_B) && + !(conn->flags & RCVLOWAT_ACT_B)) { setsockopt(conn->b, SOL_SOCKET, SO_RCVLOWAT, &((int){ 1 }), sizeof(int)); - conn_flag(c, conn, ~SPLICE_RCVLOWAT_SET_B); + conn_flag(c, conn, ~RCVLOWAT_SET_B); } - conn_flag(c, conn, ~SPLICE_RCVLOWAT_ACT_A); - conn_flag(c, conn, ~SPLICE_RCVLOWAT_ACT_B); + conn_flag(c, conn, ~RCVLOWAT_ACT_A); + conn_flag(c, conn, ~RCVLOWAT_ACT_B); } - if (timespec_diff_ms(now, &c->tcp.refill_ts) > REFILL_INTERVAL) - tcp_splice_pipe_refill(c); + tcp_splice_pipe_refill(c); } /** @@ -869,14 +874,15 @@ void tcp_splice_timer(struct ctx *c, struct timespec *now) */ void tcp_splice_defer_handler(struct ctx *c) { + int max_conns = c->tcp.conn_count / 100 * TCP_SPLICE_CONN_PRESSURE; int max_files = c->nofile / 100 * TCP_SPLICE_FILE_PRESSURE; struct tcp_splice_conn *conn; - if (c->tcp.splice_conn_count * 6 < max_files) + if (c->tcp.splice_conn_count < MIN(max_files / 6, max_conns)) return; for (conn = CONN(c->tcp.splice_conn_count - 1); conn >= tc; conn--) { - if (conn->flags & SPLICE_CLOSING) + if (conn->flags & CLOSING) tcp_splice_destroy(c, conn); } } diff --git a/tcp_splice.h b/tcp_splice.h index b744ba7..f7c2f86 100644 --- a/tcp_splice.h +++ b/tcp_splice.h @@ -11,5 +11,5 @@ void tcp_sock_handler_splice(struct ctx *c, union epoll_ref ref, uint32_t events); void tcp_splice_destroy(struct ctx *c, struct tcp_splice_conn *conn); void tcp_splice_init(struct ctx *c); -void tcp_splice_timer(struct ctx *c, struct timespec *now); +void tcp_splice_timer(struct ctx *c); void tcp_splice_defer_handler(struct ctx *c); diff --git a/udp.c b/udp.c index e22f3ac..9032e47 100644 --- a/udp.c +++ b/udp.c @@ -951,35 +951,35 @@ void udp_sock_handler(struct ctx *c, union epoll_ref ref, 
uint32_t events, * @c: Execution context * @af: Address family, AF_INET or AF_INET6 * @addr: Destination address - * @msg: Input messages - * @count: Message count + * @p: Pool of UDP packets, with UDP headers * @now: Current timestamp * * Return: count of consumed packets * * #syscalls sendmmsg */ -int udp_tap_handler(struct ctx *c, int af, void *addr, - struct tap_l4_msg *msg, int count, struct timespec *now) +int udp_tap_handler(struct ctx *c, int af, void *addr, struct pool *p, + struct timespec *now) { - /* The caller already checks that all the messages have the same source - * and destination, so we can just take those from the first message. - */ - struct udphdr *uh = (struct udphdr *)(pkt_buf + msg[0].pkt_buf_offset); struct mmsghdr mm[UIO_MAXIOV] = { 0 }; struct iovec m[UIO_MAXIOV]; struct sockaddr_in6 s_in6; struct sockaddr_in s_in; struct sockaddr *sa; + int i, s, count = 0; in_port_t src, dst; + struct udphdr *uh; socklen_t sl; - int i, s; (void)c; - if (msg[0].l4_len < sizeof(*uh)) + uh = packet_get(p, 0, 0, sizeof(*uh), NULL); + if (!uh) return 1; + /* The caller already checks that all the messages have the same source + * and destination, so we can just take those from the first message. 
+ */ src = ntohs(uh->source); dst = ntohs(uh->dest); @@ -998,8 +998,8 @@ int udp_tap_handler(struct ctx *c, int af, void *addr, .udp.port = src }; s = sock_l4(c, AF_INET, IPPROTO_UDP, src, 0, uref.u32); - if (s <= 0) - return count; + if (s < 0) + return p->count; udp_tap_map[V4][src].sock = s; bitmap_set(udp_act[V4][UDP_ACT_TAP], src); @@ -1050,8 +1050,8 @@ int udp_tap_handler(struct ctx *c, int af, void *addr, s = sock_l4(c, AF_INET6, IPPROTO_UDP, src, bind_to, uref.u32); - if (s <= 0) - return count; + if (s < 0) + return p->count; udp_tap_map[V6][src].sock = s; bitmap_set(udp_act[V6][UDP_ACT_TAP], src); @@ -1060,18 +1060,26 @@ int udp_tap_handler(struct ctx *c, int af, void *addr, udp_tap_map[V6][src].ts = now->tv_sec; } - for (i = 0; i < count; i++) { + for (i = 0; i < (int)p->count; i++) { struct udphdr *uh_send; + size_t len; + + uh_send = packet_get(p, i, 0, sizeof(*uh), &len); + if (!uh_send) + return p->count; + if (!len) + continue; - uh_send = (struct udphdr *)(msg[i].pkt_buf_offset + pkt_buf); m[i].iov_base = (char *)(uh_send + 1); - m[i].iov_len = msg[i].l4_len - sizeof(*uh_send); + m[i].iov_len = len; mm[i].msg_hdr.msg_name = sa; mm[i].msg_hdr.msg_namelen = sl; mm[i].msg_hdr.msg_iov = m + i; mm[i].msg_hdr.msg_iovlen = 1; + + count++; } count = sendmmsg(s, mm, count, MSG_NOSIGNAL); @@ -1172,13 +1180,11 @@ static void udp_splice_iov_init(void) * * Return: 0 on success, -1 on failure */ -int udp_sock_init(struct ctx *c, struct timespec *now) +int udp_sock_init(struct ctx *c) { union udp_epoll_ref uref = { .udp.bound = 1 }; int dst, s; - (void)now; - for (dst = 0; dst < USHRT_MAX; dst++) { if (!bitmap_isset(c->udp.port_to_tap, dst)) continue; diff --git a/udp.h b/udp.h index 2c9066b..ce40b07 100644 --- a/udp.h +++ b/udp.h @@ -10,9 +10,9 @@ void udp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events, struct timespec *now); -int udp_tap_handler(struct ctx *c, int af, void *addr, - struct tap_l4_msg *msg, int count, struct timespec *now); -int 
udp_sock_init(struct ctx *c, struct timespec *now); +int udp_tap_handler(struct ctx *c, int af, void *addr, struct pool *p, + struct timespec *now); +int udp_sock_init(struct ctx *c); void udp_timer(struct ctx *c, struct timespec *ts); void udp_update_l2_buf(unsigned char *eth_d, unsigned char *eth_s, const uint32_t *ip_da); diff --git a/util.c b/util.c index 0adc6b9..f16cd61 100644 --- a/util.c +++ b/util.c @@ -38,6 +38,7 @@ #include "util.h" #include "passt.h" +#include "packet.h" /* For __openlog() and __setlogmask() wrappers, and passt_vsyslog() */ static int log_mask; @@ -156,46 +157,59 @@ void passt_vsyslog(int pri, const char *format, va_list ap) send(log_sock, buf, n, 0); } +#define IPV6_NH_OPT(nh) \ + ((nh) == 0 || (nh) == 43 || (nh) == 44 || (nh) == 50 || \ + (nh) == 51 || (nh) == 60 || (nh) == 135 || (nh) == 139 || \ + (nh) == 140 || (nh) == 253 || (nh) == 254) + /** * ipv6_l4hdr() - Find pointer to L4 header in IPv6 packet and extract protocol - * @ip6h: IPv6 header + * @p: Packet pool, packet number @index has IPv6 header at @offset + * @index: Index of packet in pool + * @offset: Pre-calculated IPv6 header offset * @proto: Filled with L4 protocol number + * @dlen: Data length (payload excluding header extensions), set on return * * Return: pointer to L4 header, NULL if not found */ -char *ipv6_l4hdr(struct ipv6hdr *ip6h, uint8_t *proto) +char *ipv6_l4hdr(struct pool *p, int index, size_t offset, uint8_t *proto, + size_t *dlen) { - int offset, len, hdrlen; struct ipv6_opt_hdr *o; + struct ipv6hdr *ip6h; + char *base; + int hdrlen; uint8_t nh; - len = ntohs(ip6h->payload_len); - offset = 0; + base = packet_get(p, index, 0, 0, NULL); + ip6h = packet_get(p, index, offset, sizeof(*ip6h), dlen); + if (!ip6h) + return NULL; - while (offset < len) { - if (!offset) { - nh = ip6h->nexthdr; - hdrlen = sizeof(struct ipv6hdr); - } else { - o = (struct ipv6_opt_hdr *)(((char *)ip6h) + offset); - nh = o->nexthdr; - hdrlen = (o->hdrlen + 1) * 8; - } + offset += 
sizeof(*ip6h); - if (nh == 59) - return NULL; + nh = ip6h->nexthdr; + if (!IPV6_NH_OPT(nh)) + goto found; + + while ((o = packet_get_try(p, index, offset, sizeof(*o), dlen))) { + nh = o->nexthdr; + hdrlen = (o->hdrlen + 1) * 8; - if (nh == 0 || nh == 43 || nh == 44 || nh == 50 || - nh == 51 || nh == 60 || nh == 135 || nh == 139 || - nh == 140 || nh == 253 || nh == 254) { + if (IPV6_NH_OPT(nh)) offset += hdrlen; - } else { - *proto = nh; - return (char *)(ip6h + 1) + offset; - } + else + goto found; } return NULL; + +found: + if (nh == 59) + return NULL; + + *proto = nh; + return base + offset; } /** diff --git a/util.h b/util.h index 3073f58..073a913 100644 --- a/util.h +++ b/util.h @@ -153,6 +153,8 @@ enum { #include <limits.h> #include <stdarg.h> +#include "packet.h" + enum bind_type { BIND_ANY = 0, BIND_LOOPBACK, @@ -194,7 +196,8 @@ __attribute__ ((weak)) int ffsl(long int i) { return __builtin_ffsl(i); } void __openlog(const char *ident, int option, int facility); void passt_vsyslog(int pri, const char *format, va_list ap); void __setlogmask(int mask); -char *ipv6_l4hdr(struct ipv6hdr *ip6h, uint8_t *proto); +char *ipv6_l4hdr(struct pool *p, int index, size_t offset, uint8_t *proto, + size_t *dlen); int sock_l4(struct ctx *c, int af, uint8_t proto, uint16_t port, enum bind_type bind_addr, uint32_t data); void sock_probe_mem(struct ctx *c); -- 2.35.1