On Fri, 7 Oct 2022 19:57:19 +1100 David Gibson <david(a)gibson.dropbear.id.au> wrote:On Fri, Oct 07, 2022 at 10:11:45AM +0200, Stefano Brivio wrote:Whoops, it actually is (of course, it's powers of 10!). I'll change this to (1024 * 1024), even if it doesn't really matter for fallocate() purposes. -- StefanoOn Fri, 7 Oct 2022 17:51:52 +1100 David Gibson <david(a)gibson.dropbear.id.au> wrote:Duh, sorry. I completely missed those fcntl()s.On Fri, Oct 07, 2022 at 02:47:38AM +0200, Stefano Brivio wrote:Yeah :( I started having thoughts about adding some granularity to seccomp profiles based on what arguments are passed, but it still looked a bit overkill just for this.In some environments, such as KubeVirt pods, we might not have a system logger available. We could choose to run in foreground, but this takes away the convenient synchronisation mechanism derived from forking to background when interfaces are ready. Add optional logging to file with -l/--log-file and --log-size. Unfortunately, this means we need to duplicate features that are more appropriately implemented by a system logger, such as rotation. Keep that reasonably simple, by using fallocate() with range collapsing where supported (Linux kernel >= 3.15, extent-based ext4 and XFS) and falling back to an unsophisticated block-by-block moving of entries toward the beginning of the file once we reach the (mandatory) size limit. While at it, clarify the role of LOG_EMERG in passt.c. Signed-off-by: Stefano Brivio <sbrivio(a)redhat.com> --- Makefile | 2 +- README.md | 2 +- conf.c | 49 ++++++++++- log.c | 251 ++++++++++++++++++++++++++++++++++++++++++++++++++---- log.h | 6 ++ passt.1 | 14 ++- passt.c | 1 + 7 files changed, 300 insertions(+), 25 deletions(-) diff --git a/Makefile b/Makefile index 74bbfeb..fafb024 100644 --- a/Makefile +++ b/Makefile @@ -41,7 +41,7 @@ SRCS = $(PASST_SRCS) $(QRAP_SRCS) MANPAGES = passt.1 pasta.1 qrap.1 PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h icmp.h \ - isolation.h lineread.h ndp.h netlink.h packet.h passt.h pasta.h \ + isolation.h lineread.h log.h ndp.h netlink.h packet.h passt.h pasta.h \ pcap.h port_fwd.h siphash.h tap.h tcp.h tcp_splice.h udp.h util.h HEADERS = $(PASST_HEADERS) seccomp.h diff --git a/README.md b/README.md index 2c522b2..3dc4fc5 100644 --- a/README.md +++ b/README.md @@ -289,7 +289,7 @@ speeding up local connections, and usually requiring NAT. _pasta_: * ✅ all capabilities dropped, other than `CAP_NET_BIND_SERVICE` (if granted) * ✅ with default options, user, mount, IPC, UTS, PID namespaces are detached * ✅ no external dependencies (other than a standard C library) -* ✅ restrictive seccomp profiles (26 syscalls allowed for _passt_, 40 for +* ✅ restrictive seccomp profiles (30 syscalls allowed for _passt_, 41 for _pasta_ on x86_64)That's a pity, but I guess we need it.?* ✅ examples of [AppArmor](/passt/tree/contrib/apparmor) and [SELinux](/passt/tree/contrib/selinux) profiles available diff --git a/conf.c b/conf.c index e9cd4a7..f22940b 100644 --- a/conf.c +++ b/conf.c @@ -633,6 +633,9 @@ static void usage(const char *name) info( " default: run in background if started from a TTY"); info( " -e, --stderr Log to stderr too"); info( " default: log to system logger only if started from a TTY"); + info( " -l, --log-file PATH Log (only) to given file"); + info( " --log-size BYTES Maximum size of log file"); + info( " default: 1 MiB"); info( " --runas UID|UID:GID Run as given UID, GID, which can be"); info( " numeric, or login and group names"); info( " default: drop to user \"nobody\""); @@ -994,6 +997,7 @@ void conf(struct ctx *c, int argc, char **argv) {"quiet", no_argument, NULL, 'q' }, {"foreground", no_argument, NULL, 'f' }, {"stderr", no_argument, NULL, 'e' }, + {"log-file", required_argument, NULL, 'l' }, {"help", no_argument, NULL, 'h' }, {"socket", required_argument, NULL, 's' }, {"ns-ifname", required_argument, NULL, 'I' }, @@ -1034,26 +1038,28 @@ void conf(struct ctx *c, int argc, char **argv) {"no-netns-quit", no_argument, NULL, 10 }, {"trace", no_argument, NULL, 11 }, {"runas", required_argument, NULL, 12 }, + {"log-size", required_argument, NULL, 13 }, { 0 }, }; struct get_bound_ports_ns_arg ns_ports_arg = { .c = c }; char userns[PATH_MAX] = { 0 }, netns[PATH_MAX] = { 0 }; bool v4_only = false, v6_only = false; + char *runas = NULL, *logfile = NULL; struct in6_addr *dns6 = c->ip6.dns; struct fqdn *dnss = c->dns_search; uint32_t *dns4 = c->ip4.dns; int name, ret, mask, b, i; const char *optstring; unsigned int ifi = 0; - char *runas = NULL; + size_t logsize = 0; uid_t uid; gid_t gid; if (c->mode == MODE_PASTA) { c->no_dhcp_dns = c->no_dhcp_dns_search = 1; - optstring = "dqfehI:p:P:m:a:n:M:g:i:D:S:46t:u:T:U:"; + optstring = "dqfel:hI:p:P:m:a:n:M:g:i:D:S:46t:u:T:U:"; } else { - optstring = "dqfehs:p:P:m:a:n:M:g:i:D:S:46t:u:"; + optstring = "dqfel:hs:p:P:m:a:n:M:g:i:D:S:46t:u:"; } c->tcp.fwd_in.mode = c->tcp.fwd_out.mode = 0; @@ -1177,6 +1183,20 @@ void conf(struct ctx *c, int argc, char **argv) runas = optarg; break; + case 13: + if (logsize) { + err("Multiple --log-size options given"); + usage(argv[0]); + } + + errno = 0; + logsize = strtol(optarg, NULL, 0); + + if (logsize < LOGFILE_SIZE_MIN || errno) { + err("Invalid --log-size: %s", optarg); + usage(argv[0]); + } + break; case 'd': if (c->debug) { err("Multiple --debug options given"); @@ -1192,6 +1212,11 @@ void conf(struct ctx *c, int argc, char **argv) c->foreground = 1; break; case 'e': + if (logfile) { + err("Can't log to both file and stderr"); + usage(argv[0]); + } + if (c->stderr) { err("Multiple --stderr options given"); usage(argv[0]); @@ -1199,6 +1224,19 @@ void conf(struct ctx *c, int argc, char **argv) c->stderr = 1; break; + case 'l': + if (c->stderr) { + err("Can't log to both stderr and file"); + usage(argv[0]); + } + + if (logfile) { + err("Multiple --log-file options given"); + usage(argv[0]); + } + + logfile = optarg; + break; case 'q': if (c->quiet) { err("Multiple --quiet options given"); @@ -1460,6 +1498,11 @@ void conf(struct ctx *c, int argc, char **argv) if (ret) usage(argv[0]); + if (logfile) { + logfile_init(c->mode == MODE_PASST ? "passt" : "pasta", + logfile, logsize); + } + if (c->mode == MODE_PASTA) { if (conf_pasta_ns(&netns_only, userns, netns, optind, argc, argv) < 0) diff --git a/log.c b/log.c index 54a7cbb..1e405e3 100644 --- a/log.c +++ b/log.c @@ -12,7 +12,12 @@ * Author: Stefano Brivio <sbrivio(a)redhat.com> */ +#include <arpa/inet.h> +#include <limits.h> +#include <errno.h> +#include <fcntl.h> #include <stdio.h> +#include <stdint.h> #include <stdlib.h> #include <unistd.h> #include <string.h> @@ -20,36 +25,49 @@ #include <syslog.h> #include <stdarg.h> #include <sys/socket.h> -#include <sys/un.h> #include "log.h" +#include "util.h" +#include "passt.h" -/* For __openlog() and __setlogmask() wrappers, and passt_vsyslog() */ -static int log_mask; -static int log_sock = -1; -static char log_ident[BUFSIZ]; -static int log_opt; -static time_t log_debug_start; -int log_trace; +static int log_sock = -1; /* Optional socket to system logger */ +static char log_ident[BUFSIZ]; /* Identifier string for openlog() */ +static int log_mask; /* Current log priority mask */ +static int log_opt; /* Options for openlog() */ + +static int log_file = -1; /* Optional log file descriptor */ +static size_t log_size; /* Maximum log file size in bytes */ +static size_t log_written; /* Currently used bytes in log file */ +static size_t log_cut_size; /* Bytes to cut at start on rotation */ +static char log_header[BUFSIZ]; /* File header, written back on cuts */ + +static time_t log_start; /* Start timestamp */ +int log_trace; /* --trace mode enabled */ #define logfn(name, level) \ void name(const char *format, ...) { \ struct timespec tp; \ va_list args; \ \ - if (setlogmask(0) & LOG_MASK(LOG_DEBUG)) { \ + if (setlogmask(0) & LOG_MASK(LOG_DEBUG) && log_file == -1) { \ clock_gettime(CLOCK_REALTIME, &tp); \ fprintf(stderr, "%li.%04li: ", \ - tp.tv_sec - log_debug_start, \ + tp.tv_sec - log_start, \ tp.tv_nsec / (100L * 1000)); \ - } else { \ + } \ + \ + if ((LOG_MASK(LOG_PRI(level)) & log_mask) || \ + setlogmask(0) == LOG_MASK(LOG_EMERG)) { \ va_start(args, format); \ - passt_vsyslog(level, format, args); \ + if (log_file != -1) \ + logfile_write(level, format, args); \ + else if (!(setlogmask(0) & LOG_MASK(LOG_DEBUG))) \ + passt_vsyslog(level, format, args); \ va_end(args); \ } \ \ - if (setlogmask(0) & LOG_MASK(LOG_DEBUG) || \ - setlogmask(0) == LOG_MASK(LOG_EMERG)) { \ + if ((setlogmask(0) & LOG_MASK(LOG_DEBUG) || \ + setlogmask(0) == LOG_MASK(LOG_EMERG)) && log_file == -1) { \ va_start(args, format); \ (void)vfprintf(stderr, format, args); \ va_end(args); \ @@ -58,6 +76,16 @@ void name(const char *format, ...) { \ } \ } +/* Prefixes for log file messages, indexed by priority */ +const char *logfile_prefix[] = { + NULL, NULL, NULL, /* Unused: LOG_EMERG, LOG_ALERT, LOG_CRIT */ + "ERROR: ", + "WARNING: ", + NULL, /* Unused: LOG_NOTICE */ + "info: ", + " ", /* LOG_DEBUG */ +}; + logfn(err, LOG_ERR) logfn(warn, LOG_WARNING) logfn(info, LOG_INFO) @@ -79,7 +107,7 @@ void __openlog(const char *ident, int option, int facility) struct timespec tp; clock_gettime(CLOCK_REALTIME, &tp); - log_debug_start = tp.tv_sec; + log_start = tp.tv_sec; if (log_sock < 0) { struct sockaddr_un a = { .sun_family = AF_UNIX, }; @@ -124,9 +152,6 @@ void passt_vsyslog(int pri, const char *format, va_list ap) char buf[BUFSIZ]; int n; - if (!(LOG_MASK(LOG_PRI(pri)) & log_mask)) - return; - /* Send without name and timestamp, the system logger should add them */ n = snprintf(buf, BUFSIZ, "<%i> ", pri); @@ -141,3 +166,193 @@ void passt_vsyslog(int pri, const char *format, va_list ap) if (send(log_sock, buf, n, 0) != n) fprintf(stderr, "Failed to send %i bytes to syslog\n", n); } + +/** + * logfile_init() - Open log file and write header with PID and path + * @name: Identifier for header: passt or pasta + * @path: Path to log file + * @size: Maximum size of log file: log_cut_size is calculatd here + */ +void logfile_init(const char *name, const char *path, size_t size) +{ + char nl = '\n', exe[PATH_MAX] = { 0 }; + int n; + + if (readlink("/proc/self/exe", exe, PATH_MAX - 1) < 0) { + perror("readlink /proc/self/exe"); + exit(EXIT_FAILURE); + } + + log_file = open(path, O_CREAT | O_TRUNC | O_APPEND | O_RDWR | O_CLOEXEC, + S_IRUSR | S_IWUSR);DoWith O_APPEND, it would, that's why the caller drops it before we enter this function.+ if (log_file == -1) { + err("Couldn't open log file %s: %s", path, strerror(errno)); + exit(EXIT_FAILURE); + } + + log_size = size ? size : LOGFILE_SIZE_DEFAULT; + + n = snprintf(log_header, sizeof(log_header), "%s: %s (%i)", + name, exe, getpid()); + + if (write(log_file, log_header, n) <= 0 || + write(log_file, &nl, 1) <= 0) { + perror("Couldn't write to log file\n"); + exit(EXIT_FAILURE); + } + + /* For FALLOC_FL_COLLAPSE_RANGE: VFS block size can be up to one page */ + log_cut_size = ROUND_UP(log_size * LOGFILE_CUT_RATIO / 100, PAGE_SIZE); +} + +/** + * logfile_rotate_fallocate() - Write header, set log_written after fallocate() + * @fd: Log file descriptor + * @ts: Current timestamp + * + * #syscalls lseek ppc64le:_llseek ppc64:_llseek armv6l:_llseek armv7l:_llseek + */ +static void logfile_rotate_fallocate(int fd, struct timespec *ts) +{ + char buf[BUFSIZ], *nl; + int n; + + if (lseek(fd, 0, SEEK_SET) == -1) + return; + if (read(fd, buf, BUFSIZ) == -1) + return; + + n = snprintf(buf, BUFSIZ, + "%s - log truncated at %li.%04li", log_header, + ts->tv_sec - log_start, ts->tv_nsec / (100L * 1000)); + + /* Avoid partial lines by padding the header with spaces */ + nl = memchr(buf + n + 1, '\n', BUFSIZ - n - 1); + if (nl) + memset(buf + n, ' ', nl - (buf + n)); + + if (lseek(fd, 0, SEEK_SET) == -1) + return; + if (write(fd, buf, BUFSIZ) == -1)Have you tested this path? I thought with O_APPEND, this write would go at the end, regardless of the lseek() above.Ah, ok.There's a newline here, but not in the other case, because here we don't need any padding, given that we're not affected by the alignment of FALLOC_FL_COLLAPSE_RANGE.+ return; + + log_written -= log_cut_size; +} + +/** + * logfile_rotate_move() - Fallback: move recent entries toward start, then cut + * @fd: Log file descriptor + * @ts: Current timestamp + * + * #syscalls lseek ppc64le:_llseek ppc64:_llseek armv6l:_llseek armv7l:_llseek + * #syscalls ftruncate + */ +static void logfile_rotate_move(int fd, struct timespec *ts) +{ + int header_len, write_offset, end, discard, n; + char buf[BUFSIZ], *nl; + + header_len = snprintf(buf, BUFSIZ, + "%s - log truncated at %li.%04li\n", log_header, + ts->tv_sec - log_start, + ts->tv_nsec / (100L * 1000));A common helper for the truncation header seems like it would be a good idea.I tried to have this in a common path, and it invariably leads to splitting into two writes (header plus newline, or header plus padding), which looks worse than the alternative...Uh.. I must be forgetting my obscure C literal syntax. I assumed 1E6 was 1000000.Um, right, I should move this assignment to the end of the loop, after the lseek() below.+ if (lseek(fd, 0, SEEK_SET) == -1) + return; + if (write(fd, buf, header_len) == -1) + return; + + end = write_offset = header_len; + discard = log_cut_size + header_len; + + /* Try to cut cleanly at newline */ + if (lseek(fd, discard, SEEK_SET) == -1) + goto out; + if ((n = read(fd, buf, BUFSIZ)) <= 0) + goto out; + if ((nl = memchr(buf, '\n', n))) + discard += (nl - buf) + 1; + + /* Go to first block to be moved */ + if (lseek(fd, discard, SEEK_SET) == -1) + goto out; + + while ((n = read(fd, buf, BUFSIZ)) > 0) { + end = header_len;Hrm.. so errors in this section will throw away everything except the header.+ + if (lseek(fd, write_offset, SEEK_SET) == -1) + goto out; + if ((n = write(fd, buf, n)) == -1) + goto out; + write_offset += n; + + if ((n = lseek(fd, 0, SEEK_CUR)) == -1) + goto out; + end = n;But errors after here will just throw away what we haven't moved so far. Seems a bit inconsistent.Yes, definitely, but I'm afraid we need to keep at least a couple of lseek()s too (see BUGS in the man page of pread() implying I need to switch O_APPEND on and off), and that would mean enabling three additional syscalls, instead of just one. And even if I manage to drop all the lseek()s, would this be worth two additional syscalls instead of one? I'm not sure...+ + if (lseek(fd, discard - header_len, SEEK_CUR) == -1) + goto out; + }I wonder if those whole function would be made simpler using pread() and pwrite() instead of lseek()s all over the place.In practice, that's already the case for most archs, with this being 256 pages in the vast majority of cases. If I make it explicit, though, I can't report a default value in the man page, which I feel is an important information to have there.+out: + if (ftruncate(fd, end)) + return; + + log_written = end; +} + +/** + * logfile_rotate() - "Rotate" log file once it's full + * @fd: Log file descriptor + * @ts: Current timestamp + * + * Return: 0 on success, negative error code on failure + * + * #syscalls fallocate fcntl + */ +static int logfile_rotate(int fd, struct timespec *ts) +{ + if (fcntl(fd, F_SETFL, O_RDWR /* Drop O_APPEND: explicit lseek() */)) + return -errno; + + /* Collapsing only for Linux kernel >= 3.15, extent-based ext4 or XFS */ + if (!fallocate(fd, FALLOC_FL_COLLAPSE_RANGE, 0, log_cut_size)) + logfile_rotate_fallocate(fd, ts); + else + logfile_rotate_move(fd, ts); + + if (fcntl(fd, F_SETFL, O_RDWR | O_APPEND)) + return -errno; + + return 0; +} + +/** + * logfile_write() - Write entry to log file, trigger rotation if full + * @pri: Facility and level map, same as priority for vsyslog() + * @format: Same as vsyslog() format + * @ap: Same as vsyslog() ap + */ +void logfile_write(int pri, const char *format, va_list ap) +{ + struct timespec ts; + char buf[BUFSIZ]; + int n; + + if (clock_gettime(CLOCK_REALTIME, &ts)) + return; + + n = snprintf(buf, BUFSIZ, "%li.%04li: %s", + ts.tv_sec - log_start, ts.tv_nsec / (100L * 1000), + logfile_prefix[pri]); + + n += vsnprintf(buf + n, BUFSIZ - n, format, ap); + + if (format[strlen(format)] != '\n') + n += snprintf(buf + n, BUFSIZ - n, "\n"); + + if ((log_written + n >= log_size) && logfile_rotate(log_file, &ts)) + return; + + if ((n = write(log_file, buf, n)) >= 0) + log_written += n; +} diff --git a/log.h b/log.h index 70cad6f..ed71cd7 100644 --- a/log.h +++ b/log.h @@ -6,6 +6,10 @@ #ifndef LOG_H #define LOG_H +#define LOGFILE_SIZE_DEFAULT 1E6Wouldn't a multiple of the block size make more sense?