On Fri, 7 Oct 2022 17:51:52 +1100 David Gibson wrote: > On Fri, Oct 07, 2022 at 02:47:38AM +0200, Stefano Brivio wrote: > > In some environments, such as KubeVirt pods, we might not have a > > system logger available. We could choose to run in foreground, but > > this takes away the convenient synchronisation mechanism derived from > > forking to background when interfaces are ready. > > > > Add optional logging to file with -l/--log-file and --log-size. > > > > Unfortunately, this means we need to duplicate features that are more > > appropriately implemented by a system logger, such as rotation. Keep > > that reasonably simple, by using fallocate() with range collapsing > > where supported (Linux kernel >= 3.15, extent-based ext4 and XFS) and > > falling back to an unsophisticated block-by-block moving of entries > > toward the beginning of the file once we reach the (mandatory) size > > limit. > > > > While at it, clarify the role of LOG_EMERG in passt.c. > > > > Signed-off-by: Stefano Brivio > > --- > > Makefile | 2 +- > > README.md | 2 +- > > conf.c | 49 ++++++++++- > > log.c | 251 ++++++++++++++++++++++++++++++++++++++++++++++++++---- > > log.h | 6 ++ > > passt.1 | 14 ++- > > passt.c | 1 + > > 7 files changed, 300 insertions(+), 25 deletions(-) > > > > diff --git a/Makefile b/Makefile > > index 74bbfeb..fafb024 100644 > > --- a/Makefile > > +++ b/Makefile > > @@ -41,7 +41,7 @@ SRCS = $(PASST_SRCS) $(QRAP_SRCS) > > MANPAGES = passt.1 pasta.1 qrap.1 > > > > PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h icmp.h \ > > - isolation.h lineread.h ndp.h netlink.h packet.h passt.h pasta.h \ > > + isolation.h lineread.h log.h ndp.h netlink.h packet.h passt.h pasta.h \ > > pcap.h port_fwd.h siphash.h tap.h tcp.h tcp_splice.h udp.h util.h > > HEADERS = $(PASST_HEADERS) seccomp.h > > > > diff --git a/README.md b/README.md > > index 2c522b2..3dc4fc5 100644 > > --- a/README.md > > +++ b/README.md > > @@ -289,7 +289,7 @@ speeding up local connections, and usually requiring NAT. _pasta_: > > * ✅ all capabilities dropped, other than `CAP_NET_BIND_SERVICE` (if granted) > > * ✅ with default options, user, mount, IPC, UTS, PID namespaces are detached > > * ✅ no external dependencies (other than a standard C library) > > -* ✅ restrictive seccomp profiles (26 syscalls allowed for _passt_, 40 for > > +* ✅ restrictive seccomp profiles (30 syscalls allowed for _passt_, 41 for > > _pasta_ on x86_64) > > That's a pity, but I guess we need it. Yeah :( I started having thoughts about adding some granularity to seccomp profiles based on what arguments are passed, but it still looked a bit overkill just for this. > > * ✅ examples of [AppArmor](/passt/tree/contrib/apparmor) and > > [SELinux](/passt/tree/contrib/selinux) profiles available > > diff --git a/conf.c b/conf.c > > index e9cd4a7..f22940b 100644 > > --- a/conf.c > > +++ b/conf.c > > @@ -633,6 +633,9 @@ static void usage(const char *name) > > info( " default: run in background if started from a TTY"); > > info( " -e, --stderr Log to stderr too"); > > info( " default: log to system logger only if started from a TTY"); > > + info( " -l, --log-file PATH Log (only) to given file"); > > + info( " --log-size BYTES Maximum size of log file"); > > + info( " default: 1 MiB"); > > info( " --runas UID|UID:GID Run as given UID, GID, which can be"); > > info( " numeric, or login and group names"); > > info( " default: drop to user \"nobody\""); > > @@ -994,6 +997,7 @@ void conf(struct ctx *c, int argc, char **argv) > > {"quiet", no_argument, NULL, 'q' }, > > {"foreground", no_argument, NULL, 'f' }, > > {"stderr", no_argument, NULL, 'e' }, > > + {"log-file", required_argument, NULL, 'l' }, > > {"help", no_argument, NULL, 'h' }, > > {"socket", required_argument, NULL, 's' }, > > {"ns-ifname", required_argument, NULL, 'I' }, > > @@ -1034,26 +1038,28 @@ void conf(struct ctx *c, int argc, char **argv) > > {"no-netns-quit", no_argument, NULL, 10 }, > > {"trace", no_argument, NULL, 11 }, > > {"runas", required_argument, NULL, 12 }, > > + {"log-size", required_argument, NULL, 13 }, > > { 0 }, > > }; > > struct get_bound_ports_ns_arg ns_ports_arg = { .c = c }; > > char userns[PATH_MAX] = { 0 }, netns[PATH_MAX] = { 0 }; > > bool v4_only = false, v6_only = false; > > + char *runas = NULL, *logfile = NULL; > > struct in6_addr *dns6 = c->ip6.dns; > > struct fqdn *dnss = c->dns_search; > > uint32_t *dns4 = c->ip4.dns; > > int name, ret, mask, b, i; > > const char *optstring; > > unsigned int ifi = 0; > > - char *runas = NULL; > > + size_t logsize = 0; > > uid_t uid; > > gid_t gid; > > > > if (c->mode == MODE_PASTA) { > > c->no_dhcp_dns = c->no_dhcp_dns_search = 1; > > - optstring = "dqfehI:p:P:m:a:n:M:g:i:D:S:46t:u:T:U:"; > > + optstring = "dqfel:hI:p:P:m:a:n:M:g:i:D:S:46t:u:T:U:"; > > } else { > > - optstring = "dqfehs:p:P:m:a:n:M:g:i:D:S:46t:u:"; > > + optstring = "dqfel:hs:p:P:m:a:n:M:g:i:D:S:46t:u:"; > > } > > > > c->tcp.fwd_in.mode = c->tcp.fwd_out.mode = 0; > > @@ -1177,6 +1183,20 @@ void conf(struct ctx *c, int argc, char **argv) > > > > runas = optarg; > > break; > > + case 13: > > + if (logsize) { > > + err("Multiple --log-size options given"); > > + usage(argv[0]); > > + } > > + > > + errno = 0; > > + logsize = strtol(optarg, NULL, 0); > > + > > + if (logsize < LOGFILE_SIZE_MIN || errno) { > > + err("Invalid --log-size: %s", optarg); > > + usage(argv[0]); > > + } > > + break; > > case 'd': > > if (c->debug) { > > err("Multiple --debug options given"); > > @@ -1192,6 +1212,11 @@ void conf(struct ctx *c, int argc, char **argv) > > c->foreground = 1; > > break; > > case 'e': > > + if (logfile) { > > + err("Can't log to both file and stderr"); > > + usage(argv[0]); > > + } > > + > > if (c->stderr) { > > err("Multiple --stderr options given"); > > usage(argv[0]); > > @@ -1199,6 +1224,19 @@ void conf(struct ctx *c, int argc, char **argv) > > > > c->stderr = 1; > > break; > > + case 'l': > > + if (c->stderr) { > > + err("Can't log to both stderr and file"); > > + usage(argv[0]); > > + } > > + > > + if (logfile) { > > + err("Multiple --log-file options given"); > > + usage(argv[0]); > > + } > > + > > + logfile = optarg; > > + break; > > case 'q': > > if (c->quiet) { > > err("Multiple --quiet options given"); > > @@ -1460,6 +1498,11 @@ void conf(struct ctx *c, int argc, char **argv) > > if (ret) > > usage(argv[0]); > > > > + if (logfile) { > > + logfile_init(c->mode == MODE_PASST ? "passt" : "pasta", > > + logfile, logsize); > > + } > > + > > if (c->mode == MODE_PASTA) { > > if (conf_pasta_ns(&netns_only, userns, netns, > > optind, argc, argv) < 0) > > diff --git a/log.c b/log.c > > index 54a7cbb..1e405e3 100644 > > --- a/log.c > > +++ b/log.c > > @@ -12,7 +12,12 @@ > > * Author: Stefano Brivio > > */ > > > > +#include > > +#include > > +#include > > +#include > > #include > > +#include > > #include > > #include > > #include > > @@ -20,36 +25,49 @@ > > #include > > #include > > #include > > -#include > > > > #include "log.h" > > +#include "util.h" > > +#include "passt.h" > > > > -/* For __openlog() and __setlogmask() wrappers, and passt_vsyslog() */ > > -static int log_mask; > > -static int log_sock = -1; > > -static char log_ident[BUFSIZ]; > > -static int log_opt; > > -static time_t log_debug_start; > > -int log_trace; > > +static int log_sock = -1; /* Optional socket to system logger */ > > +static char log_ident[BUFSIZ]; /* Identifier string for openlog() */ > > +static int log_mask; /* Current log priority mask */ > > +static int log_opt; /* Options for openlog() */ > > + > > +static int log_file = -1; /* Optional log file descriptor */ > > +static size_t log_size; /* Maximum log file size in bytes */ > > +static size_t log_written; /* Currently used bytes in log file */ > > +static size_t log_cut_size; /* Bytes to cut at start on rotation */ > > +static char log_header[BUFSIZ]; /* File header, written back on cuts */ > > + > > +static time_t log_start; /* Start timestamp */ > > +int log_trace; /* --trace mode enabled */ > > > > #define logfn(name, level) \ > > void name(const char *format, ...) { \ > > struct timespec tp; \ > > va_list args; \ > > \ > > - if (setlogmask(0) & LOG_MASK(LOG_DEBUG)) { \ > > + if (setlogmask(0) & LOG_MASK(LOG_DEBUG) && log_file == -1) { \ > > clock_gettime(CLOCK_REALTIME, &tp); \ > > fprintf(stderr, "%li.%04li: ", \ > > - tp.tv_sec - log_debug_start, \ > > + tp.tv_sec - log_start, \ > > tp.tv_nsec / (100L * 1000)); \ > > - } else { \ > > + } \ > > + \ > > + if ((LOG_MASK(LOG_PRI(level)) & log_mask) || \ > > + setlogmask(0) == LOG_MASK(LOG_EMERG)) { \ > > va_start(args, format); \ > > - passt_vsyslog(level, format, args); \ > > + if (log_file != -1) \ > > + logfile_write(level, format, args); \ > > + else if (!(setlogmask(0) & LOG_MASK(LOG_DEBUG))) \ > > + passt_vsyslog(level, format, args); \ > > va_end(args); \ > > } \ > > \ > > - if (setlogmask(0) & LOG_MASK(LOG_DEBUG) || \ > > - setlogmask(0) == LOG_MASK(LOG_EMERG)) { \ > > + if ((setlogmask(0) & LOG_MASK(LOG_DEBUG) || \ > > + setlogmask(0) == LOG_MASK(LOG_EMERG)) && log_file == -1) { \ > > va_start(args, format); \ > > (void)vfprintf(stderr, format, args); \ > > va_end(args); \ > > @@ -58,6 +76,16 @@ void name(const char *format, ...) { \ > > } \ > > } > > > > +/* Prefixes for log file messages, indexed by priority */ > > +const char *logfile_prefix[] = { > > + NULL, NULL, NULL, /* Unused: LOG_EMERG, LOG_ALERT, LOG_CRIT */ > > + "ERROR: ", > > + "WARNING: ", > > + NULL, /* Unused: LOG_NOTICE */ > > + "info: ", > > + " ", /* LOG_DEBUG */ > > +}; > > + > > logfn(err, LOG_ERR) > > logfn(warn, LOG_WARNING) > > logfn(info, LOG_INFO) > > @@ -79,7 +107,7 @@ void __openlog(const char *ident, int option, int facility) > > struct timespec tp; > > > > clock_gettime(CLOCK_REALTIME, &tp); > > - log_debug_start = tp.tv_sec; > > + log_start = tp.tv_sec; > > > > if (log_sock < 0) { > > struct sockaddr_un a = { .sun_family = AF_UNIX, }; > > @@ -124,9 +152,6 @@ void passt_vsyslog(int pri, const char *format, va_list ap) > > char buf[BUFSIZ]; > > int n; > > > > - if (!(LOG_MASK(LOG_PRI(pri)) & log_mask)) > > - return; > > - > > /* Send without name and timestamp, the system logger should add them */ > > n = snprintf(buf, BUFSIZ, "<%i> ", pri); > > > > @@ -141,3 +166,193 @@ void passt_vsyslog(int pri, const char *format, va_list ap) > > if (send(log_sock, buf, n, 0) != n) > > fprintf(stderr, "Failed to send %i bytes to syslog\n", n); > > } > > + > > +/** > > + * logfile_init() - Open log file and write header with PID and path > > + * @name: Identifier for header: passt or pasta > > + * @path: Path to log file > > + * @size: Maximum size of log file: log_cut_size is calculatd here > > + */ > > +void logfile_init(const char *name, const char *path, size_t size) > > +{ > > + char nl = '\n', exe[PATH_MAX] = { 0 }; > > + int n; > > + > > + if (readlink("/proc/self/exe", exe, PATH_MAX - 1) < 0) { > > + perror("readlink /proc/self/exe"); > > + exit(EXIT_FAILURE); > > + } > > + > > + log_file = open(path, O_CREAT | O_TRUNC | O_APPEND | O_RDWR | O_CLOEXEC, > > + S_IRUSR | S_IWUSR); > > Do ? > > + if (log_file == -1) { > > + err("Couldn't open log file %s: %s", path, strerror(errno)); > > + exit(EXIT_FAILURE); > > + } > > + > > + log_size = size ? size : LOGFILE_SIZE_DEFAULT; > > + > > + n = snprintf(log_header, sizeof(log_header), "%s: %s (%i)", > > + name, exe, getpid()); > > + > > + if (write(log_file, log_header, n) <= 0 || > > + write(log_file, &nl, 1) <= 0) { > > + perror("Couldn't write to log file\n"); > > + exit(EXIT_FAILURE); > > + } > > + > > + /* For FALLOC_FL_COLLAPSE_RANGE: VFS block size can be up to one page */ > > + log_cut_size = ROUND_UP(log_size * LOGFILE_CUT_RATIO / 100, PAGE_SIZE); > > +} > > + > > +/** > > + * logfile_rotate_fallocate() - Write header, set log_written after fallocate() > > + * @fd: Log file descriptor > > + * @ts: Current timestamp > > + * > > + * #syscalls lseek ppc64le:_llseek ppc64:_llseek armv6l:_llseek armv7l:_llseek > > + */ > > +static void logfile_rotate_fallocate(int fd, struct timespec *ts) > > +{ > > + char buf[BUFSIZ], *nl; > > + int n; > > + > > + if (lseek(fd, 0, SEEK_SET) == -1) > > + return; > > + if (read(fd, buf, BUFSIZ) == -1) > > + return; > > + > > + n = snprintf(buf, BUFSIZ, > > + "%s - log truncated at %li.%04li", log_header, > > + ts->tv_sec - log_start, ts->tv_nsec / (100L * 1000)); > > + > > + /* Avoid partial lines by padding the header with spaces */ > > + nl = memchr(buf + n + 1, '\n', BUFSIZ - n - 1); > > + if (nl) > > + memset(buf + n, ' ', nl - (buf + n)); > > + > > + if (lseek(fd, 0, SEEK_SET) == -1) > > + return; > > + if (write(fd, buf, BUFSIZ) == -1) > > Have you tested this path? I thought with O_APPEND, this write would > go at the end, regardless of the lseek() above. With O_APPEND, it would, that's why the caller drops it before we enter this function. > > + return; > > + > > + log_written -= log_cut_size; > > +} > > + > > +/** > > + * logfile_rotate_move() - Fallback: move recent entries toward start, then cut > > + * @fd: Log file descriptor > > + * @ts: Current timestamp > > + * > > + * #syscalls lseek ppc64le:_llseek ppc64:_llseek armv6l:_llseek armv7l:_llseek > > + * #syscalls ftruncate > > + */ > > +static void logfile_rotate_move(int fd, struct timespec *ts) > > +{ > > + int header_len, write_offset, end, discard, n; > > + char buf[BUFSIZ], *nl; > > + > > + header_len = snprintf(buf, BUFSIZ, > > + "%s - log truncated at %li.%04li\n", log_header, > > + ts->tv_sec - log_start, > > + ts->tv_nsec / (100L * 1000)); > > A common helper for the truncation header seems like it would be a > good idea. There's a newline here, but not in the other case, because here we don't need any padding, given that we're not affected by the alignment of FALLOC_FL_COLLAPSE_RANGE. I tried to have this in a common path, and it invariably leads to splitting into two writes (header plus newline, or header plus padding), which looks worse than the alternative... > > + if (lseek(fd, 0, SEEK_SET) == -1) > > + return; > > + if (write(fd, buf, header_len) == -1) > > + return; > > + > > + end = write_offset = header_len; > > + discard = log_cut_size + header_len; > > + > > + /* Try to cut cleanly at newline */ > > + if (lseek(fd, discard, SEEK_SET) == -1) > > + goto out; > > + if ((n = read(fd, buf, BUFSIZ)) <= 0) > > + goto out; > > + if ((nl = memchr(buf, '\n', n))) > > + discard += (nl - buf) + 1; > > + > > + /* Go to first block to be moved */ > > + if (lseek(fd, discard, SEEK_SET) == -1) > > + goto out; > > + > > + while ((n = read(fd, buf, BUFSIZ)) > 0) { > > + end = header_len; > > Hrm.. so errors in this section will throw away everything except the header. > > > + > > + if (lseek(fd, write_offset, SEEK_SET) == -1) > > + goto out; > > + if ((n = write(fd, buf, n)) == -1) > > + goto out; > > + write_offset += n; > > + > > + if ((n = lseek(fd, 0, SEEK_CUR)) == -1) > > + goto out; > > + end = n; > > But errors after here will just throw away what we haven't moved so > far. Seems a bit inconsistent. Um, right, I should move this assignment to the end of the loop, after the lseek() below. > > + > > + if (lseek(fd, discard - header_len, SEEK_CUR) == -1) > > + goto out; > > + } > > I wonder if those whole function would be made simpler using pread() > and pwrite() instead of lseek()s all over the place. Yes, definitely, but I'm afraid we need to keep at least a couple of lseek()s too (see BUGS in the man page of pread() implying I need to switch O_APPEND on and off), and that would mean enabling three additional syscalls, instead of just one. And even if I manage to drop all the lseek()s, would this be worth two additional syscalls instead of one? I'm not sure... > > +out: > > + if (ftruncate(fd, end)) > > + return; > > + > > + log_written = end; > > +} > > + > > +/** > > + * logfile_rotate() - "Rotate" log file once it's full > > + * @fd: Log file descriptor > > + * @ts: Current timestamp > > + * > > + * Return: 0 on success, negative error code on failure > > + * > > + * #syscalls fallocate fcntl > > + */ > > +static int logfile_rotate(int fd, struct timespec *ts) > > +{ > > + if (fcntl(fd, F_SETFL, O_RDWR /* Drop O_APPEND: explicit lseek() */)) > > + return -errno; > > + > > + /* Collapsing only for Linux kernel >= 3.15, extent-based ext4 or XFS */ > > + if (!fallocate(fd, FALLOC_FL_COLLAPSE_RANGE, 0, log_cut_size)) > > + logfile_rotate_fallocate(fd, ts); > > + else > > + logfile_rotate_move(fd, ts); > > + > > + if (fcntl(fd, F_SETFL, O_RDWR | O_APPEND)) > > + return -errno; > > + > > + return 0; > > +} > > + > > +/** > > + * logfile_write() - Write entry to log file, trigger rotation if full > > + * @pri: Facility and level map, same as priority for vsyslog() > > + * @format: Same as vsyslog() format > > + * @ap: Same as vsyslog() ap > > + */ > > +void logfile_write(int pri, const char *format, va_list ap) > > +{ > > + struct timespec ts; > > + char buf[BUFSIZ]; > > + int n; > > + > > + if (clock_gettime(CLOCK_REALTIME, &ts)) > > + return; > > + > > + n = snprintf(buf, BUFSIZ, "%li.%04li: %s", > > + ts.tv_sec - log_start, ts.tv_nsec / (100L * 1000), > > + logfile_prefix[pri]); > > + > > + n += vsnprintf(buf + n, BUFSIZ - n, format, ap); > > + > > + if (format[strlen(format)] != '\n') > > + n += snprintf(buf + n, BUFSIZ - n, "\n"); > > + > > + if ((log_written + n >= log_size) && logfile_rotate(log_file, &ts)) > > + return; > > + > > + if ((n = write(log_file, buf, n)) >= 0) > > + log_written += n; > > +} > > diff --git a/log.h b/log.h > > index 70cad6f..ed71cd7 100644 > > --- a/log.h > > +++ b/log.h > > @@ -6,6 +6,10 @@ > > #ifndef LOG_H > > #define LOG_H > > > > +#define LOGFILE_SIZE_DEFAULT 1E6 > > Wouldn't a multiple of the block size make more sense? In practice, that's already the case for most archs, with this being 256 pages in the vast majority of cases. If I make it explicit, though, I can't report a default value in the man page, which I feel is an important information to have there. Also... does it really matter? fallocate(), here, just cares about blocks we drop from the beginning, not blocks we use in total. -- Stefano