On Fri, 7 Oct 2022 19:57:19 +1100 David Gibson wrote: > On Fri, Oct 07, 2022 at 10:11:45AM +0200, Stefano Brivio wrote: > > On Fri, 7 Oct 2022 17:51:52 +1100 > > David Gibson wrote: > > > > > On Fri, Oct 07, 2022 at 02:47:38AM +0200, Stefano Brivio wrote: > > > > In some environments, such as KubeVirt pods, we might not have a > > > > system logger available. We could choose to run in foreground, but > > > > this takes away the convenient synchronisation mechanism derived from > > > > forking to background when interfaces are ready. > > > > > > > > Add optional logging to file with -l/--log-file and --log-size. > > > > > > > > Unfortunately, this means we need to duplicate features that are more > > > > appropriately implemented by a system logger, such as rotation. Keep > > > > that reasonably simple, by using fallocate() with range collapsing > > > > where supported (Linux kernel >= 3.15, extent-based ext4 and XFS) and > > > > falling back to an unsophisticated block-by-block moving of entries > > > > toward the beginning of the file once we reach the (mandatory) size > > > > limit. > > > > > > > > While at it, clarify the role of LOG_EMERG in passt.c. > > > > > > > > Signed-off-by: Stefano Brivio > > > > --- > > > > Makefile | 2 +- > > > > README.md | 2 +- > > > > conf.c | 49 ++++++++++- > > > > log.c | 251 ++++++++++++++++++++++++++++++++++++++++++++++++++---- > > > > log.h | 6 ++ > > > > passt.1 | 14 ++- > > > > passt.c | 1 + > > > > 7 files changed, 300 insertions(+), 25 deletions(-) > > > > > > > > diff --git a/Makefile b/Makefile > > > > index 74bbfeb..fafb024 100644 > > > > --- a/Makefile > > > > +++ b/Makefile > > > > @@ -41,7 +41,7 @@ SRCS = $(PASST_SRCS) $(QRAP_SRCS) > > > > MANPAGES = passt.1 pasta.1 qrap.1 > > > > > > > > PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h icmp.h \ > > > > - isolation.h lineread.h ndp.h netlink.h packet.h passt.h pasta.h \ > > > > + isolation.h lineread.h log.h ndp.h netlink.h packet.h passt.h pasta.h \ > > > > pcap.h port_fwd.h siphash.h tap.h tcp.h tcp_splice.h udp.h util.h > > > > HEADERS = $(PASST_HEADERS) seccomp.h > > > > > > > > diff --git a/README.md b/README.md > > > > index 2c522b2..3dc4fc5 100644 > > > > --- a/README.md > > > > +++ b/README.md > > > > @@ -289,7 +289,7 @@ speeding up local connections, and usually requiring NAT. _pasta_: > > > > * ✅ all capabilities dropped, other than `CAP_NET_BIND_SERVICE` (if granted) > > > > * ✅ with default options, user, mount, IPC, UTS, PID namespaces are detached > > > > * ✅ no external dependencies (other than a standard C library) > > > > -* ✅ restrictive seccomp profiles (26 syscalls allowed for _passt_, 40 for > > > > +* ✅ restrictive seccomp profiles (30 syscalls allowed for _passt_, 41 for > > > > _pasta_ on x86_64) > > > > > > That's a pity, but I guess we need it. > > > > Yeah :( I started having thoughts about adding some granularity to > > seccomp profiles based on what arguments are passed, but it still looked > > a bit overkill just for this. > > > > > > * ✅ examples of [AppArmor](/passt/tree/contrib/apparmor) and > > > > [SELinux](/passt/tree/contrib/selinux) profiles available > > > > diff --git a/conf.c b/conf.c > > > > index e9cd4a7..f22940b 100644 > > > > --- a/conf.c > > > > +++ b/conf.c > > > > @@ -633,6 +633,9 @@ static void usage(const char *name) > > > > info( " default: run in background if started from a TTY"); > > > > info( " -e, --stderr Log to stderr too"); > > > > info( " default: log to system logger only if started from a TTY"); > > > > + info( " -l, --log-file PATH Log (only) to given file"); > > > > + info( " --log-size BYTES Maximum size of log file"); > > > > + info( " default: 1 MiB"); > > > > info( " --runas UID|UID:GID Run as given UID, GID, which can be"); > > > > info( " numeric, or login and group names"); > > > > info( " default: drop to user \"nobody\""); > > > > @@ -994,6 +997,7 @@ void conf(struct ctx *c, int argc, char **argv) > > > > {"quiet", no_argument, NULL, 'q' }, > > > > {"foreground", no_argument, NULL, 'f' }, > > > > {"stderr", no_argument, NULL, 'e' }, > > > > + {"log-file", required_argument, NULL, 'l' }, > > > > {"help", no_argument, NULL, 'h' }, > > > > {"socket", required_argument, NULL, 's' }, > > > > {"ns-ifname", required_argument, NULL, 'I' }, > > > > @@ -1034,26 +1038,28 @@ void conf(struct ctx *c, int argc, char **argv) > > > > {"no-netns-quit", no_argument, NULL, 10 }, > > > > {"trace", no_argument, NULL, 11 }, > > > > {"runas", required_argument, NULL, 12 }, > > > > + {"log-size", required_argument, NULL, 13 }, > > > > { 0 }, > > > > }; > > > > struct get_bound_ports_ns_arg ns_ports_arg = { .c = c }; > > > > char userns[PATH_MAX] = { 0 }, netns[PATH_MAX] = { 0 }; > > > > bool v4_only = false, v6_only = false; > > > > + char *runas = NULL, *logfile = NULL; > > > > struct in6_addr *dns6 = c->ip6.dns; > > > > struct fqdn *dnss = c->dns_search; > > > > uint32_t *dns4 = c->ip4.dns; > > > > int name, ret, mask, b, i; > > > > const char *optstring; > > > > unsigned int ifi = 0; > > > > - char *runas = NULL; > > > > + size_t logsize = 0; > > > > uid_t uid; > > > > gid_t gid; > > > > > > > > if (c->mode == MODE_PASTA) { > > > > c->no_dhcp_dns = c->no_dhcp_dns_search = 1; > > > > - optstring = "dqfehI:p:P:m:a:n:M:g:i:D:S:46t:u:T:U:"; > > > > + optstring = "dqfel:hI:p:P:m:a:n:M:g:i:D:S:46t:u:T:U:"; > > > > } else { > > > > - optstring = "dqfehs:p:P:m:a:n:M:g:i:D:S:46t:u:"; > > > > + optstring = "dqfel:hs:p:P:m:a:n:M:g:i:D:S:46t:u:"; > > > > } > > > > > > > > c->tcp.fwd_in.mode = c->tcp.fwd_out.mode = 0; > > > > @@ -1177,6 +1183,20 @@ void conf(struct ctx *c, int argc, char **argv) > > > > > > > > runas = optarg; > > > > break; > > > > + case 13: > > > > + if (logsize) { > > > > + err("Multiple --log-size options given"); > > > > + usage(argv[0]); > > > > + } > > > > + > > > > + errno = 0; > > > > + logsize = strtol(optarg, NULL, 0); > > > > + > > > > + if (logsize < LOGFILE_SIZE_MIN || errno) { > > > > + err("Invalid --log-size: %s", optarg); > > > > + usage(argv[0]); > > > > + } > > > > + break; > > > > case 'd': > > > > if (c->debug) { > > > > err("Multiple --debug options given"); > > > > @@ -1192,6 +1212,11 @@ void conf(struct ctx *c, int argc, char **argv) > > > > c->foreground = 1; > > > > break; > > > > case 'e': > > > > + if (logfile) { > > > > + err("Can't log to both file and stderr"); > > > > + usage(argv[0]); > > > > + } > > > > + > > > > if (c->stderr) { > > > > err("Multiple --stderr options given"); > > > > usage(argv[0]); > > > > @@ -1199,6 +1224,19 @@ void conf(struct ctx *c, int argc, char **argv) > > > > > > > > c->stderr = 1; > > > > break; > > > > + case 'l': > > > > + if (c->stderr) { > > > > + err("Can't log to both stderr and file"); > > > > + usage(argv[0]); > > > > + } > > > > + > > > > + if (logfile) { > > > > + err("Multiple --log-file options given"); > > > > + usage(argv[0]); > > > > + } > > > > + > > > > + logfile = optarg; > > > > + break; > > > > case 'q': > > > > if (c->quiet) { > > > > err("Multiple --quiet options given"); > > > > @@ -1460,6 +1498,11 @@ void conf(struct ctx *c, int argc, char **argv) > > > > if (ret) > > > > usage(argv[0]); > > > > > > > > + if (logfile) { > > > > + logfile_init(c->mode == MODE_PASST ? "passt" : "pasta", > > > > + logfile, logsize); > > > > + } > > > > + > > > > if (c->mode == MODE_PASTA) { > > > > if (conf_pasta_ns(&netns_only, userns, netns, > > > > optind, argc, argv) < 0) > > > > diff --git a/log.c b/log.c > > > > index 54a7cbb..1e405e3 100644 > > > > --- a/log.c > > > > +++ b/log.c > > > > @@ -12,7 +12,12 @@ > > > > * Author: Stefano Brivio > > > > */ > > > > > > > > +#include > > > > +#include > > > > +#include > > > > +#include > > > > #include > > > > +#include > > > > #include > > > > #include > > > > #include > > > > @@ -20,36 +25,49 @@ > > > > #include > > > > #include > > > > #include > > > > -#include > > > > > > > > #include "log.h" > > > > +#include "util.h" > > > > +#include "passt.h" > > > > > > > > -/* For __openlog() and __setlogmask() wrappers, and passt_vsyslog() */ > > > > -static int log_mask; > > > > -static int log_sock = -1; > > > > -static char log_ident[BUFSIZ]; > > > > -static int log_opt; > > > > -static time_t log_debug_start; > > > > -int log_trace; > > > > +static int log_sock = -1; /* Optional socket to system logger */ > > > > +static char log_ident[BUFSIZ]; /* Identifier string for openlog() */ > > > > +static int log_mask; /* Current log priority mask */ > > > > +static int log_opt; /* Options for openlog() */ > > > > + > > > > +static int log_file = -1; /* Optional log file descriptor */ > > > > +static size_t log_size; /* Maximum log file size in bytes */ > > > > +static size_t log_written; /* Currently used bytes in log file */ > > > > +static size_t log_cut_size; /* Bytes to cut at start on rotation */ > > > > +static char log_header[BUFSIZ]; /* File header, written back on cuts */ > > > > + > > > > +static time_t log_start; /* Start timestamp */ > > > > +int log_trace; /* --trace mode enabled */ > > > > > > > > #define logfn(name, level) \ > > > > void name(const char *format, ...) { \ > > > > struct timespec tp; \ > > > > va_list args; \ > > > > \ > > > > - if (setlogmask(0) & LOG_MASK(LOG_DEBUG)) { \ > > > > + if (setlogmask(0) & LOG_MASK(LOG_DEBUG) && log_file == -1) { \ > > > > clock_gettime(CLOCK_REALTIME, &tp); \ > > > > fprintf(stderr, "%li.%04li: ", \ > > > > - tp.tv_sec - log_debug_start, \ > > > > + tp.tv_sec - log_start, \ > > > > tp.tv_nsec / (100L * 1000)); \ > > > > - } else { \ > > > > + } \ > > > > + \ > > > > + if ((LOG_MASK(LOG_PRI(level)) & log_mask) || \ > > > > + setlogmask(0) == LOG_MASK(LOG_EMERG)) { \ > > > > va_start(args, format); \ > > > > - passt_vsyslog(level, format, args); \ > > > > + if (log_file != -1) \ > > > > + logfile_write(level, format, args); \ > > > > + else if (!(setlogmask(0) & LOG_MASK(LOG_DEBUG))) \ > > > > + passt_vsyslog(level, format, args); \ > > > > va_end(args); \ > > > > } \ > > > > \ > > > > - if (setlogmask(0) & LOG_MASK(LOG_DEBUG) || \ > > > > - setlogmask(0) == LOG_MASK(LOG_EMERG)) { \ > > > > + if ((setlogmask(0) & LOG_MASK(LOG_DEBUG) || \ > > > > + setlogmask(0) == LOG_MASK(LOG_EMERG)) && log_file == -1) { \ > > > > va_start(args, format); \ > > > > (void)vfprintf(stderr, format, args); \ > > > > va_end(args); \ > > > > @@ -58,6 +76,16 @@ void name(const char *format, ...) { \ > > > > } \ > > > > } > > > > > > > > +/* Prefixes for log file messages, indexed by priority */ > > > > +const char *logfile_prefix[] = { > > > > + NULL, NULL, NULL, /* Unused: LOG_EMERG, LOG_ALERT, LOG_CRIT */ > > > > + "ERROR: ", > > > > + "WARNING: ", > > > > + NULL, /* Unused: LOG_NOTICE */ > > > > + "info: ", > > > > + " ", /* LOG_DEBUG */ > > > > +}; > > > > + > > > > logfn(err, LOG_ERR) > > > > logfn(warn, LOG_WARNING) > > > > logfn(info, LOG_INFO) > > > > @@ -79,7 +107,7 @@ void __openlog(const char *ident, int option, int facility) > > > > struct timespec tp; > > > > > > > > clock_gettime(CLOCK_REALTIME, &tp); > > > > - log_debug_start = tp.tv_sec; > > > > + log_start = tp.tv_sec; > > > > > > > > if (log_sock < 0) { > > > > struct sockaddr_un a = { .sun_family = AF_UNIX, }; > > > > @@ -124,9 +152,6 @@ void passt_vsyslog(int pri, const char *format, va_list ap) > > > > char buf[BUFSIZ]; > > > > int n; > > > > > > > > - if (!(LOG_MASK(LOG_PRI(pri)) & log_mask)) > > > > - return; > > > > - > > > > /* Send without name and timestamp, the system logger should add them */ > > > > n = snprintf(buf, BUFSIZ, "<%i> ", pri); > > > > > > > > @@ -141,3 +166,193 @@ void passt_vsyslog(int pri, const char *format, va_list ap) > > > > if (send(log_sock, buf, n, 0) != n) > > > > fprintf(stderr, "Failed to send %i bytes to syslog\n", n); > > > > } > > > > + > > > > +/** > > > > + * logfile_init() - Open log file and write header with PID and path > > > > + * @name: Identifier for header: passt or pasta > > > > + * @path: Path to log file > > > > + * @size: Maximum size of log file: log_cut_size is calculatd here > > > > + */ > > > > +void logfile_init(const char *name, const char *path, size_t size) > > > > +{ > > > > + char nl = '\n', exe[PATH_MAX] = { 0 }; > > > > + int n; > > > > + > > > > + if (readlink("/proc/self/exe", exe, PATH_MAX - 1) < 0) { > > > > + perror("readlink /proc/self/exe"); > > > > + exit(EXIT_FAILURE); > > > > + } > > > > + > > > > + log_file = open(path, O_CREAT | O_TRUNC | O_APPEND | O_RDWR | O_CLOEXEC, > > > > + S_IRUSR | S_IWUSR); > > > > > > Do > > > > ? > > > > > > + if (log_file == -1) { > > > > + err("Couldn't open log file %s: %s", path, strerror(errno)); > > > > + exit(EXIT_FAILURE); > > > > + } > > > > + > > > > + log_size = size ? size : LOGFILE_SIZE_DEFAULT; > > > > + > > > > + n = snprintf(log_header, sizeof(log_header), "%s: %s (%i)", > > > > + name, exe, getpid()); > > > > + > > > > + if (write(log_file, log_header, n) <= 0 || > > > > + write(log_file, &nl, 1) <= 0) { > > > > + perror("Couldn't write to log file\n"); > > > > + exit(EXIT_FAILURE); > > > > + } > > > > + > > > > + /* For FALLOC_FL_COLLAPSE_RANGE: VFS block size can be up to one page */ > > > > + log_cut_size = ROUND_UP(log_size * LOGFILE_CUT_RATIO / 100, PAGE_SIZE); > > > > +} > > > > + > > > > +/** > > > > + * logfile_rotate_fallocate() - Write header, set log_written after fallocate() > > > > + * @fd: Log file descriptor > > > > + * @ts: Current timestamp > > > > + * > > > > + * #syscalls lseek ppc64le:_llseek ppc64:_llseek armv6l:_llseek armv7l:_llseek > > > > + */ > > > > +static void logfile_rotate_fallocate(int fd, struct timespec *ts) > > > > +{ > > > > + char buf[BUFSIZ], *nl; > > > > + int n; > > > > + > > > > + if (lseek(fd, 0, SEEK_SET) == -1) > > > > + return; > > > > + if (read(fd, buf, BUFSIZ) == -1) > > > > + return; > > > > + > > > > + n = snprintf(buf, BUFSIZ, > > > > + "%s - log truncated at %li.%04li", log_header, > > > > + ts->tv_sec - log_start, ts->tv_nsec / (100L * 1000)); > > > > + > > > > + /* Avoid partial lines by padding the header with spaces */ > > > > + nl = memchr(buf + n + 1, '\n', BUFSIZ - n - 1); > > > > + if (nl) > > > > + memset(buf + n, ' ', nl - (buf + n)); > > > > + > > > > + if (lseek(fd, 0, SEEK_SET) == -1) > > > > + return; > > > > + if (write(fd, buf, BUFSIZ) == -1) > > > > > > Have you tested this path? I thought with O_APPEND, this write would > > > go at the end, regardless of the lseek() above. > > > > With O_APPEND, it would, that's why the caller drops it before we enter > > this function. > > Duh, sorry. I completely missed those fcntl()s. > > > > > + return; > > > > + > > > > + log_written -= log_cut_size; > > > > +} > > > > + > > > > +/** > > > > + * logfile_rotate_move() - Fallback: move recent entries toward start, then cut > > > > + * @fd: Log file descriptor > > > > + * @ts: Current timestamp > > > > + * > > > > + * #syscalls lseek ppc64le:_llseek ppc64:_llseek armv6l:_llseek armv7l:_llseek > > > > + * #syscalls ftruncate > > > > + */ > > > > +static void logfile_rotate_move(int fd, struct timespec *ts) > > > > +{ > > > > + int header_len, write_offset, end, discard, n; > > > > + char buf[BUFSIZ], *nl; > > > > + > > > > + header_len = snprintf(buf, BUFSIZ, > > > > + "%s - log truncated at %li.%04li\n", log_header, > > > > + ts->tv_sec - log_start, > > > > + ts->tv_nsec / (100L * 1000)); > > > > > > A common helper for the truncation header seems like it would be a > > > good idea. > > > > There's a newline here, but not in the other case, because here we > > don't need any padding, given that we're not affected by the alignment > > of FALLOC_FL_COLLAPSE_RANGE. > > Ah, ok. > > > I tried to have this in a common path, and it invariably leads to > > splitting into two writes (header plus newline, or header plus > > padding), which looks worse than the alternative... > > > > > > + if (lseek(fd, 0, SEEK_SET) == -1) > > > > + return; > > > > + if (write(fd, buf, header_len) == -1) > > > > + return; > > > > + > > > > + end = write_offset = header_len; > > > > + discard = log_cut_size + header_len; > > > > + > > > > + /* Try to cut cleanly at newline */ > > > > + if (lseek(fd, discard, SEEK_SET) == -1) > > > > + goto out; > > > > + if ((n = read(fd, buf, BUFSIZ)) <= 0) > > > > + goto out; > > > > + if ((nl = memchr(buf, '\n', n))) > > > > + discard += (nl - buf) + 1; > > > > + > > > > + /* Go to first block to be moved */ > > > > + if (lseek(fd, discard, SEEK_SET) == -1) > > > > + goto out; > > > > + > > > > + while ((n = read(fd, buf, BUFSIZ)) > 0) { > > > > + end = header_len; > > > > > > Hrm.. so errors in this section will throw away everything except the header. > > > > > > > + > > > > + if (lseek(fd, write_offset, SEEK_SET) == -1) > > > > + goto out; > > > > + if ((n = write(fd, buf, n)) == -1) > > > > + goto out; > > > > + write_offset += n; > > > > + > > > > + if ((n = lseek(fd, 0, SEEK_CUR)) == -1) > > > > + goto out; > > > > + end = n; > > > > > > But errors after here will just throw away what we haven't moved so > > > far. Seems a bit inconsistent. > > > > Um, right, I should move this assignment to the end of the loop, after > > the lseek() below. > > > > > > + > > > > + if (lseek(fd, discard - header_len, SEEK_CUR) == -1) > > > > + goto out; > > > > + } > > > > > > I wonder if those whole function would be made simpler using pread() > > > and pwrite() instead of lseek()s all over the place. > > > > Yes, definitely, but I'm afraid we need to keep at least a couple of > > lseek()s too (see BUGS in the man page of pread() implying I need to > > switch O_APPEND on and off), and that would mean enabling three > > additional syscalls, instead of just one. > > > > And even if I manage to drop all the lseek()s, would this be worth two > > additional syscalls instead of one? I'm not sure... > > > > > > +out: > > > > + if (ftruncate(fd, end)) > > > > + return; > > > > + > > > > + log_written = end; > > > > +} > > > > + > > > > +/** > > > > + * logfile_rotate() - "Rotate" log file once it's full > > > > + * @fd: Log file descriptor > > > > + * @ts: Current timestamp > > > > + * > > > > + * Return: 0 on success, negative error code on failure > > > > + * > > > > + * #syscalls fallocate fcntl > > > > + */ > > > > +static int logfile_rotate(int fd, struct timespec *ts) > > > > +{ > > > > + if (fcntl(fd, F_SETFL, O_RDWR /* Drop O_APPEND: explicit lseek() */)) > > > > + return -errno; > > > > + > > > > + /* Collapsing only for Linux kernel >= 3.15, extent-based ext4 or XFS */ > > > > + if (!fallocate(fd, FALLOC_FL_COLLAPSE_RANGE, 0, log_cut_size)) > > > > + logfile_rotate_fallocate(fd, ts); > > > > + else > > > > + logfile_rotate_move(fd, ts); > > > > + > > > > + if (fcntl(fd, F_SETFL, O_RDWR | O_APPEND)) > > > > + return -errno; > > > > + > > > > + return 0; > > > > +} > > > > + > > > > +/** > > > > + * logfile_write() - Write entry to log file, trigger rotation if full > > > > + * @pri: Facility and level map, same as priority for vsyslog() > > > > + * @format: Same as vsyslog() format > > > > + * @ap: Same as vsyslog() ap > > > > + */ > > > > +void logfile_write(int pri, const char *format, va_list ap) > > > > +{ > > > > + struct timespec ts; > > > > + char buf[BUFSIZ]; > > > > + int n; > > > > + > > > > + if (clock_gettime(CLOCK_REALTIME, &ts)) > > > > + return; > > > > + > > > > + n = snprintf(buf, BUFSIZ, "%li.%04li: %s", > > > > + ts.tv_sec - log_start, ts.tv_nsec / (100L * 1000), > > > > + logfile_prefix[pri]); > > > > + > > > > + n += vsnprintf(buf + n, BUFSIZ - n, format, ap); > > > > + > > > > + if (format[strlen(format)] != '\n') > > > > + n += snprintf(buf + n, BUFSIZ - n, "\n"); > > > > + > > > > + if ((log_written + n >= log_size) && logfile_rotate(log_file, &ts)) > > > > + return; > > > > + > > > > + if ((n = write(log_file, buf, n)) >= 0) > > > > + log_written += n; > > > > +} > > > > diff --git a/log.h b/log.h > > > > index 70cad6f..ed71cd7 100644 > > > > --- a/log.h > > > > +++ b/log.h > > > > @@ -6,6 +6,10 @@ > > > > #ifndef LOG_H > > > > #define LOG_H > > > > > > > > +#define LOGFILE_SIZE_DEFAULT 1E6 > > > > > > Wouldn't a multiple of the block size make more sense? > > > > In practice, that's already the case for most archs, with this being > > 256 pages in the vast majority of cases. If I make it explicit, though, > > I can't report a default value in the man page, which I feel is an > > important information to have there. > > Uh.. I must be forgetting my obscure C literal syntax. I assumed 1E6 > was 1000000. Whoops, it actually is (of course, it's powers of 10!). I'll change this to (1024 * 1024), even if it doesn't really matter for fallocate() purposes. -- Stefano