On Thu, Feb 05, 2026 at 12:42:09AM +0100, Stefano Brivio wrote: > Draft, very rudimentary. Notably lacking: > > - any form of error checking, documentation > > - support for anything that's not ports > > - delete/insert operations > > - proper handling for EPOLLHUP / EPOLLERR > > - command responses > > - headers, versioning, etc. > > Example: > > ./pasta --debug --config-net -c /tmp/pasta.conf -t none > > ./pesto /tmp/pasta.conf add 8889 > > Signed-off-by: Stefano Brivio Have some time to look at this, while Bass Strait glides past outside. For the most part, LGTM, as far as it goes. > --- > Makefile | 15 +++-- > conf.c | 158 ++++++++++++++++++++++++++++++++++++++++++++++++++- > conf.h | 12 ++++ > epoll_type.h | 4 ++ > passt.c | 8 +++ > passt.h | 6 ++ > pesto.c | 130 ++++++++++++++++++++++++++++++++++++++++++ > 7 files changed, 328 insertions(+), 5 deletions(-) > create mode 100644 pesto.c > > diff --git a/Makefile b/Makefile > index 91e037b..a68ae6f 100644 > --- a/Makefile > +++ b/Makefile > @@ -44,7 +44,8 @@ PASST_SRCS = arch.c arp.c checksum.c conf.c dhcp.c dhcpv6.c epoll_ctl.c \ > udp_flow.c udp_vu.c util.c vhost_user.c virtio.c vu_common.c > QRAP_SRCS = qrap.c > PASST_REPAIR_SRCS = passt-repair.c > -SRCS = $(PASST_SRCS) $(QRAP_SRCS) $(PASST_REPAIR_SRCS) > +PESTO_SRCS = pesto.c > +SRCS = $(PASST_SRCS) $(QRAP_SRCS) $(PASST_REPAIR_SRCS) $(PESTO_SRCS) > > MANPAGES = passt.1 pasta.1 qrap.1 passt-repair.1 > > @@ -74,9 +75,9 @@ mandir ?= $(datarootdir)/man > man1dir ?= $(mandir)/man1 > > ifeq ($(TARGET_ARCH),x86_64) > -BIN := passt passt.avx2 pasta pasta.avx2 qrap passt-repair > +BIN := passt passt.avx2 pasta pasta.avx2 qrap passt-repair pesto > else > -BIN := passt pasta qrap passt-repair > +BIN := passt pasta qrap passt-repair pesto > endif > > all: $(BIN) $(MANPAGES) docs > @@ -90,6 +91,9 @@ seccomp.h: seccomp.sh $(PASST_SRCS) $(PASST_HEADERS) > seccomp_repair.h: seccomp.sh $(PASST_REPAIR_SRCS) > @ ARCH="$(TARGET_ARCH)" CC="$(CC)" ./seccomp.sh seccomp_repair.h $(PASST_REPAIR_SRCS) > > +seccomp_pesto.h: seccomp.sh $(PESTO_SRCS) > + @ ARCH="$(TARGET_ARCH)" CC="$(CC)" ./seccomp.sh seccomp_pesto.h $(PESTO_SRCS) > + > passt: $(PASST_SRCS) $(HEADERS) > $(CC) $(FLAGS) $(CFLAGS) $(CPPFLAGS) $(PASST_SRCS) -o passt $(LDFLAGS) > > @@ -109,6 +113,9 @@ qrap: $(QRAP_SRCS) passt.h > passt-repair: $(PASST_REPAIR_SRCS) seccomp_repair.h > $(CC) $(FLAGS) $(CFLAGS) $(CPPFLAGS) $(PASST_REPAIR_SRCS) -o passt-repair $(LDFLAGS) > > +pesto: $(PESTO_SRCS) seccomp_pesto.h > + $(CC) $(FLAGS) $(CFLAGS) $(CPPFLAGS) $(PESTO_SRCS) -o pesto $(LDFLAGS) > + > valgrind: EXTRA_SYSCALLS += rt_sigprocmask rt_sigtimedwait rt_sigaction \ > rt_sigreturn getpid gettid kill clock_gettime \ > mmap|mmap2 munmap open unlink gettimeofday futex \ > @@ -118,7 +125,7 @@ valgrind: all > > .PHONY: clean > clean: > - $(RM) $(BIN) *~ *.o seccomp.h seccomp_repair.h pasta.1 \ > + $(RM) $(BIN) *~ *.o seccomp.h seccomp_repair.h seccomp_pesto.h pasta.1 \ > passt.tar passt.tar.gz *.deb *.rpm \ > passt.pid README.plain.md > > diff --git a/conf.c b/conf.c > index a8d0972..013512b 100644 > --- a/conf.c > +++ b/conf.c > @@ -47,6 +47,8 @@ > #include "isolation.h" > #include "log.h" > #include "vhost_user.h" > +#include "epoll_ctl.h" > +#include "conf.h" > > #define NETNS_RUN_DIR "/run/netns" > > @@ -1397,6 +1399,9 @@ static void conf_open_files(struct ctx *c) > c->fd_repair = -1; > } > > + if (c->fd_conf == -1) > + c->fd_conf_listen = sock_unix(c->conf_path); > + Nit: 'fd_conf' or 'conf_path' might suggest to someone a regular config file more than a config / control socket. We don't have a config file, of course, but it could still be misleading to someone new looking at it. > if (*c->pidfile) { > c->pidfile_fd = output_file_open(c->pidfile, O_WRONLY); > if (c->pidfile_fd < 0) > @@ -1437,6 +1442,27 @@ fail: > die("Invalid MAC address: %s", str); > } > > +/** > + * conf_sock_init() - Start listening for connections on configuration socket > + * @c: Execution context > + */ > +static void conf_sock_init(const struct ctx *c) > +{ > + union epoll_ref ref = { .type = EPOLL_TYPE_CONF_LISTEN }; > + > + if (c->fd_conf_listen == -1) > + return; > + > + if (listen(c->fd_conf_listen, 0)) { > + err_perror("listen() on configuration socket"); > + return; > + } > + > + ref.fd = c->fd_conf_listen; > + if (epoll_add(c->epollfd, EPOLLIN | EPOLLHUP | EPOLLET, ref)) > + err("configuration socket epoll_ctl()"); > +} > + > /** > * conf() - Process command-line arguments and set configuration > * @c: Execution context > @@ -1519,9 +1545,10 @@ void conf(struct ctx *c, int argc, char **argv) > {"migrate-exit", no_argument, NULL, 29 }, > {"migrate-no-linger", no_argument, NULL, 30 }, > {"stats", required_argument, NULL, 31 }, > + {"conf-path", required_argument, NULL, 'c' }, > { 0 }, > }; > - const char *optstring = "+dqfel:hs:F:I:p:P:m:a:n:M:g:i:o:D:S:H:461t:u:T:U:"; > + const char *optstring = "+dqfel:hs:c:F:I:p:P:m:a:n:M:g:i:o:D:S:H:461t:u:T:U:"; > const char *logname = (c->mode == MODE_PASTA) ? "pasta" : "passt"; > char userns[PATH_MAX] = { 0 }, netns[PATH_MAX] = { 0 }; > bool copy_addrs_opt = false, copy_routes_opt = false; > @@ -1780,6 +1807,13 @@ void conf(struct ctx *c, int argc, char **argv) > > c->fd_tap = -1; > break; > + case 'c': > + ret = snprintf(c->conf_path, sizeof(c->conf_path), "%s", > + optarg); > + if (ret <= 0 || ret >= (int)sizeof(c->sock_path)) > + die("Invalid configuration path: %s", optarg); > + c->fd_conf_listen = c->fd_conf = -1; > + break; > case 'F': > errno = 0; > fd_tap_opt = strtol(optarg, NULL, 0); > @@ -2207,4 +2241,126 @@ void conf(struct ctx *c, int argc, char **argv) > > if (!c->quiet) > conf_print(c); > + > + conf_sock_init(c); Nit: I'd suggest putting this before the conf_print() and including the socket path (when enabled) in the output from conf_print(). > +} > + > +/** > + * conf_listen_handler() - Handle events on configuration listening socket > + * @c: Execution context > + * @events: epoll events > + * > + * Return: 0 on valid event with new connected socket, error code on failure > + */ > +int conf_listen_handler(struct ctx *c, uint32_t events) > +{ > + union epoll_ref ref = { .type = EPOLL_TYPE_CONF }; > + struct ucred uc; > + socklen_t len; > + int rc; > + > + if (events != EPOLLIN) { > + debug("Spurious event 0x%04x on configuration socket", > + events); This is local user triggered, not guest or peer triggered, so I think it could safely be an err(). > + return EINVAL; > + } > + > + len = sizeof(uc); > + > + /* Another client is already connected: accept and close right away. */ > + if (c->fd_conf != -1) { > + int discard = accept4(c->fd_conf_listen, NULL, NULL, > + SOCK_NONBLOCK); > + > + if (discard == -1) > + return errno; > + > + if (!getsockopt(discard, SOL_SOCKET, SO_PEERCRED, &uc, &len)) > + info("Discarding configuration client, PID %i", uc.pid); > + > + close(discard); > + return EEXIST; > + } > + > + if ((c->fd_conf = accept4(c->fd_conf_listen, NULL, NULL, 0)) < 0) { > + rc = errno; > + debug_perror("accept4() on configuration listening socket"); > + return rc; > + } > + > + if (!getsockopt(c->fd_conf, SOL_SOCKET, SO_PEERCRED, &uc, &len)) > + info("Accepted configuration client, PID %i", uc.pid); Is displaying the client's PID actually going to be useful? By the time this happens, we'll have isolated our own pidns, but the client will likely be outside our pidns. The client's pid as translated into our pidns will therefore not really be meaningful (probably 0 or -1 or something). > + ref.fd = c->fd_conf; > + > + rc = epoll_add(c->epollfd, EPOLLHUP | EPOLLET, ref); EPOLLHUP is implicitly included. Surely we need EPOLLIN as well, no? > + if (rc < 0) { > + debug("epoll_ctl() on configuration socket"); > + close(c->fd_conf); > + c->fd_conf = -1; > + return rc; > + } > + > + return 0; > +} > + > +static struct conf_op conf_ops[128]; #define for buffer size? > +static int conf_ops_bytes_read; > + > +/** > + * conf_handler() - Handle events on configuration socket > + * @c: Execution context > + * @events: epoll events > + */ > +void conf_handler(struct ctx *c, uint32_t events) > +{ > + unsigned i; > + ssize_t n; > + > + n = recv(c->fd_conf, (char *)conf_ops + conf_ops_bytes_read, > + sizeof(conf_ops) - conf_ops_bytes_read, 0); > + if (n < 0) { > + if (errno != EAGAIN && errno != EWOULDBLOCK) { > + err_perror("Failed to read configuration data"); > + goto close; > + } > + return; > + } > + > + conf_ops_bytes_read += n; > + debug("Read %li bytes of configuration data (%i total)", > + n, conf_ops_bytes_read); > + > + if (!(events & EPOLLRDHUP) && !(events & EPOLLHUP)) > + return; Do we need to check for this explicitly? And end-of-stream should show up as a recv() returning <0, no? > + > + if (conf_ops_bytes_read % sizeof(conf_ops[0])) { > + err("Invalid data from configuration socket"); > + goto close; Since we're already assuming we get the config ops in aligned chunks, can we avoid the global buffer and read ops one at a time into a local buffer? > + } > + > +close: > + for (i = 0; i < conf_ops_bytes_read / sizeof(conf_ops[0]); i++) { > + struct fwd_rule *r = &conf_ops[i].r; This is probably already on your plan, but as with migration, I think we should explicitly translate from the control stream format, rather than embedding struct fwd_rule, so that we decouple the internal rule format from the control message format. Also like migration, I think we should make sure the control format is the same, regardless of platform specific padding, alignment, and endianness. > + > + debug("%s rule %i, ports %i-%i:%i", > + conf_ops[i].op == RULE_ADD ? "adding" : "deleting", i, > + r->first, r->last, r->to); > + > + if (conf_ops[i].op == RULE_ADD) { > + fwd_rule_add(&c->tcp.fwd_in, r->flags, &r->addr, > + /* r->ifname */ NULL, Why ignoring r->ifname? Also probably in your plan, but we want to update fwd_rule_add() to make errors non-fatal as a preliminary to this. > + r->first, r->last, r->to); > + } else { > + ; /* TODO */ > + } > + } > + > + fwd_listen_sync(c, &c->tcp.fwd_in, PIF_HOST, IPPROTO_TCP); > + > + debug("Closing configuration socket"); > + epoll_ctl(c->epollfd, EPOLL_CTL_DEL, c->fd_conf, NULL); > + close(c->fd_conf); Why always close() after a single batch of ops? > + conf_ops_bytes_read = 0; > + c->fd_conf = -1; > } > diff --git a/conf.h b/conf.h > index b45ad74..5dbdfa4 100644 > --- a/conf.h > +++ b/conf.h > @@ -8,5 +8,17 @@ > > enum passt_modes conf_mode(int argc, char *argv[]); > void conf(struct ctx *c, int argc, char **argv); > +int conf_listen_handler(struct ctx *c, uint32_t events); > +void conf_handler(struct ctx *c, uint32_t events); > + > +enum conf_rule_op { > + RULE_ADD, > + RULE_DEL, Bexause this is an external interface format, I think we should explicitly assign values to each opcode. > +}; > + > +struct conf_op { > + enum conf_rule_op op; As above, I think we should use a fixed width, fixed endianness field here, so we pin down the stream format to the byte level. > + struct fwd_rule r; > +}; > > #endif /* CONF_H */ > diff --git a/epoll_type.h b/epoll_type.h > index a90ffb6..061325a 100644 > --- a/epoll_type.h > +++ b/epoll_type.h > @@ -46,6 +46,10 @@ enum epoll_type { > EPOLL_TYPE_REPAIR, > /* Netlink neighbour subscription socket */ > EPOLL_TYPE_NL_NEIGH, > + /* Configuration listening socket */ > + EPOLL_TYPE_CONF_LISTEN, > + /* Configuration socket */ > + EPOLL_TYPE_CONF, > > EPOLL_NUM_TYPES, > }; > diff --git a/passt.c b/passt.c > index 7488a84..4b4dc21 100644 > --- a/passt.c > +++ b/passt.c > @@ -80,6 +80,8 @@ char *epoll_type_str[] = { > [EPOLL_TYPE_REPAIR_LISTEN] = "TCP_REPAIR helper listening socket", > [EPOLL_TYPE_REPAIR] = "TCP_REPAIR helper socket", > [EPOLL_TYPE_NL_NEIGH] = "netlink neighbour notifier socket", > + [EPOLL_TYPE_CONF_LISTEN] = "configuration listening socket", > + [EPOLL_TYPE_CONF] = "configuration socket", > }; > static_assert(ARRAY_SIZE(epoll_type_str) == EPOLL_NUM_TYPES, > "epoll_type_str[] doesn't match enum epoll_type"); > @@ -303,6 +305,12 @@ static void passt_worker(void *opaque, int nfds, struct epoll_event *events) > case EPOLL_TYPE_NL_NEIGH: > nl_neigh_notify_handler(c); > break; > + case EPOLL_TYPE_CONF_LISTEN: > + conf_listen_handler(c, eventmask); > + break; > + case EPOLL_TYPE_CONF: > + conf_handler(c, eventmask); > + break; > default: > /* Can't happen */ > ASSERT(0); > diff --git a/passt.h b/passt.h > index 299185b..a3eab83 100644 > --- a/passt.h > +++ b/passt.h > @@ -158,6 +158,7 @@ struct ip6_ctx { > * @foreground: Run in foreground, don't log to stderr by default > * @nofile: Maximum number of open files (ulimit -n) > * @sock_path: Path for UNIX domain socket > + * @conf_path: Path for configuration UNIX domain socket > * @repair_path: TCP_REPAIR helper path, can be "none", empty for default > * @pcap: Path for packet capture file > * @pidfile: Path to PID file, empty string if not configured > @@ -169,6 +170,8 @@ struct ip6_ctx { > * @epollfd: File descriptor for epoll instance > * @fd_tap_listen: File descriptor for listening AF_UNIX socket, if any > * @fd_tap: AF_UNIX socket, tuntap device, or pre-opened socket > + * @fd_conf_listen: Listening configuration socket, if any > + * @fd_conf: Configuration socket, if any > * @fd_repair_listen: File descriptor for listening TCP_REPAIR socket, if any > * @fd_repair: Connected AF_UNIX socket for TCP_REPAIR helper > * @our_tap_mac: Pasta/passt's MAC on the tap link > @@ -222,6 +225,7 @@ struct ctx { > int foreground; > int nofile; > char sock_path[UNIX_PATH_MAX]; > + char conf_path[UNIX_PATH_MAX]; > char repair_path[UNIX_PATH_MAX]; > char pcap[PATH_MAX]; > > @@ -239,6 +243,8 @@ struct ctx { > int epollfd; > int fd_tap_listen; > int fd_tap; > + int fd_conf_listen; > + int fd_conf; > int fd_repair_listen; > int fd_repair; > unsigned char our_tap_mac[ETH_ALEN]; > diff --git a/pesto.c b/pesto.c > new file mode 100644 > index 0000000..64e8922 > --- /dev/null > +++ b/pesto.c > @@ -0,0 +1,130 @@ > +// SPDX-License-Identifier: GPL-2.0-or-later > + > +/* PESTO - Programmable Extensible Socket Translation Orchestrator > + * front-end for passt(1) and pasta(1) forwarding configuration > + * > + * pesto.c - Main program (it's not actually extensible) > + * > + * Copyright (c) 2026 Red Hat GmbH > + * Author: Stefano Brivio > + */ > + > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > + > +#include > +#include > +#include > +#include > + > +#include "seccomp_pesto.h" > +#include "linux_dep.h" > +#include "epoll_type.h" > +#include "util.h" > +#include "passt.h" > +#include "fwd.h" > +#include "conf.h" > + > +/** > + * main() - Entry point and whole program with loop > + * @argc: Argument count > + * @argv: Arguments: socket path, operation, port specifiers > + * > + * Return: 0 on success, won't return on failure > + * > + * #syscalls:pesto connect write close exit_group > + * #syscalls:pesto socket s390x:socketcall i686:socketcall > + * #syscalls:pesto recvfrom recvmsg arm:recv ppc64le:recv > + * #syscalls:pesto sendto sendmsg arm:send ppc64le:send > + */ > +int main(int argc, char **argv) > +{ > + struct sockaddr_un a = { AF_UNIX, "" }; > + struct conf_op conf_ops[128]; > + int count = 0, ret, i, s; > + struct sock_fprog prog; > + > + prctl(PR_SET_DUMPABLE, 0); The rationale for this on the client is not obvious to me. > + prog.len = (unsigned short)sizeof(filter_pesto) / > + sizeof(filter_pesto[0]); > + prog.filter = filter_pesto; > + if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) || > + prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog)) { > + fprintf(stderr, "Failed to apply seccomp filter\n"); > + _exit(1); The case for seccomp also seems weaker for this client (it's not exposed to guest or peer traffic).... > + } > + > + if (argc < 4) { > + fprintf(stderr, > + "Usage: %s PATH add|del SPEC [auto] [weak] \\" > + "[add|del SPEC [auto] [weak]] ...\n", > + argv[0]); IIUC, the code below allows multiple add/del on the same command line, which this usage() doesn't reflect. > + _exit(2); ... and especially the case for excluding malloc() (and the subsequent complications with exit() and various other things) seems pretty weak for this client. > + } > + > + if ((s = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) { > + fprintf(stderr, "Failed to create AF_UNIX socket: %i\n", errno); > + _exit(1); > + } > + > + ret = snprintf(a.sun_path, sizeof(a.sun_path), "%s", argv[1]); > + if (ret <= 0 || ret >= (int)sizeof(a.sun_path)) { > + fprintf(stderr, "Invalid socket path\n"); > + _exit(2); > + } > + > + while (connect(s, (struct sockaddr *)&a, sizeof(a))) { > + if (errno == ECONNREFUSED) > + continue; Is the lack of timeout a "not yet done thing? Or a policy decision that the caller should implement the timeout if it needs one? > + fprintf(stderr, "Failed to connect to %s: %s\n", a.sun_path, > + strerror_(errno)); > + _exit(1); > + } > + > + for (i = 2; i + 2 <= argc; ) { > + struct fwd_rule *r = &conf_ops[count].r; > + > + if (!strcmp(argv[i], "add")) > + conf_ops[count].op = RULE_ADD; > + else if (!strcmp(argv[i], "del")) > + conf_ops[count].op = RULE_DEL; > + else > + _exit(1); > + i++; > + > + /* TODO: Factor relevant part out of conf_ports() and reuse */ > + r->to = r->last = r->first = atoi(argv[i]); > + i++; > + > + if (i < argc && !strcmp(argv[i], "auto")) { > + r->flags |= FWD_SCAN; > + i++; > + } > + > + if (i < argc && !strcmp(argv[i], "weak")) { > + r->flags |= FWD_WEAK; > + i++; > + } > + > + fprintf(stderr, "Read rule for ports %i-%i:%i\n", > + r->first, r->last, r->to); > + count++; Bounds check on count, but you knew that already. > + } > + > + send(s, conf_ops, sizeof(conf_ops[0]) * count, 0); > + > + return 0; > +} > -- > 2.43.0 > -- David Gibson (he or they) | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you, not the other way | around. http://www.ozlabs.org/~dgibson