On Fri, Oct 14, 2022 at 08:36:24AM +0200, Stefano Brivio wrote: > Even if CAP_NET_BIND_SERVICE is granted, we'll lose the capability in > the target user namespace as we isolate the process, which means > we're unable to bind to low ports at that point. > > Bind inbound ports, and only those, before isolate_user(). Keep the > handling of outbound ports (for pasta mode only) after the setup of > the namespace, because that's where we'll bind them. > > To this end, initialise the netlink socket for the init namespace > before isolate_user() as well, as we actually need to know the > addresses of the upstream interface before binding ports, in case > they're not explicitly passed by the user. > > As we now call nl_sock_init() twice, checking its return code from > conf() twice looks a bit heavy: make it exit(), instead, as we > can't do much if we don't have netlink sockets. > > While at it: > > - move the v4_only && v6_only options check just after the first > option processing loop, as this is more strictly related to > option parsing proper > > - update the man page, explaining that CAP_NET_BIND_SERVICE is > *not* the preferred way to bind ports, because passt and pasta > can be abused to allow other processes to make effective usage > of it. Add a note about the recommended sysctl instead > > - simplify nl_sock_init_do() now that it's called once for each > case > > Reported-by: David Gibson > Signed-off-by: Stefano Brivio Reviewed-by: David Gibson > --- > v2: Simplify nl_sock_init_do(), add setcap for passt.avx2 in > man page (David Gibson) > > conf.c | 71 +++++++++++++++++++++++++++---------------------------- > netlink.c | 40 ++++++++++++++++--------------- > netlink.h | 2 +- > passt.1 | 47 ++++++++++++++++++++++++++++++------ > tap.c | 1 + > 5 files changed, 98 insertions(+), 63 deletions(-) > > diff --git a/conf.c b/conf.c > index a22ef48..e1f42f1 100644 > --- a/conf.c > +++ b/conf.c > @@ -1530,6 +1530,11 @@ void conf(struct ctx *c, int argc, char **argv) > } > } while (name != -1); > > + if (v4_only && v6_only) { > + err("Options ipv4-only and ipv6-only are mutually exclusive"); > + usage(argv[0]); > + } > + > ret = conf_ugid(runas, &uid, &gid); > if (ret) > usage(argv[0]); > @@ -1539,6 +1544,30 @@ void conf(struct ctx *c, int argc, char **argv) > logfile, logsize); > } > > + nl_sock_init(c, false); > + if (!v6_only) > + c->ifi4 = conf_ip4(ifi, &c->ip4, c->mac); > + if (!v4_only) > + c->ifi6 = conf_ip6(ifi, &c->ip6, c->mac); > + if (!c->ifi4 && !c->ifi6) { > + err("External interface not usable"); > + exit(EXIT_FAILURE); > + } > + > + /* Inbound port options can be parsed now (after IPv4/IPv6 settings) */ > + optind = 1; > + do { > + struct port_fwd *fwd; > + > + name = getopt_long(argc, argv, optstring, options, NULL); > + > + if ((name == 't' && (fwd = &c->tcp.fwd_in)) || > + (name == 'u' && (fwd = &c->udp.fwd_in.f))) { > + if (!optarg || conf_ports(c, name, optarg, fwd)) > + usage(argv[0]); > + } > + } while (name != -1); > + > if (c->mode == MODE_PASTA) { > if (conf_pasta_ns(&netns_only, userns, netns, > optind, argc, argv) < 0) > @@ -1561,50 +1590,20 @@ void conf(struct ctx *c, int argc, char **argv) > } > } > > - if (nl_sock_init(c)) { > - err("Failed to get netlink socket"); > - exit(EXIT_FAILURE); > - } > - > - if (v4_only && v6_only) { > - err("Options ipv4-only and ipv6-only are mutually exclusive"); > - usage(argv[0]); > - } > - if (!v6_only) > - c->ifi4 = conf_ip4(ifi, &c->ip4, c->mac); > - if (!v4_only) > - c->ifi6 = conf_ip6(ifi, &c->ip6, c->mac); > - if (!c->ifi4 && !c->ifi6) { > - err("External interface not usable"); > - exit(EXIT_FAILURE); > - } > + if (c->mode == MODE_PASTA) > + nl_sock_init(c, true); > > - /* Now we can process port configuration options */ > + /* ...and outbound port options now that namespaces are set up. */ > optind = 1; > do { > - struct port_fwd *fwd = NULL; > + struct port_fwd *fwd; > > name = getopt_long(argc, argv, optstring, options, NULL); > - switch (name) { > - case 't': > - case 'u': > - case 'T': > - case 'U': > - if (name == 't') > - fwd = &c->tcp.fwd_in; > - else if (name == 'T') > - fwd = &c->tcp.fwd_out; > - else if (name == 'u') > - fwd = &c->udp.fwd_in.f; > - else if (name == 'U') > - fwd = &c->udp.fwd_out.f; > > + if ((name == 'T' && (fwd = &c->tcp.fwd_out)) || > + (name == 'U' && (fwd = &c->udp.fwd_out.f))) { > if (!optarg || conf_ports(c, name, optarg, fwd)) > usage(argv[0]); > - > - break; > - default: > - break; > } > } while (name != -1); > > diff --git a/netlink.c b/netlink.c > index 6e5a96b..0850cbe 100644 > --- a/netlink.c > +++ b/netlink.c > @@ -19,6 +19,7 @@ > #include > #include > #include > +#include > #include > #include > #include > @@ -39,57 +40,58 @@ static int nl_sock_ns = -1; > static int nl_seq; > > /** > - * nl_sock_init_do() - Set up netlink sockets in init and target namespace > - * @arg: Execution context > + * nl_sock_init_do() - Set up netlink sockets in init or target namespace > + * @arg: Execution context, if running from namespace, NULL otherwise > * > * Return: 0 > */ > static int nl_sock_init_do(void *arg) > { > struct sockaddr_nl addr = { .nl_family = AF_NETLINK, }; > - int *s = &nl_sock; > + int *s = arg ? &nl_sock_ns : &nl_sock; > #ifdef NETLINK_GET_STRICT_CHK > int y = 1; > #endif > > -ns: > + if (arg) > + ns_enter((struct ctx *)arg); > + > if (((*s) = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE)) < 0 || > - bind(*s, (struct sockaddr *)&addr, sizeof(addr))) > + bind(*s, (struct sockaddr *)&addr, sizeof(addr))) { > *s = -1; > - > - if (*s == -1 || !arg || s == &nl_sock_ns) > return 0; > + } > > #ifdef NETLINK_GET_STRICT_CHK > if (setsockopt(*s, SOL_NETLINK, NETLINK_GET_STRICT_CHK, &y, sizeof(y))) > debug("netlink: cannot set NETLINK_GET_STRICT_CHK on %i", *s); > #endif > - > - ns_enter((struct ctx *)arg); > - s = &nl_sock_ns; > - goto ns; > + return 0; > } > > /** > - * nl_sock_init() - Call nl_sock_init_do() and check for failures > + * nl_sock_init() - Call nl_sock_init_do(), won't return on failure > * @c: Execution context > - * > - * Return: -EIO if sockets couldn't be set up, 0 otherwise > + * @ns: Get socket in namespace, not in init > */ > -int nl_sock_init(const struct ctx *c) > +void nl_sock_init(const struct ctx *c, bool ns) > { > - if (c->mode == MODE_PASTA) { > + if (ns) { > NS_CALL(nl_sock_init_do, c); > if (nl_sock_ns == -1) > - return -EIO; > + goto fail; > } else { > nl_sock_init_do(NULL); > } > > if (nl_sock == -1) > - return -EIO; > + goto fail; > > - return 0; > + return; > + > +fail: > + err("Failed to get netlink socket"); > + exit(EXIT_FAILURE); > } > > /** > diff --git a/netlink.h b/netlink.h > index 5ce5037..3c991cc 100644 > --- a/netlink.h > +++ b/netlink.h > @@ -6,7 +6,7 @@ > #ifndef NETLINK_H > #define NETLINK_H > > -int nl_sock_init(const struct ctx *c); > +void nl_sock_init(const struct ctx *c, bool ns); > unsigned int nl_get_ext_if(sa_family_t af); > void nl_route(int ns, unsigned int ifi, sa_family_t af, void *gw); > void nl_addr(int ns, unsigned int ifi, sa_family_t af, > diff --git a/passt.1 b/passt.1 > index 7d113f2..4848087 100644 > --- a/passt.1 > +++ b/passt.1 > @@ -771,14 +771,47 @@ possible to bind sockets to foreign addresses. > > .SS Binding to low numbered ports (well-known or system ports, up to 1023) > > -If the port forwarding configuration requires binding to port numbers lower than > -1024, \fBpasst\fR and \fBpasta\fR will try to bind to them, but will fail if not > -running as root, or without the \fICAP_NET_BIND_SERVICE\fR Linux capability, see > -\fBservices\fR(5) and \fBcapabilities\fR(7). To grant the > -\fICAP_NET_BIND_SERVICE\fR capability to passt, you can issue, as root: > +If the port forwarding configuration requires binding to ports with numbers > +lower than 1024, \fBpasst\fR and \fBpasta\fR will try to bind to them, but will > +fail, unless, either: > + > +.IP \(bu 2 > +the \fIsys.net.ipv4.ip_unprivileged_port_start\fR sysctl is set to the number > +of the lowest port \fBpasst\fR and \fBpasta\fR need. For example, as root: > + > +.nf > + sysctl -w net.ipv4.ip_unprivileged_port_start=443 > +.fi > + > +\fBNote\fR: this is the recommended way of enabling \fBpasst\fR and \fBpasta\fR > +to bind to ports with numbers below 1024. > + > +.IP \(bu > +or the \fICAP_NET_BIND_SERVICE\fR Linux capability is granted, see > +\fBservices\fR(5) and \fBcapabilities\fR(7). > + > +This is, in general, \fBnot the recommended way\fR, because \fBpasst\fR and > +\fBpasta\fR might be used as vector to effectively use this capability from > +another process. > + > +However, if your environment is sufficiently controlled by an LSM (Linux > +Security Module) such as \fIAppArmor\fR, \fISELinux\fR, \fISmack\fR or > +\fITOMOYO\fR, and no other processes can interact in such a way in virtue of > +this, granting this capability to \fBpasst\fR and \fBpasta\fR only can > +effectively prevent other processes from utilising it. > + > +Note that this will not work for automatic detection and forwarding of ports > +with \fBpasta\fR, because \fBpasta\fR will relinquish this capability at > +runtime. > + > +To grant this capability, you can issue, as root: > + > +.nf > + for p in $(which passt passt.avx2); do > + setcap 'cap_net_bind_service=+ep' "${p}" > + done > +.fi > > -.RS > -setcap 'cap_net_bind_service=+ep' $(which passt) > .RE > > .SS ICMP/ICMPv6 Echo sockets > diff --git a/tap.c b/tap.c > index 77e513c..8b6d9bc 100644 > --- a/tap.c > +++ b/tap.c > @@ -30,6 +30,7 @@ > #include > #include > #include > +#include > #include > #include > #include -- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson