On Tue, Oct 11, 2022 at 01:33:49AM +0200, Stefano Brivio wrote: > It's indeed convenient to use the final destination port in the epoll > reference data, so that we don't have to check the configured port > deltas before establishing connections. However, this doesn't work if > we need to access the original port information to know what to do > once we receive a connection. > > The existing implementation resulted in inbound, spliced connections > with a remapped destination port (and only those) to be attempted on > the outbound side, as we would check the wrong position in port > bitmaps to establish whether to use inbound or outbound sockets. > > For listening spliced sockets, set the port index in the epoll > reference to the original port. Once we get a connection, use the > port delta arrays to find the final destination port, and the > original destination port to decide what socket pool we should use. > > This avoids the need for a reverse mapping table as implemented for > UDP. > > Fixes: 33482d5bf293 ("passt: Add PASTA mode, major rework") > Signed-off-by: Stefano Brivio > --- > tcp.c | 19 ++++++++----------- > tcp_splice.c | 18 ++++++++++++++---- > 2 files changed, 22 insertions(+), 15 deletions(-) > > diff --git a/tcp.c b/tcp.c > index 63153b6..cc08353 100644 > --- a/tcp.c > +++ b/tcp.c > @@ -3084,15 +3084,16 @@ void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events, > void tcp_sock_init(const struct ctx *c, int ns, sa_family_t af, > const void *addr, const char *ifname, in_port_t port) > { > + union tcp_epoll_ref tref_spliced = { .tcp.listen = 1, .tcp.splice = 1 }; > union tcp_epoll_ref tref = { .tcp.listen = 1 }; > const void *bind_addr; > int s; > > - if (ns) { > + if (ns) > tref.tcp.index = (in_port_t)(port + c->tcp.fwd_out.delta[port]); > - } else { > + else > tref.tcp.index = (in_port_t)(port + c->tcp.fwd_in.delta[port]); > - } > + tref_spliced.tcp.index = (in_port_t) port; > > if (af == AF_INET || af == AF_UNSPEC) { > if (!addr && c->mode == MODE_PASTA) > @@ -3100,8 +3101,7 @@ void tcp_sock_init(const struct ctx *c, int ns, sa_family_t af, > else > bind_addr = addr; > > - tref.tcp.v6 = 0; > - tref.tcp.splice = 0; > + tref.tcp.v6 = tref_spliced.tcp.v6 = 0; > > if (!ns) { > s = sock_l4(c, AF_INET, IPPROTO_TCP, bind_addr, ifname, > @@ -3118,9 +3118,8 @@ void tcp_sock_init(const struct ctx *c, int ns, sa_family_t af, > if (c->mode == MODE_PASTA) { > bind_addr = &(uint32_t){ htonl(INADDR_LOOPBACK) }; > > - tref.tcp.splice = 1; > s = sock_l4(c, AF_INET, IPPROTO_TCP, bind_addr, ifname, > - port, tref.u32); > + port, tref_spliced.u32); > if (s >= 0) > tcp_sock_set_bufsize(c, s); > else > @@ -3141,9 +3140,8 @@ void tcp_sock_init(const struct ctx *c, int ns, sa_family_t af, > else > bind_addr = addr; > > - tref.tcp.v6 = 1; > + tref.tcp.v6 = tref_spliced.tcp.v6 = 1; > > - tref.tcp.splice = 0; > if (!ns) { > s = sock_l4(c, AF_INET6, IPPROTO_TCP, bind_addr, ifname, > port, tref.u32); > @@ -3159,9 +3157,8 @@ void tcp_sock_init(const struct ctx *c, int ns, sa_family_t af, > if (c->mode == MODE_PASTA) { > bind_addr = &in6addr_loopback; > > - tref.tcp.splice = 1; > s = sock_l4(c, AF_INET6, IPPROTO_TCP, bind_addr, ifname, > - port, tref.u32); > + port, tref_spliced.u32); > if (s >= 0) > tcp_sock_set_bufsize(c, s); > else > diff --git a/tcp_splice.c b/tcp_splice.c > index 96c31c8..bf5f62c 100644 > --- a/tcp_splice.c > +++ b/tcp_splice.c > @@ -35,6 +35,7 @@ > #include > #include > #include > +#include > #include > #include > #include > @@ -512,13 +513,18 @@ static int tcp_splice_connect_ns(void *arg) > static int tcp_splice_new(const struct ctx *c, struct tcp_splice_conn *conn, > in_port_t port) > { > - struct tcp_splice_connect_ns_arg ns_arg = { c, conn, port, 0 }; > int *p, i, s = -1; > + bool inbound; > > - if (bitmap_isset(c->tcp.fwd_in.map, port)) > + if (bitmap_isset(c->tcp.fwd_in.map, port)) { > p = CONN_V6(conn) ? ns_sock_pool6 : ns_sock_pool4; > - else > + port += c->tcp.fwd_in.delta[port]; > + inbound = true; > + } else { > p = CONN_V6(conn) ? init_sock_pool6 : init_sock_pool4; > + port += c->tcp.fwd_out.delta[port]; > + inbound = false; > + } This smells wrong to me. AFAICT there's nothing inherently wrong with forwarding inbound connections to port 5015 to port 5016 in the namespace *and* forwarding outbound connections to port 5015 to port 5016 on the host. So I think rather than consulting the port map here, each conn object should know if its an inward or outward connection. To make that work with epoll, we'd also need a bit in the ref which says whether it's a socket listening on the host or in the ns. > > for (i = 0; i < TCP_SOCK_POOL_SIZE; i++, p++) { > SWAP(s, *p); > @@ -526,11 +532,15 @@ static int tcp_splice_new(const struct ctx *c, struct tcp_splice_conn *conn, > break; > } > > - if (s < 0 && bitmap_isset(c->tcp.fwd_in.map, port)) { > + /* No socket available in namespace: create a new one for connect() */ > + if (s < 0 && inbound) { > + struct tcp_splice_connect_ns_arg ns_arg = { c, conn, port, 0 }; > + > NS_CALL(tcp_splice_connect_ns, &ns_arg); > return ns_arg.ret; > } > > + /* Otherwise, the socket will connect on the side it was created on */ > return tcp_splice_connect(c, conn, s, port); > } > -- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson