On Thu, Sep 07, 2023 at 03:02:53AM +0200, Stefano Brivio wrote: > On Mon, 28 Aug 2023 15:41:46 +1000 > David Gibson wrote: > > > Every flow in the flow table now has space for the the addresses as seen by > > both the host and guest side. We fill that information in for regular > > "tap" TCP connections, but not for spliced connections. > > > > Fill in that information for spliced connections too, so it's now uniformly > > available for all flow types (that we've implemented so far). > > > > Signed-off-by: David Gibson > > --- > > tcp.c | 46 +++++++++++++++++++--------------------------- > > tcp_splice.c | 40 ++++++++++++++++++++++++++-------------- > > tcp_splice.h | 3 +-- > > 3 files changed, 46 insertions(+), 43 deletions(-) > > > > diff --git a/tcp.c b/tcp.c > > index 297134f..7459fc2 100644 > > --- a/tcp.c > > +++ b/tcp.c > > @@ -2639,37 +2639,25 @@ static void tcp_snat_inbound(const struct ctx *c, union inany_addr *addr) > > * tcp_tap_conn_from_sock() - Initialize state for non-spliced connection > > * @c: Execution context > > * @ref: epoll reference of listening socket > > - * @conn: connection structure to initialize > > + * @conn: connection structure (with TAPSIDE(@conn) completed) > > * @s: Accepted socket > > - * @sa: Peer socket address (from accept()) > > * @now: Current timestamp > > - * > > - * Return: true if able to create a tap connection, false otherwise > > */ > > -static bool tcp_tap_conn_from_sock(struct ctx *c, > > +static void tcp_tap_conn_from_sock(struct ctx *c, > > union tcp_listen_epoll_ref ref, > > struct tcp_tap_conn *conn, int s, > > - struct sockaddr *sa, > > const struct timespec *now) > > { > > char fsstr[FLOWSIDE_STRLEN]; > > > > + ASSERT(flowside_complete(SOCKSIDE(conn))); > > + > > conn->f.type = FLOW_TCP; > > conn->sock = s; > > conn->timer = -1; > > conn->ws_to_tap = conn->ws_from_tap = 0; > > conn_event(c, conn, SOCK_ACCEPTED); > > > > - if (flowside_getsockname(SOCKSIDE(conn), s) < 0) { > > - err("tcp: Failed to get local name, connection dropped"); > > - return false; > > - } > > - inany_from_sockaddr(&SOCKSIDE(conn)->eaddr, &SOCKSIDE(conn)->eport, sa); > > - > > - ASSERT(flowside_complete(SOCKSIDE(conn))); > > - debug("TCP: index %li, new connection from socket, %s", FLOW_IDX(conn), > > - flowside_fmt(SOCKSIDE(conn), fsstr, sizeof(fsstr))); > > - > > TAPSIDE(conn)->faddr = SOCKSIDE(conn)->eaddr; > > TAPSIDE(conn)->fport = SOCKSIDE(conn)->eport; > > tcp_snat_inbound(c, &TAPSIDE(conn)->faddr); > > @@ -2699,8 +2687,6 @@ static bool tcp_tap_conn_from_sock(struct ctx *c, > > conn_flag(c, conn, ACK_FROM_TAP_DUE); > > > > tcp_get_sndbuf(conn); > > - > > - return true; > > } > > > > /** > > @@ -2712,6 +2698,7 @@ static bool tcp_tap_conn_from_sock(struct ctx *c, > > void tcp_listen_handler(struct ctx *c, union epoll_ref ref, > > const struct timespec *now) > > { > > + char fsstr[FLOWSIDE_STRLEN]; > > struct sockaddr_storage sa; > > union flow *flow; > > socklen_t sl; > > @@ -2730,20 +2717,25 @@ void tcp_listen_handler(struct ctx *c, union epoll_ref ref, > > if (s < 0) > > return; > > > > - flow = flowtab + c->flow_count++; > > + flow = flowtab + c->flow_count; > > > > - if (c->mode == MODE_PASTA && > > - tcp_splice_conn_from_sock(c, ref.tcp_listen, &flow->tcp_splice, > > - s, (struct sockaddr *)&sa)) > > + if (flowside_getsockname(&flow->f.side[0], s) < 0) { > > + err("tcp: Failed to get local name, connection dropped"); > > + close(s); > > return; > > + } > > + inany_from_sockaddr(&flow->f.side[0].eaddr, &flow->f.side[0].eport, > > + &sa); > > + c->flow_count++; > > > > - if (tcp_tap_conn_from_sock(c, ref.tcp_listen, &flow->tcp, s, > > - (struct sockaddr *)&sa, now)) > > + debug("TCP: index %li, new connection from socket, %s", FLOW_IDX(flow), > > + flowside_fmt(&flow->f.side[0], fsstr, sizeof(fsstr))); > > + > > + if (c->mode == MODE_PASTA && > > + tcp_splice_conn_from_sock(c, ref.tcp_listen, &flow->tcp_splice, s)) > > return; > > > > - /* Failed to create the connection */ > > - close(s); > > - c->flow_count--; > > + tcp_tap_conn_from_sock(c, ref.tcp_listen, &flow->tcp, s, now); > > } > > > > /** > > diff --git a/tcp_splice.c b/tcp_splice.c > > index 676e7e8..018d095 100644 > > --- a/tcp_splice.c > > +++ b/tcp_splice.c > > @@ -73,6 +73,9 @@ static int ns_sock_pool6 [TCP_SOCK_POOL_SIZE]; > > /* Pool of pre-opened pipes */ > > static int splice_pipe_pool [TCP_SPLICE_PIPE_POOL_SIZE][2][2]; > > > > +#define ASIDE(conn) (&(conn)->f.side[0]) > > +#define BSIDE(conn) (&(conn)->f.side[1]) > > + > > #define CONN_V6(x) (x->flags & SPLICE_V6) > > #define CONN_V4(x) (!CONN_V6(x)) > > #define CONN_HAS(conn, set) ((conn->events & (set)) == (set)) > > @@ -310,7 +313,16 @@ void tcp_splice_destroy(struct ctx *c, union flow *flow) > > static int tcp_splice_connect_finish(const struct ctx *c, > > struct tcp_splice_conn *conn) > > { > > - int i; > > + char fsstr[FLOWSIDE_STRLEN]; > > + int i, rc; > > + > > + rc = flowside_getsockname(BSIDE(conn), conn->b); > > + if (rc) > > + return rc; > > + > > + ASSERT(flowside_complete(BSIDE(conn))); > > + debug("TCP (splice): index %li, connection forwarded, %s", FLOW_IDX(conn), > > + flowside_fmt(BSIDE(conn), fsstr, sizeof(fsstr))); > > > > conn->pipe_a_b[0] = conn->pipe_b_a[0] = -1; > > conn->pipe_a_b[1] = conn->pipe_b_a[1] = -1; > > @@ -386,10 +398,13 @@ static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn, > > if (CONN_V6(conn)) { > > sa = (struct sockaddr *)&addr6; > > sl = sizeof(addr6); > > + inany_from_af(&BSIDE(conn)->eaddr, AF_INET6, &addr6.sin6_addr); > > } else { > > sa = (struct sockaddr *)&addr4; > > sl = sizeof(addr4); > > + inany_from_af(&BSIDE(conn)->eaddr, AF_INET, &addr4.sin_addr); > > } > > + BSIDE(conn)->eport = port; > > > > if (connect(conn->b, sa, sl)) { > > if (errno != EINPROGRESS) { > > @@ -480,33 +495,30 @@ static void tcp_splice_dir(struct tcp_splice_conn *conn, int ref_sock, > > * tcp_splice_conn_from_sock() - Attempt to init state for a spliced connection > > * @c: Execution context > > * @ref: epoll reference of listening socket > > - * @conn: connection structure to initialize > > + * @conn: connection structure (with ASIDE(@conn) completed) > > * @s: Accepted socket > > - * @sa: Peer address of connection > > * > > * Return: true if able to create a spliced connection, false otherwise > > * #syscalls:pasta setsockopt > > */ > > bool tcp_splice_conn_from_sock(struct ctx *c, union tcp_listen_epoll_ref ref, > > - struct tcp_splice_conn *conn, int s, > > - const struct sockaddr *sa) > > + struct tcp_splice_conn *conn, int s) > > { > > - const struct in_addr *a4; > > - union inany_addr aany; > > - in_port_t port; > > + const struct in_addr *e4 = inany_v4(&ASIDE(conn)->eaddr); > > + const struct in_addr *f4 = inany_v4(&ASIDE(conn)->faddr); > > > > ASSERT(c->mode == MODE_PASTA); > > + ASSERT(flowside_complete(ASIDE(conn))); > > > > - inany_from_sockaddr(&aany, &port, sa); > > - a4 = inany_v4(&aany); > > - > > - if (a4) { > > - if (!IN4_IS_ADDR_LOOPBACK(a4)) > > + if (e4) { > > + if (!IN4_IS_ADDR_LOOPBACK(e4)) > > return false; > > + ASSERT(f4 && IN4_IS_ADDR_LOOPBACK(f4)); > > I can't follow this: the test you're replacing is actually (still) a > test used by tcp_listen_handler() unless I'm missing something. I'm not replacing a test - I'm just rewriting the same test, then adding an assert. > Returning false here should simply mean we can't use a spliced > connection, not that something is wrong. Yes. The 'if' checks if this is a spliceable connection - we have a loopback endpoint (remote to pasta) address. The assert is then checking that the forwarding address (local to pasta) is also loopback - i.e. that we don't have traffic that's going from 127.0.0.1 to a non-loopback address which would be nonsense. I guess this does indicate that the kernel has given us something weird, rather than an internal bug, so maybe it should be log an error and drop the connection rather than asserting. > > > conn->flags = 0; > > } else { > > - if (!IN6_IS_ADDR_LOOPBACK(&aany.a6)) > > + if (!IN6_IS_ADDR_LOOPBACK(&ASIDE(conn)->eaddr.a6)) > > return false; > > + ASSERT(IN6_IS_ADDR_LOOPBACK(&ASIDE(conn)->faddr.a6)); > > ...same here. > > Everything else in the series looks good to me! It looks simpler (so > far) than I thought it would be. > -- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson