On Sat, Jan 13, 2024 at 11:51:05PM +0100, Stefano Brivio wrote: > On Thu, 21 Dec 2023 18:02:24 +1100 > David Gibson wrote: > > > tcp_tap_conn has several fields to track addresses and ports as seen by > > the guest/namespace. However we now have general fields for this in the > > common flowside struct. Use those instead of protocol specific fields. > > > > So far we've only explicitly tracked the guest side forwarding address > > in the TCP connection - the remote address from the guest's point of > > view. The tap side endpoint address - the local address from the > > guest's point of view - was assumed to always be one of the handful of > > guest addresses we track as addr_seen (one each for IPv4, IPv6 global > > and IPv6 link-local). > > > > struct flowside expects both addresses, and we will want to use the > > endpoint address in future. So, we need to add code to determine that > > address and record it in the flowside. > > > > Signed-off-by: David Gibson > > --- > > flow.h | 20 +++++++++++++++ > > tcp.c | 75 ++++++++++++++++++++++++++++-------------------------- > > tcp_conn.h | 8 ------ > > 3 files changed, 59 insertions(+), 44 deletions(-) > > > > diff --git a/flow.h b/flow.h > > index e090ba0..e7126e4 100644 > > --- a/flow.h > > +++ b/flow.h > > @@ -45,6 +45,26 @@ struct flowside { > > static_assert(_Alignof(struct flowside) == _Alignof(uint32_t), > > "Unexpected alignment for struct flowside"); > > > > +/** flowside_from_af - Initialize flowside from addresses > > flowside_from_af() - ...from addresses and ports? Yes. I couldn't think of a way of spelling it out more directly without making it very long. I hoped this was clear enough by way of analogy with inany_from_af(). > > + * @fside: flowside to initialize > > + * @pif: pif if of this flowside > > s/if/id/ or s/if/ID/. Fixed. > > + * @af: Address family (AF_INET or AF_INET6) > > + * @faddr: Forwarding address (pointer to in_addr or in6_addr) > > + * @fport: Forwarding port > > + * @eaddr: Endpoint address (pointer to in_addr or in6_addr) > > + * @eport: Endpoint port > > + */ > > +static inline void flowside_from_af(struct flowside *fside, uint8_t pif, int af, > > + const void *faddr, in_port_t fport, > > + const void *eaddr, in_port_t eport) > > +{ > > + fside->pif = pif; > > + inany_from_af(&fside->faddr, af, faddr); > > + inany_from_af(&fside->eaddr, af, eaddr); > > + fside->fport = fport; > > + fside->eport = eport; > > +} > > + > > /** flowside_complete - Check if flowside is fully initialized > > * @fside: flowside to check > > */ > > diff --git a/tcp.c b/tcp.c > > index ddabc34..7ef20b1 100644 > > --- a/tcp.c > > +++ b/tcp.c > > @@ -394,7 +394,9 @@ struct tcp6_l2_head { /* For MSS6 macro: keep in sync with tcp6_l2_buf_t */ > > #define OPT_SACK 5 > > #define OPT_TS 8 > > > > -#define CONN_V4(conn) (!!inany_v4(&(conn)->faddr)) > > +#define TAPFSIDE(conn) (&(conn)->f.side[TAPSIDE]) > > After reviewing the patch I understand what TAPFSIDE() is for... but > I still think the name is pretty obscure. I'm struggling to come up > with a better idea though. Perhaps FLOWSIDE_TAP(conn), or: > > #define FLOWSIDE(conn, _side) (&(conn)->f.side[(_side)]) > > ? > > Then sure, TAPFSIDE ans SOCKFSIDE have the advantage of brevity... but > for instance they don't spare any ugliness in tcp_tap_conn_from_sock() > compared to FLOWSIDE_TAP. Maybe they do in other places I didn't check. Right, the whole point of these macros is brevity - without it expressions we use it in become unreadably verbose. I think FLOWSIDE(conn, TAPSIDE) is too long. FLOWSIDE_TAP(conn) might be barely acceptable. > > + > > +#define CONN_V4(conn) (!!inany_v4(&TAPFSIDE(conn)->faddr)) > > #define CONN_V6(conn) (!CONN_V4(conn)) > > #define CONN_IS_CLOSING(conn) \ > > ((conn->events & ESTABLISHED) && \ > > @@ -845,7 +847,7 @@ static int tcp_rtt_dst_low(const struct tcp_tap_conn *conn) > > int i; > > > > for (i = 0; i < LOW_RTT_TABLE_SIZE; i++) > > - if (inany_equals(&conn->faddr, low_rtt_dst + i)) > > + if (inany_equals(&TAPFSIDE(conn)->faddr, low_rtt_dst + i)) > > return 1; > > > > return 0; > > @@ -867,7 +869,7 @@ static void tcp_rtt_dst_check(const struct tcp_tap_conn *conn, > > return; > > > > for (i = 0; i < LOW_RTT_TABLE_SIZE; i++) { > > - if (inany_equals(&conn->faddr, low_rtt_dst + i)) > > + if (inany_equals(&TAPFSIDE(conn)->faddr, low_rtt_dst + i)) > > return; > > if (hole == -1 && IN6_IS_ADDR_UNSPECIFIED(low_rtt_dst + i)) > > hole = i; > > @@ -879,7 +881,7 @@ static void tcp_rtt_dst_check(const struct tcp_tap_conn *conn, > > if (hole == -1) > > return; > > > > - low_rtt_dst[hole++] = conn->faddr; > > + low_rtt_dst[hole++] = TAPFSIDE(conn)->faddr; > > if (hole == LOW_RTT_TABLE_SIZE) > > hole = 0; > > inany_from_af(low_rtt_dst + hole, AF_INET6, &in6addr_any); > > @@ -1144,8 +1146,8 @@ static int tcp_hash_match(const struct tcp_tap_conn *conn, > > const union inany_addr *faddr, > > in_port_t eport, in_port_t fport) > > { > > - if (inany_equals(&conn->faddr, faddr) && > > - conn->eport == eport && conn->fport == fport) > > + if (inany_equals(&TAPFSIDE(conn)->faddr, faddr) && > > + TAPFSIDE(conn)->eport == eport && TAPFSIDE(conn)->fport == fport) > > return 1; > > > > return 0; > > @@ -1179,7 +1181,8 @@ static uint64_t tcp_hash(const struct ctx *c, const union inany_addr *faddr, > > static uint64_t tcp_conn_hash(const struct ctx *c, > > const struct tcp_tap_conn *conn) > > { > > - return tcp_hash(c, &conn->faddr, conn->eport, conn->fport); > > + return tcp_hash(c, &TAPFSIDE(conn)->faddr, > > + TAPFSIDE(conn)->eport, TAPFSIDE(conn)->fport); > > } > > > > /** > > @@ -1365,13 +1368,13 @@ static size_t tcp_l2_buf_fill_headers(const struct ctx *c, > > void *p, size_t plen, > > const uint16_t *check, uint32_t seq) > > { > > - const struct in_addr *a4 = inany_v4(&conn->faddr); > > + const struct in_addr *a4 = inany_v4(&TAPFSIDE(conn)->faddr); > > size_t ip_len, tlen; > > > > #define SET_TCP_HEADER_COMMON_V4_V6(b, conn, seq) \ > > do { \ > > - b->th.source = htons(conn->fport); \ > > - b->th.dest = htons(conn->eport); \ > > + b->th.source = htons(TAPFSIDE(conn)->fport); \ > > + b->th.dest = htons(TAPFSIDE(conn)->eport); \ > > b->th.seq = htonl(seq); \ > > b->th.ack_seq = htonl(conn->seq_ack_to_tap); \ > > if (conn->events & ESTABLISHED) { \ > > @@ -1389,7 +1392,7 @@ do { \ > > ip_len = plen + sizeof(struct iphdr) + sizeof(struct tcphdr); > > b->iph.tot_len = htons(ip_len); > > b->iph.saddr = a4->s_addr; > > - b->iph.daddr = c->ip4.addr_seen.s_addr; > > + b->iph.daddr = inany_v4(&TAPFSIDE(conn)->eaddr)->s_addr; > > > > if (check) > > b->iph.check = *check; > > @@ -1407,11 +1410,8 @@ do { \ > > ip_len = plen + sizeof(struct ipv6hdr) + sizeof(struct tcphdr); > > > > b->ip6h.payload_len = htons(plen + sizeof(struct tcphdr)); > > - b->ip6h.saddr = conn->faddr.a6; > > - if (IN6_IS_ADDR_LINKLOCAL(&b->ip6h.saddr)) > > - b->ip6h.daddr = c->ip6.addr_ll_seen; > > - else > > - b->ip6h.daddr = c->ip6.addr_seen; > > + b->ip6h.saddr = TAPFSIDE(conn)->faddr.a6; > > + b->ip6h.daddr = TAPFSIDE(conn)->eaddr.a6; > > As I was checking these assignments, it occurred to me that perhaps > laddr/raddr (local/remote), instead of faddr/eaddr, used in the sense of > the comment to struct flowside: > > * @eaddr: Endpoint address (remote address from passt's PoV) > * @faddr: Forwarding address (local address from passt's PoV) > > might be more obvious...? I'm not sure at this point. I don't think so. I introduced the endpoint/forwarding terminology specifically because local and remote often weren't clear. In this one case it might be ok, but there are lots of places where it's not necessarily obvious whether we're talking about local/remote w.r.t. passt or local/remote w.r.t. the guest. > > memset(b->ip6h.flow_lbl, 0, 3); > > > > @@ -1739,26 +1739,21 @@ static void tcp_tap_window_update(struct tcp_tap_conn *conn, unsigned wnd) > > /** > > * tcp_seq_init() - Calculate initial sequence number according to RFC 6528 > > * @c: Execution context > > - * @conn: TCP connection, with faddr, fport and eport populated > > + * @conn: TCP connection, with faddr, fport, eaddr, eport populated > > * @now: Current timestamp > > */ > > static void tcp_seq_init(const struct ctx *c, struct tcp_tap_conn *conn, > > const struct timespec *now) > > { > > struct siphash_state state = SIPHASH_INIT(c->hash_secret); > > - union inany_addr aany; > > uint64_t hash; > > uint32_t ns; > > > > - if (CONN_V4(conn)) > > - inany_from_af(&aany, AF_INET, &c->ip4.addr); > > - else > > - inany_from_af(&aany, AF_INET6, &c->ip6.addr); > > - > > - inany_siphash_feed(&state, &conn->faddr); > > - inany_siphash_feed(&state, &aany); > > + inany_siphash_feed(&state, &TAPFSIDE(conn)->faddr); > > + inany_siphash_feed(&state, &TAPFSIDE(conn)->eaddr); > > hash = siphash_final(&state, 36, > > - (uint64_t)conn->fport << 16 | conn->eport); > > + (uint64_t)TAPFSIDE(conn)->fport << 16 | > > + TAPFSIDE(conn)->eport); > > > > /* 32ns ticks, overflows 32 bits every 137s */ > > ns = (now->tv_sec * 1000000000 + now->tv_nsec) >> 5; > > @@ -1925,8 +1920,6 @@ static void tcp_conn_from_tap(struct ctx *c, > > socklen_t sl; > > int s, mss; > > > > - (void)saddr; > > - > > if (!(flow = flow_alloc())) > > return; > > > > @@ -1953,6 +1946,10 @@ static void tcp_conn_from_tap(struct ctx *c, > > > > conn = &flow->tcp; > > conn->f.type = FLOW_TCP; > > + flowside_from_af(TAPFSIDE(conn), PIF_TAP, af, > > + daddr, ntohs(th->dest), saddr, ntohs(th->source)); > > + ASSERT(flowside_complete(TAPFSIDE(conn))); > > + > > conn->sock = s; > > conn->timer = -1; > > conn_event(c, conn, TAP_SYN_RCVD); > > @@ -1972,8 +1969,6 @@ static void tcp_conn_from_tap(struct ctx *c, > > if (!(conn->wnd_from_tap = (htons(th->window) >> conn->ws_from_tap))) > > conn->wnd_from_tap = 1; > > > > - inany_from_af(&conn->faddr, af, daddr); > > - > > if (af == AF_INET) { > > sa = (struct sockaddr *)&addr4; > > sl = sizeof(addr4); > > @@ -1982,9 +1977,6 @@ static void tcp_conn_from_tap(struct ctx *c, > > sl = sizeof(addr6); > > } > > > > - conn->fport = ntohs(th->dest); > > - conn->eport = ntohs(th->source); > > - > > conn->seq_init_from_tap = ntohl(th->seq); > > conn->seq_from_tap = conn->seq_init_from_tap + 1; > > conn->seq_ack_to_tap = conn->seq_from_tap; > > @@ -2674,10 +2666,21 @@ static void tcp_tap_conn_from_sock(struct ctx *c, > > conn->ws_to_tap = conn->ws_from_tap = 0; > > conn_event(c, conn, SOCK_ACCEPTED); > > > > - inany_from_sockaddr(&conn->faddr, &conn->fport, sa); > > - conn->eport = ref.port; > > + TAPFSIDE(conn)->pif = PIF_HOST; > > + inany_from_sockaddr(&TAPFSIDE(conn)->faddr, &TAPFSIDE(conn)->fport, sa); > > + tcp_snat_inbound(c, &TAPFSIDE(conn)->faddr); > > + > > + if (CONN_V4(conn)) { > > + inany_from_af(&TAPFSIDE(conn)->eaddr, AF_INET, &c->ip4.addr_seen); > > + } else { > > + if (IN6_IS_ADDR_LINKLOCAL(&TAPFSIDE(conn)->faddr.a6)) > > + TAPFSIDE(conn)->eaddr.a6 = c->ip6.addr_ll_seen; > > + else > > + TAPFSIDE(conn)->eaddr.a6 = c->ip6.addr_seen; > > + } > > + TAPFSIDE(conn)->eport = ref.port; > > > > - tcp_snat_inbound(c, &conn->faddr); > > + ASSERT(flowside_complete(TAPFSIDE(conn))); > > > > tcp_seq_init(c, conn, now); > > tcp_hash_insert(c, conn); > > diff --git a/tcp_conn.h b/tcp_conn.h > > index 1a4dcf2..8d9c7bd 100644 > > --- a/tcp_conn.h > > +++ b/tcp_conn.h > > @@ -23,9 +23,6 @@ > > * @ws_to_tap: Window scaling factor advertised to tap/guest > > * @sndbuf: Sending buffer in kernel, rounded to 2 ^ SNDBUF_BITS > > * @seq_dup_ack_approx: Last duplicate ACK number sent to tap > > - * @faddr: Guest side forwarding address (guest's remote address) > > - * @eport: Guest side endpoint port (guest's local port) > > - * @fport: Guest side forwarding port (guest's remote port) > > * @wnd_from_tap: Last window size from tap, unscaled (as received) > > * @wnd_to_tap: Sending window advertised to tap, unscaled (as sent) > > * @seq_to_tap: Next sequence for packets to tap > > @@ -91,11 +88,6 @@ struct tcp_tap_conn { > > > > uint8_t seq_dup_ack_approx; > > > > - > > - union inany_addr faddr; > > - in_port_t eport; > > - in_port_t fport; > > - > > uint16_t wnd_from_tap; > > uint16_t wnd_to_tap; > > > > The rest of this patch looks all good to me, but I'm still reviewing > this series, currently at 6/15. > -- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson