From mboxrd@z Thu Jan 1 00:00:00 1970 Received: from gandalf.ozlabs.org (mail.ozlabs.org [IPv6:2404:9400:2221:ea00::3]) by passt.top (Postfix) with ESMTPS id 89ABF5A0275 for ; Thu, 21 Dec 2023 08:02:43 +0100 (CET) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gibson.dropbear.id.au; s=202312; t=1703142160; bh=5GcGt/RCKH0Hg6gmNEU5PmH29hAC74687tv2zYpfymA=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=QKieNCi4Z29YJklIW/tqsv2a/vKOIMcDiD2qFdGNmav0EfMh45i3oNoBnA0nMD3D5 BA4xzSQQH+my1BcqUYJ9Y5lRu+InVsCZnF2fgdsla4PDRnJyXVn1gCAWMbcH6SWCWQ HTVs5hAaQA1l9SD3gnoTr+HHPrQQ5uHmmB0GH32PaM159amUBFUhYejpeZYBZrFn1v FBojQSwiX1WzDc31FgfIaV1JbgmcZ11aEyMz2n5cUIOWvfuBf+W57ZKnnbgsNWzdYK 4oAl2OWc3jFCOlEbmRMtGDM8aTB5AK578uXZtsuraT8+ZptLtz0CBIGL36mXDxsfaO P82z9WFiXcdSA== Received: by gandalf.ozlabs.org (Postfix, from userid 1007) id 4SwhFr6qb1z4xCp; Thu, 21 Dec 2023 18:02:40 +1100 (AEDT) From: David Gibson To: Stefano Brivio , passt-dev@passt.top Subject: [PATCH v3 02/15] tcp, flow: Maintain guest side flow information Date: Thu, 21 Dec 2023 18:02:24 +1100 Message-ID: <20231221070237.1422557-3-david@gibson.dropbear.id.au> X-Mailer: git-send-email 2.43.0 In-Reply-To: <20231221070237.1422557-1-david@gibson.dropbear.id.au> References: <20231221070237.1422557-1-david@gibson.dropbear.id.au> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit Message-ID-Hash: PAHQT3YPKMGRTVKCMFYTA4JOYGCUPK2H X-Message-ID-Hash: PAHQT3YPKMGRTVKCMFYTA4JOYGCUPK2H X-MailFrom: dgibson@gandalf.ozlabs.org X-Mailman-Rule-Misses: dmarc-mitigation; no-senders; approved; emergency; loop; banned-address; member-moderation; nonmember-moderation; administrivia; implicit-dest; max-recipients; max-size; news-moderation; no-subject; digests; suspicious-header CC: David Gibson X-Mailman-Version: 3.3.8 Precedence: list List-Id: Development discussion and patches for passt Archived-At: Archived-At: List-Archive: List-Archive: List-Help: List-Owner: List-Post: List-Subscribe: List-Unsubscribe: tcp_tap_conn has several fields to track addresses and ports as seen by the guest/namespace. However we now have general fields for this in the common flowside struct. Use those instead of protocol specific fields. So far we've only explicitly tracked the guest side forwarding address in the TCP connection - the remote address from the guest's point of view. The tap side endpoint address - the local address from the guest's point of view - was assumed to always be one of the handful of guest addresses we track as addr_seen (one each for IPv4, IPv6 global and IPv6 link-local). struct flowside expects both addresses, and we will want to use the endpoint address in future. So, we need to add code to determine that address and record it in the flowside. Signed-off-by: David Gibson --- flow.h | 20 +++++++++++++++ tcp.c | 75 ++++++++++++++++++++++++++++-------------------------- tcp_conn.h | 8 ------ 3 files changed, 59 insertions(+), 44 deletions(-) diff --git a/flow.h b/flow.h index e090ba0..e7126e4 100644 --- a/flow.h +++ b/flow.h @@ -45,6 +45,26 @@ struct flowside { static_assert(_Alignof(struct flowside) == _Alignof(uint32_t), "Unexpected alignment for struct flowside"); +/** flowside_from_af - Initialize flowside from addresses + * @fside: flowside to initialize + * @pif: pif if of this flowside + * @af: Address family (AF_INET or AF_INET6) + * @faddr: Forwarding address (pointer to in_addr or in6_addr) + * @fport: Forwarding port + * @eaddr: Endpoint address (pointer to in_addr or in6_addr) + * @eport: Endpoint port + */ +static inline void flowside_from_af(struct flowside *fside, uint8_t pif, int af, + const void *faddr, in_port_t fport, + const void *eaddr, in_port_t eport) +{ + fside->pif = pif; + inany_from_af(&fside->faddr, af, faddr); + inany_from_af(&fside->eaddr, af, eaddr); + fside->fport = fport; + fside->eport = eport; +} + /** flowside_complete - Check if flowside is fully initialized * @fside: flowside to check */ diff --git a/tcp.c b/tcp.c index ddabc34..7ef20b1 100644 --- a/tcp.c +++ b/tcp.c @@ -394,7 +394,9 @@ struct tcp6_l2_head { /* For MSS6 macro: keep in sync with tcp6_l2_buf_t */ #define OPT_SACK 5 #define OPT_TS 8 -#define CONN_V4(conn) (!!inany_v4(&(conn)->faddr)) +#define TAPFSIDE(conn) (&(conn)->f.side[TAPSIDE]) + +#define CONN_V4(conn) (!!inany_v4(&TAPFSIDE(conn)->faddr)) #define CONN_V6(conn) (!CONN_V4(conn)) #define CONN_IS_CLOSING(conn) \ ((conn->events & ESTABLISHED) && \ @@ -845,7 +847,7 @@ static int tcp_rtt_dst_low(const struct tcp_tap_conn *conn) int i; for (i = 0; i < LOW_RTT_TABLE_SIZE; i++) - if (inany_equals(&conn->faddr, low_rtt_dst + i)) + if (inany_equals(&TAPFSIDE(conn)->faddr, low_rtt_dst + i)) return 1; return 0; @@ -867,7 +869,7 @@ static void tcp_rtt_dst_check(const struct tcp_tap_conn *conn, return; for (i = 0; i < LOW_RTT_TABLE_SIZE; i++) { - if (inany_equals(&conn->faddr, low_rtt_dst + i)) + if (inany_equals(&TAPFSIDE(conn)->faddr, low_rtt_dst + i)) return; if (hole == -1 && IN6_IS_ADDR_UNSPECIFIED(low_rtt_dst + i)) hole = i; @@ -879,7 +881,7 @@ static void tcp_rtt_dst_check(const struct tcp_tap_conn *conn, if (hole == -1) return; - low_rtt_dst[hole++] = conn->faddr; + low_rtt_dst[hole++] = TAPFSIDE(conn)->faddr; if (hole == LOW_RTT_TABLE_SIZE) hole = 0; inany_from_af(low_rtt_dst + hole, AF_INET6, &in6addr_any); @@ -1144,8 +1146,8 @@ static int tcp_hash_match(const struct tcp_tap_conn *conn, const union inany_addr *faddr, in_port_t eport, in_port_t fport) { - if (inany_equals(&conn->faddr, faddr) && - conn->eport == eport && conn->fport == fport) + if (inany_equals(&TAPFSIDE(conn)->faddr, faddr) && + TAPFSIDE(conn)->eport == eport && TAPFSIDE(conn)->fport == fport) return 1; return 0; @@ -1179,7 +1181,8 @@ static uint64_t tcp_hash(const struct ctx *c, const union inany_addr *faddr, static uint64_t tcp_conn_hash(const struct ctx *c, const struct tcp_tap_conn *conn) { - return tcp_hash(c, &conn->faddr, conn->eport, conn->fport); + return tcp_hash(c, &TAPFSIDE(conn)->faddr, + TAPFSIDE(conn)->eport, TAPFSIDE(conn)->fport); } /** @@ -1365,13 +1368,13 @@ static size_t tcp_l2_buf_fill_headers(const struct ctx *c, void *p, size_t plen, const uint16_t *check, uint32_t seq) { - const struct in_addr *a4 = inany_v4(&conn->faddr); + const struct in_addr *a4 = inany_v4(&TAPFSIDE(conn)->faddr); size_t ip_len, tlen; #define SET_TCP_HEADER_COMMON_V4_V6(b, conn, seq) \ do { \ - b->th.source = htons(conn->fport); \ - b->th.dest = htons(conn->eport); \ + b->th.source = htons(TAPFSIDE(conn)->fport); \ + b->th.dest = htons(TAPFSIDE(conn)->eport); \ b->th.seq = htonl(seq); \ b->th.ack_seq = htonl(conn->seq_ack_to_tap); \ if (conn->events & ESTABLISHED) { \ @@ -1389,7 +1392,7 @@ do { \ ip_len = plen + sizeof(struct iphdr) + sizeof(struct tcphdr); b->iph.tot_len = htons(ip_len); b->iph.saddr = a4->s_addr; - b->iph.daddr = c->ip4.addr_seen.s_addr; + b->iph.daddr = inany_v4(&TAPFSIDE(conn)->eaddr)->s_addr; if (check) b->iph.check = *check; @@ -1407,11 +1410,8 @@ do { \ ip_len = plen + sizeof(struct ipv6hdr) + sizeof(struct tcphdr); b->ip6h.payload_len = htons(plen + sizeof(struct tcphdr)); - b->ip6h.saddr = conn->faddr.a6; - if (IN6_IS_ADDR_LINKLOCAL(&b->ip6h.saddr)) - b->ip6h.daddr = c->ip6.addr_ll_seen; - else - b->ip6h.daddr = c->ip6.addr_seen; + b->ip6h.saddr = TAPFSIDE(conn)->faddr.a6; + b->ip6h.daddr = TAPFSIDE(conn)->eaddr.a6; memset(b->ip6h.flow_lbl, 0, 3); @@ -1739,26 +1739,21 @@ static void tcp_tap_window_update(struct tcp_tap_conn *conn, unsigned wnd) /** * tcp_seq_init() - Calculate initial sequence number according to RFC 6528 * @c: Execution context - * @conn: TCP connection, with faddr, fport and eport populated + * @conn: TCP connection, with faddr, fport, eaddr, eport populated * @now: Current timestamp */ static void tcp_seq_init(const struct ctx *c, struct tcp_tap_conn *conn, const struct timespec *now) { struct siphash_state state = SIPHASH_INIT(c->hash_secret); - union inany_addr aany; uint64_t hash; uint32_t ns; - if (CONN_V4(conn)) - inany_from_af(&aany, AF_INET, &c->ip4.addr); - else - inany_from_af(&aany, AF_INET6, &c->ip6.addr); - - inany_siphash_feed(&state, &conn->faddr); - inany_siphash_feed(&state, &aany); + inany_siphash_feed(&state, &TAPFSIDE(conn)->faddr); + inany_siphash_feed(&state, &TAPFSIDE(conn)->eaddr); hash = siphash_final(&state, 36, - (uint64_t)conn->fport << 16 | conn->eport); + (uint64_t)TAPFSIDE(conn)->fport << 16 | + TAPFSIDE(conn)->eport); /* 32ns ticks, overflows 32 bits every 137s */ ns = (now->tv_sec * 1000000000 + now->tv_nsec) >> 5; @@ -1925,8 +1920,6 @@ static void tcp_conn_from_tap(struct ctx *c, socklen_t sl; int s, mss; - (void)saddr; - if (!(flow = flow_alloc())) return; @@ -1953,6 +1946,10 @@ static void tcp_conn_from_tap(struct ctx *c, conn = &flow->tcp; conn->f.type = FLOW_TCP; + flowside_from_af(TAPFSIDE(conn), PIF_TAP, af, + daddr, ntohs(th->dest), saddr, ntohs(th->source)); + ASSERT(flowside_complete(TAPFSIDE(conn))); + conn->sock = s; conn->timer = -1; conn_event(c, conn, TAP_SYN_RCVD); @@ -1972,8 +1969,6 @@ static void tcp_conn_from_tap(struct ctx *c, if (!(conn->wnd_from_tap = (htons(th->window) >> conn->ws_from_tap))) conn->wnd_from_tap = 1; - inany_from_af(&conn->faddr, af, daddr); - if (af == AF_INET) { sa = (struct sockaddr *)&addr4; sl = sizeof(addr4); @@ -1982,9 +1977,6 @@ static void tcp_conn_from_tap(struct ctx *c, sl = sizeof(addr6); } - conn->fport = ntohs(th->dest); - conn->eport = ntohs(th->source); - conn->seq_init_from_tap = ntohl(th->seq); conn->seq_from_tap = conn->seq_init_from_tap + 1; conn->seq_ack_to_tap = conn->seq_from_tap; @@ -2674,10 +2666,21 @@ static void tcp_tap_conn_from_sock(struct ctx *c, conn->ws_to_tap = conn->ws_from_tap = 0; conn_event(c, conn, SOCK_ACCEPTED); - inany_from_sockaddr(&conn->faddr, &conn->fport, sa); - conn->eport = ref.port; + TAPFSIDE(conn)->pif = PIF_HOST; + inany_from_sockaddr(&TAPFSIDE(conn)->faddr, &TAPFSIDE(conn)->fport, sa); + tcp_snat_inbound(c, &TAPFSIDE(conn)->faddr); + + if (CONN_V4(conn)) { + inany_from_af(&TAPFSIDE(conn)->eaddr, AF_INET, &c->ip4.addr_seen); + } else { + if (IN6_IS_ADDR_LINKLOCAL(&TAPFSIDE(conn)->faddr.a6)) + TAPFSIDE(conn)->eaddr.a6 = c->ip6.addr_ll_seen; + else + TAPFSIDE(conn)->eaddr.a6 = c->ip6.addr_seen; + } + TAPFSIDE(conn)->eport = ref.port; - tcp_snat_inbound(c, &conn->faddr); + ASSERT(flowside_complete(TAPFSIDE(conn))); tcp_seq_init(c, conn, now); tcp_hash_insert(c, conn); diff --git a/tcp_conn.h b/tcp_conn.h index 1a4dcf2..8d9c7bd 100644 --- a/tcp_conn.h +++ b/tcp_conn.h @@ -23,9 +23,6 @@ * @ws_to_tap: Window scaling factor advertised to tap/guest * @sndbuf: Sending buffer in kernel, rounded to 2 ^ SNDBUF_BITS * @seq_dup_ack_approx: Last duplicate ACK number sent to tap - * @faddr: Guest side forwarding address (guest's remote address) - * @eport: Guest side endpoint port (guest's local port) - * @fport: Guest side forwarding port (guest's remote port) * @wnd_from_tap: Last window size from tap, unscaled (as received) * @wnd_to_tap: Sending window advertised to tap, unscaled (as sent) * @seq_to_tap: Next sequence for packets to tap @@ -91,11 +88,6 @@ struct tcp_tap_conn { uint8_t seq_dup_ack_approx; - - union inany_addr faddr; - in_port_t eport; - in_port_t fport; - uint16_t wnd_from_tap; uint16_t wnd_to_tap; -- 2.43.0