From: David Gibson <david@gibson.dropbear.id.au>
To: passt-dev@passt.top, Stefano Brivio <sbrivio@redhat.com>
Cc: David Gibson <david@gibson.dropbear.id.au>
Subject: [PATCH v4 02/16] tcp: Maintain flowside information for "tap" connections
Date: Fri, 3 May 2024 11:11:21 +1000 [thread overview]
Message-ID: <20240503011135.2924437-3-david@gibson.dropbear.id.au> (raw)
In-Reply-To: <20240503011135.2924437-1-david@gibson.dropbear.id.au>
tcp_tap_conn has several fields to track addresses and ports as seen
by the guest/namespace. We now have general fields for this in the
common flowside struct, so use those instead of protocol-specific
fields. The flowside also has space for the guest-side endpoint
address (the local address from the guest's point of view), so we
fill that in as well.
We didn't previously store equivalent information for the connection
as it appears to the host; that was implicit in the state of the
host-side socket. For future generalisations of flow/connection
tracking, we're going to need that information, so populate the other
flowside in each flow table entry with as much of this information as
we can easily obtain. For connections initiated by the guest, that's
the endpoint address and port. To get the forwarding address and port
we'd need to call getsockname() in general, so leave those blank for
now. For connections initiated from outside, we have the endpoint
address and port from accept(). We also have the forwarding port from
the epoll ref, but we leave the forwarding address blank.
For now we just fill the information in without really using it for
anything.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
flow.h | 1 -
tcp.c | 88 +++++++++++++++++++++++++++++++++++++-----------------
tcp_conn.h | 8 -----
3 files changed, 60 insertions(+), 37 deletions(-)
diff --git a/flow.h b/flow.h
index f7fb537..88caa76 100644
--- a/flow.h
+++ b/flow.h
@@ -85,7 +85,6 @@ static inline void flowside_from_inany(struct flowside *fside, uint8_t pif,
* If NULL is given for either address, the appropriate unspecified/any address
* for the address family is substituted.
*/
-/* cppcheck-suppress unusedFunction */
static inline void flowside_from_af(struct flowside *fside,
uint8_t pif, sa_family_t af,
const void *faddr, in_port_t fport,
diff --git a/tcp.c b/tcp.c
index 21d0af0..1835b86 100644
--- a/tcp.c
+++ b/tcp.c
@@ -372,7 +372,7 @@
#define OPT_SACK 5
#define OPT_TS 8
-#define CONN_V4(conn) (!!inany_v4(&(conn)->faddr))
+#define CONN_V4(conn) (!!inany_v4(&conn->f.side[TAPSIDE].faddr))
#define CONN_V6(conn) (!CONN_V4(conn))
#define CONN_IS_CLOSING(conn) \
((conn->events & ESTABLISHED) && \
@@ -795,10 +795,11 @@ static void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn,
*/
static int tcp_rtt_dst_low(const struct tcp_tap_conn *conn)
{
+ const struct flowside *tapside = &conn->f.side[TAPSIDE];
int i;
for (i = 0; i < LOW_RTT_TABLE_SIZE; i++)
- if (inany_equals(&conn->faddr, low_rtt_dst + i))
+ if (inany_equals(&tapside->faddr, low_rtt_dst + i))
return 1;
return 0;
@@ -813,6 +814,7 @@ static void tcp_rtt_dst_check(const struct tcp_tap_conn *conn,
const struct tcp_info *tinfo)
{
#ifdef HAS_MIN_RTT
+ const struct flowside *tapside = &conn->f.side[TAPSIDE];
int i, hole = -1;
if (!tinfo->tcpi_min_rtt ||
@@ -820,7 +822,7 @@ static void tcp_rtt_dst_check(const struct tcp_tap_conn *conn,
return;
for (i = 0; i < LOW_RTT_TABLE_SIZE; i++) {
- if (inany_equals(&conn->faddr, low_rtt_dst + i))
+ if (inany_equals(&tapside->faddr, low_rtt_dst + i))
return;
if (hole == -1 && IN6_IS_ADDR_UNSPECIFIED(low_rtt_dst + i))
hole = i;
@@ -832,7 +834,7 @@ static void tcp_rtt_dst_check(const struct tcp_tap_conn *conn,
if (hole == -1)
return;
- low_rtt_dst[hole++] = conn->faddr;
+ low_rtt_dst[hole++] = tapside->faddr;
if (hole == LOW_RTT_TABLE_SIZE)
hole = 0;
inany_from_af(low_rtt_dst + hole, AF_INET6, &in6addr_any);
@@ -1085,8 +1087,10 @@ static int tcp_hash_match(const struct tcp_tap_conn *conn,
const union inany_addr *faddr,
in_port_t eport, in_port_t fport)
{
- if (inany_equals(&conn->faddr, faddr) &&
- conn->eport == eport && conn->fport == fport)
+ const struct flowside *tapside = &conn->f.side[TAPSIDE];
+
+ if (inany_equals(&tapside->faddr, faddr) &&
+ tapside->eport == eport && tapside->fport == fport)
return 1;
return 0;
@@ -1120,7 +1124,9 @@ static uint64_t tcp_hash(const struct ctx *c, const union inany_addr *faddr,
static uint64_t tcp_conn_hash(const struct ctx *c,
const struct tcp_tap_conn *conn)
{
- return tcp_hash(c, &conn->faddr, conn->eport, conn->fport);
+ const struct flowside *tapside = &conn->f.side[TAPSIDE];
+
+ return tcp_hash(c, &tapside->faddr, tapside->eport, tapside->fport);
}
/**
@@ -1302,10 +1308,12 @@ void tcp_defer_handler(struct ctx *c)
* @seq: Sequence number
*/
static void tcp_fill_header(struct tcphdr *th,
- const struct tcp_tap_conn *conn, uint32_t seq)
+ const struct tcp_tap_conn *conn, uint32_t seq)
{
- th->source = htons(conn->fport);
- th->dest = htons(conn->eport);
+ const struct flowside *tapside = &conn->f.side[TAPSIDE];
+
+ th->source = htons(tapside->fport);
+ th->dest = htons(tapside->eport);
th->seq = htonl(seq);
th->ack_seq = htonl(conn->seq_ack_to_tap);
if (conn->events & ESTABLISHED) {
@@ -1337,7 +1345,8 @@ static size_t tcp_fill_headers4(const struct ctx *c,
size_t dlen, const uint16_t *check,
uint32_t seq)
{
- const struct in_addr *a4 = inany_v4(&conn->faddr);
+ const struct flowside *tapside = &conn->f.side[TAPSIDE];
+ const struct in_addr *a4 = inany_v4(&tapside->faddr);
size_t l4len = dlen + sizeof(*th);
size_t l3len = l4len + sizeof(*iph);
@@ -1379,10 +1388,11 @@ static size_t tcp_fill_headers6(const struct ctx *c,
struct ipv6hdr *ip6h, struct tcphdr *th,
size_t dlen, uint32_t seq)
{
+ const struct flowside *tapside = &conn->f.side[TAPSIDE];
size_t l4len = dlen + sizeof(*th);
ip6h->payload_len = htons(l4len);
- ip6h->saddr = conn->faddr.a6;
+ ip6h->saddr = tapside->faddr.a6;
if (IN6_IS_ADDR_LINKLOCAL(&ip6h->saddr))
ip6h->daddr = c->ip6.addr_ll_seen;
else
@@ -1421,9 +1431,7 @@ static size_t tcp_l2_buf_fill_headers(const struct ctx *c,
struct iovec *iov, size_t dlen,
const uint16_t *check, uint32_t seq)
{
- const struct in_addr *a4 = inany_v4(&conn->faddr);
-
- if (a4) {
+ if (CONN_V4(conn)) {
return tcp_fill_headers4(c, conn, iov[TCP_IOV_TAP].iov_base,
iov[TCP_IOV_IP].iov_base,
iov[TCP_IOV_PAYLOAD].iov_base, dlen,
@@ -1738,7 +1746,7 @@ static void tcp_tap_window_update(struct tcp_tap_conn *conn, unsigned wnd)
/**
* tcp_seq_init() - Calculate initial sequence number according to RFC 6528
* @c: Execution context
- * @conn: TCP connection, with faddr, fport and eport populated
+ * @conn: TCP connection, with tap flowside faddr, fport and eport populated
* @now: Current timestamp
*/
static void tcp_seq_init(const struct ctx *c, struct tcp_tap_conn *conn,
@@ -1746,6 +1754,7 @@ static void tcp_seq_init(const struct ctx *c, struct tcp_tap_conn *conn,
{
struct siphash_state state = SIPHASH_INIT(c->hash_secret);
union inany_addr aany;
+ const struct flowside *tapside = &conn->f.side[TAPSIDE];
uint64_t hash;
uint32_t ns;
@@ -1754,10 +1763,10 @@ static void tcp_seq_init(const struct ctx *c, struct tcp_tap_conn *conn,
else
inany_from_af(&aany, AF_INET6, &c->ip6.addr);
- inany_siphash_feed(&state, &conn->faddr);
+ inany_siphash_feed(&state, &tapside->faddr);
inany_siphash_feed(&state, &aany);
hash = siphash_final(&state, 36,
- (uint64_t)conn->fport << 16 | conn->eport);
+ (uint64_t)tapside->fport << 16 | tapside->eport);
/* 32ns ticks, overflows 32 bits every 137s */
ns = (now->tv_sec * 1000000000 + now->tv_nsec) >> 5;
@@ -1945,6 +1954,7 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af,
.sin6_port = htons(dstport),
.sin6_addr = *(struct in6_addr *)daddr,
};
+ struct flowside *tapside, *sockside;
const struct sockaddr *sa;
struct tcp_tap_conn *conn;
union flow *flow;
@@ -1954,6 +1964,11 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af,
if (!(flow = flow_alloc()))
return;
+ tapside = &flow->f.side[TAPSIDE];
+ sockside = &flow->f.side[SOCKSIDE];
+
+ flowside_from_af(tapside, PIF_TAP, af, daddr, dstport, saddr, srcport);
+
if (af == AF_INET) {
if (IN4_IS_ADDR_UNSPECIFIED(saddr) ||
IN4_IS_ADDR_BROADCAST(saddr) ||
@@ -2026,19 +2041,19 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af,
if (!(conn->wnd_from_tap = (htons(th->window) >> conn->ws_from_tap)))
conn->wnd_from_tap = 1;
- inany_from_af(&conn->faddr, af, daddr);
+ sockside->pif = PIF_HOST;
+ sockside->eport = dstport;
if (af == AF_INET) {
+ inany_from_af(&sockside->eaddr, AF_INET, &addr4.sin_addr);
sa = (struct sockaddr *)&addr4;
sl = sizeof(addr4);
} else {
+ inany_from_af(&sockside->eaddr, AF_INET6, &addr6.sin6_addr);
sa = (struct sockaddr *)&addr6;
sl = sizeof(addr6);
}
- conn->fport = dstport;
- conn->eport = srcport;
-
conn->seq_init_from_tap = ntohl(th->seq);
conn->seq_from_tap = conn->seq_init_from_tap + 1;
conn->seq_ack_to_tap = conn->seq_from_tap;
@@ -2724,18 +2739,35 @@ static void tcp_tap_conn_from_sock(struct ctx *c, in_port_t dstport,
const union sockaddr_inany *sa,
const struct timespec *now)
{
- struct tcp_tap_conn *conn = FLOW_START(flow, FLOW_TCP, tcp, SOCKSIDE);
+ struct flowside *sockside = &flow->f.side[SOCKSIDE];
+ struct flowside *tapside = &flow->f.side[TAPSIDE];
+ struct tcp_tap_conn *conn;
+
+ sockside->pif = PIF_HOST;
+ inany_from_sockaddr(&sockside->eaddr, &sockside->eport, sa);
+ sockside->fport = dstport;
+
+ tapside->pif = PIF_TAP;
+ tapside->faddr = sockside->eaddr;
+ tapside->fport = sockside->eport;
+ tcp_snat_inbound(c, &tapside->faddr);
+ if (CONN_V4(flow)) {
+ inany_from_af(&tapside->eaddr, AF_INET, &c->ip4.addr_seen);
+ } else {
+ if (IN6_IS_ADDR_LINKLOCAL(&tapside->faddr.a6))
+ tapside->eaddr.a6 = c->ip6.addr_ll_seen;
+ else
+ tapside->eaddr.a6 = c->ip6.addr_seen;
+ }
+ tapside->eport = dstport + c->tcp.fwd_in.delta[dstport];
+
+ conn = FLOW_START(flow, FLOW_TCP, tcp, SOCKSIDE);
conn->sock = s;
conn->timer = -1;
conn->ws_to_tap = conn->ws_from_tap = 0;
conn_event(c, conn, SOCK_ACCEPTED);
- inany_from_sockaddr(&conn->faddr, &conn->fport, sa);
- conn->eport = dstport + c->tcp.fwd_in.delta[dstport];
-
- tcp_snat_inbound(c, &conn->faddr);
-
tcp_seq_init(c, conn, now);
tcp_hash_insert(c, conn);
diff --git a/tcp_conn.h b/tcp_conn.h
index 1a07dd5..f55f144 100644
--- a/tcp_conn.h
+++ b/tcp_conn.h
@@ -23,9 +23,6 @@
* @ws_to_tap: Window scaling factor advertised to tap/guest
* @sndbuf: Sending buffer in kernel, rounded to 2 ^ SNDBUF_BITS
* @seq_dup_ack_approx: Last duplicate ACK number sent to tap
- * @faddr: Guest side forwarding address (guest's remote address)
- * @eport: Guest side endpoint port (guest's local port)
- * @fport: Guest side forwarding port (guest's remote port)
* @wnd_from_tap: Last window size from tap, unscaled (as received)
* @wnd_to_tap: Sending window advertised to tap, unscaled (as sent)
* @seq_to_tap: Next sequence for packets to tap
@@ -91,11 +88,6 @@ struct tcp_tap_conn {
uint8_t seq_dup_ack_approx;
-
- union inany_addr faddr;
- in_port_t eport;
- in_port_t fport;
-
uint16_t wnd_from_tap;
uint16_t wnd_to_tap;
--
@@ -23,9 +23,6 @@
* @ws_to_tap: Window scaling factor advertised to tap/guest
* @sndbuf: Sending buffer in kernel, rounded to 2 ^ SNDBUF_BITS
* @seq_dup_ack_approx: Last duplicate ACK number sent to tap
- * @faddr: Guest side forwarding address (guest's remote address)
- * @eport: Guest side endpoint port (guest's local port)
- * @fport: Guest side forwarding port (guest's remote port)
* @wnd_from_tap: Last window size from tap, unscaled (as received)
* @wnd_to_tap: Sending window advertised to tap, unscaled (as sent)
* @seq_to_tap: Next sequence for packets to tap
@@ -91,11 +88,6 @@ struct tcp_tap_conn {
uint8_t seq_dup_ack_approx;
-
- union inany_addr faddr;
- in_port_t eport;
- in_port_t fport;
-
uint16_t wnd_from_tap;
uint16_t wnd_to_tap;
--
2.44.0
next prev parent reply other threads:[~2024-05-03 1:11 UTC|newest]
Thread overview: 23+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-05-03 1:11 [PATCH v4 00/16] RFC: Unified flow table David Gibson
2024-05-03 1:11 ` [PATCH v4 01/16] flow: Common data structures for tracking flow addresses David Gibson
2024-05-13 18:07 ` Stefano Brivio
2024-05-14 0:11 ` David Gibson
2024-05-03 1:11 ` David Gibson [this message]
2024-05-13 18:07 ` [PATCH v4 02/16] tcp: Maintain flowside information for "tap" connections Stefano Brivio
2024-05-14 0:15 ` David Gibson
2024-05-03 1:11 ` [PATCH v4 03/16] tcp_splice: Maintain flowside information for spliced connections David Gibson
2024-05-03 1:11 ` [PATCH v4 04/16] tcp: Obtain guest address from flowside David Gibson
2024-05-13 18:07 ` Stefano Brivio
2024-05-14 0:18 ` David Gibson
2024-05-03 1:11 ` [PATCH v4 05/16] tcp: Simplify endpoint validation using flowside information David Gibson
2024-05-03 1:11 ` [PATCH v4 06/16] tcp, tcp_splice: Construct sockaddrs for connect() from flowside David Gibson
2024-05-03 1:11 ` [PATCH v4 07/16] tcp_splice: Eliminate SPLICE_V6 flag David Gibson
2024-05-03 1:11 ` [PATCH v4 08/16] tcp, flow: Replace TCP specific hash function with general flow hash David Gibson
2024-05-03 1:11 ` [PATCH v4 09/16] flow, tcp: Generalise TCP hash table to general flow hash table David Gibson
2024-05-03 1:11 ` [PATCH v4 10/16] tcp: Re-use flow hash for initial sequence number generation David Gibson
2024-05-03 1:11 ` [PATCH v4 11/16] icmp: Populate flowside information David Gibson
2024-05-03 1:11 ` [PATCH v4 12/16] icmp: Use flowsides as the source of truth wherever possible David Gibson
2024-05-03 1:11 ` [PATCH v4 13/16] icmp: Look up ping flows using flow hash David Gibson
2024-05-03 1:11 ` [PATCH v4 14/16] icmp: Eliminate icmp_id_map David Gibson
2024-05-03 1:11 ` [PATCH v4 15/16] flow, tcp: flow based NAT and port forwarding for TCP David Gibson
2024-05-03 1:11 ` [PATCH v4 16/16] flow, icmp: Use general flow forwarding rules for ICMP David Gibson
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20240503011135.2924437-3-david@gibson.dropbear.id.au \
--to=david@gibson.dropbear.id.au \
--cc=passt-dev@passt.top \
--cc=sbrivio@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
Code repositories for project(s) associated with this public inbox
https://passt.top/passt
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for IMAP folder(s).