public inbox for passt-dev@passt.top
 help / color / mirror / code / Atom feed
From: David Gibson <david@gibson.dropbear.id.au>
To: Stefano Brivio <sbrivio@redhat.com>
Cc: passt-dev@passt.top, jmaloy@redhat.com
Subject: Re: [PATCH v6 03/26] tcp, flow: Remove redundant information, repack connection structures
Date: Wed, 26 Jun 2024 10:23:49 +1000	[thread overview]
Message-ID: <ZntflZhjiLq8zSL4@zatzit> (raw)
In-Reply-To: <20240626002505.12f2b3b2@elisabeth>

[-- Attachment #1: Type: text/plain, Size: 12571 bytes --]

On Wed, Jun 26, 2024 at 12:25:05AM +0200, Stefano Brivio wrote:
> On Fri, 14 Jun 2024 16:13:25 +1000
> David Gibson <david@gibson.dropbear.id.au> wrote:
> 
> > Some information we explicitly store in the TCP connection is now
> > duplicated in the common flow structure.  Access it from there instead, and
> > remove it from the TCP-specific structure.  With that done, we can reorder
> > both the "tap" and "splice" TCP structures a bit to get better packing for
> > the new combined flow table entries.
> > 
> > Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
> > ---
> >  tcp.c          | 52 ++++++++++++++++++++++++++------------------------
> >  tcp_conn.h     | 40 +++++++++++++++-----------------------
> >  tcp_internal.h |  6 +++++-
> >  3 files changed, 47 insertions(+), 51 deletions(-)
> > 
> > diff --git a/tcp.c b/tcp.c
> > index c6cd0c72..30ad3dd4 100644
> > --- a/tcp.c
> > +++ b/tcp.c
> > @@ -333,8 +333,6 @@
> >  
> >  #define ACK_IF_NEEDED	0		/* See tcp_send_flag() */
> >  
> > -#define TAPSIDE(conn_)	((conn_)->f.pif[1] == PIF_TAP)
> > -
> >  #define CONN_IS_CLOSING(conn)						\
> >  	(((conn)->events & ESTABLISHED) &&				\
> >  	 ((conn)->events & (SOCK_FIN_RCVD | TAP_FIN_RCVD)))
> > @@ -635,10 +633,11 @@ void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn,
> >   */
> >  static int tcp_rtt_dst_low(const struct tcp_tap_conn *conn)
> >  {
> > +	const struct flowside *tapside = TAPFLOW(conn);
> >  	int i;
> >  
> >  	for (i = 0; i < LOW_RTT_TABLE_SIZE; i++)
> > -		if (inany_equals(&conn->faddr, low_rtt_dst + i))
> > +		if (inany_equals(&tapside->faddr, low_rtt_dst + i))
> >  			return 1;
> >  
> >  	return 0;
> > @@ -653,6 +652,7 @@ static void tcp_rtt_dst_check(const struct tcp_tap_conn *conn,
> >  			      const struct tcp_info *tinfo)
> >  {
> >  #ifdef HAS_MIN_RTT
> > +	const struct flowside *tapside = TAPFLOW(conn);
> >  	int i, hole = -1;
> >  
> >  	if (!tinfo->tcpi_min_rtt ||
> > @@ -660,7 +660,7 @@ static void tcp_rtt_dst_check(const struct tcp_tap_conn *conn,
> >  		return;
> >  
> >  	for (i = 0; i < LOW_RTT_TABLE_SIZE; i++) {
> > -		if (inany_equals(&conn->faddr, low_rtt_dst + i))
> > +		if (inany_equals(&tapside->faddr, low_rtt_dst + i))
> >  			return;
> >  		if (hole == -1 && IN6_IS_ADDR_UNSPECIFIED(low_rtt_dst + i))
> >  			hole = i;
> > @@ -672,7 +672,7 @@ static void tcp_rtt_dst_check(const struct tcp_tap_conn *conn,
> >  	if (hole == -1)
> >  		return;
> >  
> > -	low_rtt_dst[hole++] = conn->faddr;
> > +	low_rtt_dst[hole++] = tapside->faddr;
> >  	if (hole == LOW_RTT_TABLE_SIZE)
> >  		hole = 0;
> >  	inany_from_af(low_rtt_dst + hole, AF_INET6, &in6addr_any);
> > @@ -827,8 +827,10 @@ static int tcp_hash_match(const struct tcp_tap_conn *conn,
> >  			  const union inany_addr *faddr,
> >  			  in_port_t eport, in_port_t fport)
> >  {
> > -	if (inany_equals(&conn->faddr, faddr) &&
> > -	    conn->eport == eport && conn->fport == fport)
> > +	const struct flowside *tapside = TAPFLOW(conn);
> > +
> > +	if (inany_equals(&tapside->faddr, faddr) &&
> > +	    tapside->eport == eport && tapside->fport == fport)
> >  		return 1;
> >  
> >  	return 0;
> > @@ -862,7 +864,10 @@ static uint64_t tcp_hash(const struct ctx *c, const union inany_addr *faddr,
> >  static uint64_t tcp_conn_hash(const struct ctx *c,
> >  			      const struct tcp_tap_conn *conn)
> >  {
> > -	return tcp_hash(c, &conn->faddr, conn->eport, conn->fport);
> > +	const struct flowside *tapside = TAPFLOW(conn);
> > +
> > +	return tcp_hash(c, &tapside->faddr, tapside->eport,
> > +			tapside->fport);
> >  }
> >  
> >  /**
> > @@ -998,10 +1003,12 @@ void tcp_defer_handler(struct ctx *c)
> >   * @seq:	Sequence number
> >   */
> >  static void tcp_fill_header(struct tcphdr *th,
> > -			       const struct tcp_tap_conn *conn, uint32_t seq)
> > +			    const struct tcp_tap_conn *conn, uint32_t seq)
> >  {
> > -	th->source = htons(conn->fport);
> > -	th->dest = htons(conn->eport);
> > +	const struct flowside *tapside = TAPFLOW(conn);
> > +
> > +	th->source = htons(tapside->fport);
> > +	th->dest = htons(tapside->eport);
> >  	th->seq = htonl(seq);
> >  	th->ack_seq = htonl(conn->seq_ack_to_tap);
> >  	if (conn->events & ESTABLISHED)	{
> > @@ -1033,7 +1040,8 @@ static size_t tcp_fill_headers4(const struct ctx *c,
> >  				size_t dlen, const uint16_t *check,
> >  				uint32_t seq)
> >  {
> > -	const struct in_addr *a4 = inany_v4(&conn->faddr);
> > +	const struct flowside *tapside = TAPFLOW(conn);
> > +	const struct in_addr *a4 = inany_v4(&tapside->faddr);
> >  	size_t l4len = dlen + sizeof(*th);
> >  	size_t l3len = l4len + sizeof(*iph);
> >  
> > @@ -1075,10 +1083,11 @@ static size_t tcp_fill_headers6(const struct ctx *c,
> >  				struct ipv6hdr *ip6h, struct tcphdr *th,
> >  				size_t dlen, uint32_t seq)
> >  {
> > +	const struct flowside *tapside = TAPFLOW(conn);
> >  	size_t l4len = dlen + sizeof(*th);
> >  
> >  	ip6h->payload_len = htons(l4len);
> > -	ip6h->saddr = conn->faddr.a6;
> > +	ip6h->saddr = tapside->faddr.a6;
> >  	if (IN6_IS_ADDR_LINKLOCAL(&ip6h->saddr))
> >  		ip6h->daddr = c->ip6.addr_ll_seen;
> >  	else
> > @@ -1117,7 +1126,8 @@ size_t tcp_l2_buf_fill_headers(const struct ctx *c,
> >  			       struct iovec *iov, size_t dlen,
> >  			       const uint16_t *check, uint32_t seq)
> >  {
> > -	const struct in_addr *a4 = inany_v4(&conn->faddr);
> > +	const struct flowside *tapside = TAPFLOW(conn);
> > +	const struct in_addr *a4 = inany_v4(&tapside->faddr);
> >  
> >  	if (a4) {
> >  		return tcp_fill_headers4(c, conn, iov[TCP_IOV_TAP].iov_base,
> > @@ -1420,6 +1430,7 @@ static void tcp_seq_init(const struct ctx *c, struct tcp_tap_conn *conn,
> >  			 const struct timespec *now)
> >  {
> >  	struct siphash_state state = SIPHASH_INIT(c->hash_secret);
> > +	const struct flowside *tapside = TAPFLOW(conn);
> >  	union inany_addr aany;
> >  	uint64_t hash;
> >  	uint32_t ns;
> > @@ -1429,10 +1440,10 @@ static void tcp_seq_init(const struct ctx *c, struct tcp_tap_conn *conn,
> >  	else
> >  		inany_from_af(&aany, AF_INET6, &c->ip6.addr);
> >  
> > -	inany_siphash_feed(&state, &conn->faddr);
> > +	inany_siphash_feed(&state, &tapside->faddr);
> >  	inany_siphash_feed(&state, &aany);
> >  	hash = siphash_final(&state, 36,
> > -			     (uint64_t)conn->fport << 16 | conn->eport);
> > +			     (uint64_t)tapside->fport << 16 | tapside->eport);
> >  
> >  	/* 32ns ticks, overflows 32 bits every 137s */
> >  	ns = (now->tv_sec * 1000000000 + now->tv_nsec) >> 5;
> > @@ -1707,11 +1718,6 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af,
> >  	if (!(conn->wnd_from_tap = (htons(th->window) >> conn->ws_from_tap)))
> >  		conn->wnd_from_tap = 1;
> >  
> > -	inany_from_af(&conn->faddr, af, daddr);
> > -
> > -	conn->fport = dstport;
> > -	conn->eport = srcport;
> > -
> >  	conn->seq_init_from_tap = ntohl(th->seq);
> >  	conn->seq_from_tap = conn->seq_init_from_tap + 1;
> >  	conn->seq_ack_to_tap = conn->seq_from_tap;
> > @@ -2254,10 +2260,6 @@ static void tcp_tap_conn_from_sock(struct ctx *c, in_port_t dstport,
> >  	conn->ws_to_tap = conn->ws_from_tap = 0;
> >  	conn_event(c, conn, SOCK_ACCEPTED);
> >  
> > -	conn->faddr = saddr;
> > -	conn->fport = srcport;
> > -	conn->eport = dstport;
> > -
> >  	tcp_seq_init(c, conn, now);
> >  	tcp_hash_insert(c, conn);
> >  
> > diff --git a/tcp_conn.h b/tcp_conn.h
> > index 5f8c8fb6..b741ce32 100644
> > --- a/tcp_conn.h
> > +++ b/tcp_conn.h
> > @@ -13,19 +13,16 @@
> >   * struct tcp_tap_conn - Descriptor for a TCP connection (not spliced)
> >   * @f:			Generic flow information
> >   * @in_epoll:		Is the connection in the epoll set?
> > + * @retrans:		Number of retransmissions occurred due to ACK_TIMEOUT
> > + * @ws_from_tap:	Window scaling factor advertised from tap/guest
> > + * @ws_to_tap:		Window scaling factor advertised to tap/guest
> >   * @tap_mss:		MSS advertised by tap/guest, rounded to 2 ^ TCP_MSS_BITS
> >   * @sock:		Socket descriptor number
> >   * @events:		Connection events, implying connection states
> >   * @timer:		timerfd descriptor for timeout events
> >   * @flags:		Connection flags representing internal attributes
> > - * @retrans:		Number of retransmissions occurred due to ACK_TIMEOUT
> > - * @ws_from_tap:	Window scaling factor advertised from tap/guest
> > - * @ws_to_tap:		Window scaling factor advertised to tap/guest
> >   * @sndbuf:		Sending buffer in kernel, rounded to 2 ^ SNDBUF_BITS
> >   * @seq_dup_ack_approx:	Last duplicate ACK number sent to tap
> > - * @faddr:		Guest side forwarding address (guest's remote address)
> > - * @eport:		Guest side endpoint port (guest's local port)
> > - * @fport:		Guest side forwarding port (guest's remote port)
> >   * @wnd_from_tap:	Last window size from tap, unscaled (as received)
> >   * @wnd_to_tap:		Sending window advertised to tap, unscaled (as sent)
> >   * @seq_to_tap:		Next sequence for packets to tap
> > @@ -49,6 +46,10 @@ struct tcp_tap_conn {
> >  	unsigned int	ws_from_tap	:TCP_WS_BITS;
> >  	unsigned int	ws_to_tap	:TCP_WS_BITS;
> >  
> > +#define TCP_MSS_BITS			14
> > +	unsigned int	tap_mss		:TCP_MSS_BITS;
> > +#define MSS_SET(conn, mss)	(conn->tap_mss = (mss >> (16 - TCP_MSS_BITS)))
> > +#define MSS_GET(conn)		(conn->tap_mss << (16 - TCP_MSS_BITS))
> >  
> >  	int		sock		:FD_REF_BITS;
> >  
> > @@ -77,13 +78,6 @@ struct tcp_tap_conn {
> >  #define ACK_TO_TAP_DUE		BIT(3)
> >  #define ACK_FROM_TAP_DUE	BIT(4)
> >  
> > -
> > -#define TCP_MSS_BITS			14
> > -	unsigned int	tap_mss		:TCP_MSS_BITS;
> > -#define MSS_SET(conn, mss)	(conn->tap_mss = (mss >> (16 - TCP_MSS_BITS)))
> > -#define MSS_GET(conn)		(conn->tap_mss << (16 - TCP_MSS_BITS))
> > -
> > -
> >  #define SNDBUF_BITS		24
> >  	unsigned int	sndbuf		:SNDBUF_BITS;
> >  #define SNDBUF_SET(conn, bytes)	(conn->sndbuf = ((bytes) >> (32 - SNDBUF_BITS)))
> > @@ -91,11 +85,6 @@ struct tcp_tap_conn {
> >  
> >  	uint8_t		seq_dup_ack_approx;
> >  
> > -
> > -	union inany_addr faddr;
> > -	in_port_t	eport;
> > -	in_port_t	fport;
> > -
> >  	uint16_t	wnd_from_tap;
> >  	uint16_t	wnd_to_tap;
> >  
> > @@ -109,22 +98,24 @@ struct tcp_tap_conn {
> >  /**
> >   * struct tcp_splice_conn - Descriptor for a spliced TCP connection
> >   * @f:			Generic flow information
> > - * @in_epoll:		Is the connection in the epoll set?
> >   * @s:			File descriptor for sockets
> >   * @pipe:		File descriptors for pipes
> > - * @events:		Events observed/actions performed on connection
> > - * @flags:		Connection flags (attributes, not events)
> >   * @read:		Bytes read (not fully written to other side in one shot)
> >   * @written:		Bytes written (not fully written from one other side read)
> > -*/
> > + * @events:		Events observed/actions performed on connection
> > + * @flags:		Connection flags (attributes, not events)
> > + * @in_epoll:		Is the connection in the epoll set?
> > + */
> >  struct tcp_splice_conn {
> >  	/* Must be first element */
> >  	struct flow_common f;
> >  
> > -	bool in_epoll	:1;
> >  	int s[SIDES];
> >  	int pipe[SIDES][2];
> >  
> > +	uint32_t read[SIDES];
> > +	uint32_t written[SIDES];
> > +
> >  	uint8_t events;
> >  #define SPLICE_CLOSED			0
> >  #define SPLICE_CONNECT			BIT(0)
> > @@ -144,8 +135,7 @@ struct tcp_splice_conn {
> >  #define RCVLOWAT_ACT_1			BIT(4)
> >  #define CLOSING				BIT(5)
> >  
> > -	uint32_t read[SIDES];
> > -	uint32_t written[SIDES];
> > +		bool in_epoll	:1;
> 
> Excess tab.

Oops, fixed.

> 
> >  };
> >  
> >  /* Socket pools */
> > diff --git a/tcp_internal.h b/tcp_internal.h
> > index 51aaa169..4f61e5c3 100644
> > --- a/tcp_internal.h
> > +++ b/tcp_internal.h
> > @@ -39,7 +39,11 @@
> >  #define OPT_SACKP	4
> >  #define OPT_SACK	5
> >  #define OPT_TS		8
> > -#define CONN_V4(conn)		(!!inany_v4(&(conn)->faddr))
> > +
> > +#define TAPSIDE(conn_)	((conn_)->f.pif[1] == PIF_TAP)
> > +#define TAPFLOW(conn_)	(&((conn_)->f.side[TAPSIDE(conn_)]))
> > +
> > +#define CONN_V4(conn)		(!!inany_v4(&TAPFLOW(conn)->faddr))
> >  #define CONN_V6(conn)		(!CONN_V4(conn))
> >  
> >  /*
> 
> I reviewed up to 7/26 by the way, no further comments until that point.
> 

-- 
David Gibson (he or they)	| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you, not the other way
				| around.
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

  reply	other threads:[~2024-06-26  0:34 UTC|newest]

Thread overview: 35+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-06-14  6:13 [PATCH v6 00/26] RFC: Unified flow table David Gibson
2024-06-14  6:13 ` [PATCH v6 01/26] flow: Common address information for initiating side David Gibson
2024-06-25 22:23   ` Stefano Brivio
2024-06-26  0:19     ` David Gibson
2024-06-14  6:13 ` [PATCH v6 02/26] flow: Common address information for target side David Gibson
2024-06-25 22:23   ` Stefano Brivio
2024-06-26  0:25     ` David Gibson
2024-06-14  6:13 ` [PATCH v6 03/26] tcp, flow: Remove redundant information, repack connection structures David Gibson
2024-06-25 22:25   ` Stefano Brivio
2024-06-26  0:23     ` David Gibson [this message]
2024-06-14  6:13 ` [PATCH v6 04/26] tcp: Obtain guest address from flowside David Gibson
2024-06-14  6:13 ` [PATCH v6 05/26] tcp: Manage outbound address via flow table David Gibson
2024-06-14  6:13 ` [PATCH v6 06/26] tcp: Simplify endpoint validation using flowside information David Gibson
2024-06-14  6:13 ` [PATCH v6 07/26] tcp_splice: Eliminate SPLICE_V6 flag David Gibson
2024-06-14  6:13 ` [PATCH v6 08/26] tcp, flow: Replace TCP specific hash function with general flow hash David Gibson
2024-06-14  6:13 ` [PATCH v6 09/26] flow, tcp: Generalise TCP hash table to general flow hash table David Gibson
2024-06-14  6:13 ` [PATCH v6 10/26] tcp: Re-use flow hash for initial sequence number generation David Gibson
2024-06-14  6:13 ` [PATCH v6 11/26] icmp: Remove redundant id field from flow table entry David Gibson
2024-06-14  6:13 ` [PATCH v6 12/26] icmp: Obtain destination addresses from the flowsides David Gibson
2024-06-14  6:13 ` [PATCH v6 13/26] icmp: Look up ping flows using flow hash David Gibson
2024-06-14  6:13 ` [PATCH v6 14/26] icmp: Eliminate icmp_id_map David Gibson
2024-06-14  6:13 ` [PATCH v6 15/26] icmp: Manage outbound socket address via flow table David Gibson
2024-06-14  6:13 ` [PATCH v6 16/26] flow, tcp: Flow based NAT and port forwarding for TCP David Gibson
2024-06-26 22:49   ` Stefano Brivio
2024-06-27  5:55     ` David Gibson
2024-06-14  6:13 ` [PATCH v6 17/26] flow, icmp: Use general flow forwarding rules for ICMP David Gibson
2024-06-14  6:13 ` [PATCH v6 18/26] fwd: Update flow forwarding logic for UDP David Gibson
2024-06-14  6:13 ` [PATCH v6 19/26] udp: Create flow table entries " David Gibson
2024-06-14  6:13 ` [PATCH v6 20/26] udp: Direct traffic from tap according to flow table David Gibson
2024-06-14  6:13 ` [PATCH v6 21/26] udp: Direct traffic from host to guest " David Gibson
2024-06-14  6:13 ` [PATCH v6 22/26] udp: Direct spliced traffic " David Gibson
2024-06-14  6:13 ` [PATCH v6 23/26] udp: Remove 'splicesrc' tracking David Gibson
2024-06-14  6:13 ` [PATCH v6 24/26] udp: Remove tap port flags field David Gibson
2024-06-14  6:13 ` [PATCH v6 25/26] udp: Remove rdelta port forwarding maps David Gibson
2024-06-14  6:13 ` [PATCH v6 26/26] udp: Eliminate 'splice' flag from epoll reference David Gibson

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=ZntflZhjiLq8zSL4@zatzit \
    --to=david@gibson.dropbear.id.au \
    --cc=jmaloy@redhat.com \
    --cc=passt-dev@passt.top \
    --cc=sbrivio@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link.

  Be sure your reply has a Subject: header at the top and a blank line
  before the message body.

Code repositories for project(s) associated with this public inbox:

	https://passt.top/passt

This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for IMAP folder(s).