public inbox for passt-dev@passt.top
 help / color / mirror / code / Atom feed
From: David Gibson <david@gibson.dropbear.id.au>
To: Jon Maloy <jmaloy@redhat.com>
Cc: passt-dev@passt.top, sbrivio@redhat.com, lvivier@redhat.com,
	dgibson@redhat.com
Subject: Re: [PATCH v6 2/3] tcp: leverage support of SO_PEEK_OFF socket option when available
Date: Mon, 20 May 2024 18:07:02 +1000	[thread overview]
Message-ID: <ZksEpogbe2iV9XR0@zatzit> (raw)
In-Reply-To: <20240517152414.1188282-3-jmaloy@redhat.com>

[-- Attachment #1: Type: text/plain, Size: 7472 bytes --]

On Fri, May 17, 2024 at 11:24:13AM -0400, Jon Maloy wrote:
> From linux-6.9.0 the kernel will contain
> commit 05ea491641d3 ("tcp: add support for SO_PEEK_OFF socket option").
> 
> This new feature makes it possible to call recvmsg(MSG_PEEK) and make
> it start reading data from a given offset set by the SO_PEEK_OFF socket
> option. This way, we can avoid repeated reading of already read bytes of
> a received message, hence saving read cycles when forwarding TCP
> messages in the host->namespace direction.
> 
> In this commit, we add functionality to leverage this feature when
> available, while we fall back to the previous behavior when not.
> 
> Measurements with iperf3 show that throughput increases by 15-20
> percent in the host->namespace direction when this feature is used.
> 
> Signed-off-by: Jon Maloy <jmaloy@redhat.com>

Reviewed-by: David Gibson <david@gibson.dropbear.id.au>

> 
> ---
> v2: - Some smaller changes as suggested by David Gibson and Stefano Brivio.
>     - Moved initial set_peek_offset(0) to only the locations where the socket is set
>       to ESTABLISHED.
>     - Removed the per-packet synchronization between sk_peek_off and
>       already_sent. Instead only doing it in retransmit situations.
>     - The problem I found when troubleshooting the occasionally occurring
>       out-of-sync values between 'already_sent' and 'sk_peek_off' may
>       have deeper implications that we may need to investigate.
> 
> v3: - Rebased to most recent version of tcp.c, plus the previous
>       patch in this series.
>     - Some changes based on feedback from PASST team
> 
> v4: - Some small changes based on feedback from Stefano/David.
> 
> v5: - Re-added accidentally dropped set_peek_offset() line.
>       Thank you, David.
> ---
>  tcp.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++--------
>  1 file changed, 51 insertions(+), 8 deletions(-)
> 
> diff --git a/tcp.c b/tcp.c
> index 3a2350a..fa13292 100644
> --- a/tcp.c
> +++ b/tcp.c
> @@ -511,6 +511,9 @@ static struct iovec	tcp6_l2_iov		[TCP_FRAMES_MEM][TCP_NUM_IOVS];
>  static struct iovec	tcp4_l2_flags_iov	[TCP_FRAMES_MEM][TCP_NUM_IOVS];
>  static struct iovec	tcp6_l2_flags_iov	[TCP_FRAMES_MEM][TCP_NUM_IOVS];
>  
> +/* Does the kernel support SO_PEEK_OFF? */
> +static bool peek_offset_cap;
> +
>  /* sendmsg() to socket */
>  static struct iovec	tcp_iov			[UIO_MAXIOV];
>  
> @@ -526,6 +529,20 @@ static_assert(ARRAY_SIZE(tc_hash) >= FLOW_MAX,
>  int init_sock_pool4		[TCP_SOCK_POOL_SIZE];
>  int init_sock_pool6		[TCP_SOCK_POOL_SIZE];
>  
> +/**
> + * tcp_set_peek_offset() - Set SO_PEEK_OFF offset on a socket if supported
> + * @s:          Socket to update
> + * @offset:     Offset in bytes
> + */
> +static void tcp_set_peek_offset(int s, int offset)
> +{
> +	if (!peek_offset_cap)
> +		return;
> +
> +	if (setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &offset, sizeof(offset)))
> +		err("Failed to set SO_PEEK_OFF to %i in socket %i", offset, s);
> +}
> +
>  /**
>   * tcp_conn_epoll_events() - epoll events mask for given connection state
>   * @events:	Current connection events
> @@ -1273,6 +1290,7 @@ static void tcp_revert_seq(struct tcp_tap_conn **conns, struct iovec (*frames)[T
>  			continue;
>  
>  		conn->seq_to_tap = seq;
> +		tcp_set_peek_offset(conn->sock, seq - conn->seq_ack_from_tap);
>  	}
>  }
>  
> @@ -2199,14 +2217,15 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
>  	uint32_t already_sent, seq;
>  	struct iovec *iov;
>  
> +	/* How much have we read/sent since the last received ACK? */
>  	already_sent = conn->seq_to_tap - conn->seq_ack_from_tap;
> -
>  	if (SEQ_LT(already_sent, 0)) {
>  		/* RFC 761, section 2.1. */
>  		flow_trace(conn, "ACK sequence gap: ACK for %u, sent: %u",
>  			   conn->seq_ack_from_tap, conn->seq_to_tap);
>  		conn->seq_to_tap = conn->seq_ack_from_tap;
>  		already_sent = 0;
> +		tcp_set_peek_offset(s, 0);
>  	}
>  
>  	if (!wnd_scaled || already_sent >= wnd_scaled) {
> @@ -2224,11 +2243,16 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
>  		iov_rem = (wnd_scaled - already_sent) % mss;
>  	}
>  
> -	mh_sock.msg_iov = iov_sock;
> -	mh_sock.msg_iovlen = fill_bufs + 1;
> -
> -	iov_sock[0].iov_base = tcp_buf_discard;
> -	iov_sock[0].iov_len = already_sent;
> +	/* Prepare iov according to kernel capability */
> +	if (!peek_offset_cap) {
> +		mh_sock.msg_iov = iov_sock;
> +		iov_sock[0].iov_base = tcp_buf_discard;
> +		iov_sock[0].iov_len = already_sent;
> +		mh_sock.msg_iovlen = fill_bufs + 1;
> +	} else {
> +		mh_sock.msg_iov = &iov_sock[1];
> +		mh_sock.msg_iovlen = fill_bufs;
> +	}
>  
>  	if (( v4 && tcp4_payload_used + fill_bufs > TCP_FRAMES_MEM) ||
>  	    (!v4 && tcp6_payload_used + fill_bufs > TCP_FRAMES_MEM)) {
> @@ -2269,7 +2293,10 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
>  		return 0;
>  	}
>  
> -	sendlen = len - already_sent;
> +	sendlen = len;
> +	if (!peek_offset_cap)
> +		sendlen -= already_sent;
> +
>  	if (sendlen <= 0) {
>  		conn_flag(c, conn, STALLED);
>  		return 0;
> @@ -2440,6 +2467,7 @@ static int tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn,
>  			   "fast re-transmit, ACK: %u, previous sequence: %u",
>  			   max_ack_seq, conn->seq_to_tap);
>  		conn->seq_to_tap = max_ack_seq;
> +		tcp_set_peek_offset(conn->sock, 0);
>  		tcp_data_from_sock(c, conn);
>  	}
>  
> @@ -2532,6 +2560,7 @@ static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_tap_conn *conn,
>  	conn->seq_ack_to_tap = conn->seq_from_tap;
>  
>  	conn_event(c, conn, ESTABLISHED);
> +	tcp_set_peek_offset(conn->sock, 0);
>  
>  	/* The client might have sent data already, which we didn't
>  	 * dequeue waiting for SYN,ACK from tap -- check now.
> @@ -2612,6 +2641,7 @@ int tcp_tap_handler(struct ctx *c, uint8_t pif, sa_family_t af,
>  			goto reset;
>  
>  		conn_event(c, conn, ESTABLISHED);
> +		tcp_set_peek_offset(conn->sock, 0);
>  
>  		if (th->fin) {
>  			conn->seq_from_tap++;
> @@ -2865,6 +2895,7 @@ void tcp_timer_handler(struct ctx *c, union epoll_ref ref)
>  			flow_dbg(conn, "ACK timeout, retry");
>  			conn->retrans++;
>  			conn->seq_to_tap = conn->seq_ack_from_tap;
> +			tcp_set_peek_offset(conn->sock, 0);
>  			tcp_data_from_sock(c, conn);
>  			tcp_timer_ctl(c, conn);
>  		}
> @@ -3156,7 +3187,8 @@ static void tcp_sock_refill_init(const struct ctx *c)
>   */
>  int tcp_init(struct ctx *c)
>  {
> -	unsigned b;
> +	unsigned int b, optv = 0;
> +	int s;
>  
>  	for (b = 0; b < TCP_HASH_TABLE_SIZE; b++)
>  		tc_hash[b] = FLOW_SIDX_NONE;
> @@ -3180,6 +3212,17 @@ int tcp_init(struct ctx *c)
>  		NS_CALL(tcp_ns_socks_init, c);
>  	}
>  
> +	/* Probe for SO_PEEK_OFF support */
> +	s = socket(AF_INET, SOCK_STREAM | SOCK_CLOEXEC, IPPROTO_TCP);
> +	if (s < 0) {
> +		warn("Temporary TCP socket creation failed");
> +	} else {
> +		if (!setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &optv, sizeof(int)))
> +			peek_offset_cap = true;
> +		close(s);
> +	}
> +	info("SO_PEEK_OFF%ssupported", peek_offset_cap ? " " : " not ");
> +
>  	return 0;
>  }
>  

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

  reply	other threads:[~2024-05-20  9:50 UTC|newest]

Thread overview: 9+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-05-17 15:24 [PATCH v6 0/3] Support for SO_PEEK_OFF socket option Jon Maloy
2024-05-17 15:24 ` [PATCH v6 1/3] tcp: move seq_to_tap update to when frame is queued Jon Maloy
2024-05-20  7:46   ` David Gibson
2024-05-17 15:24 ` [PATCH v6 2/3] tcp: leverage support of SO_PEEK_OFF socket option when available Jon Maloy
2024-05-20  8:07   ` David Gibson [this message]
2024-05-17 15:24 ` [PATCH v6 3/3] tcp: allow retransmit when peer receive window is zero Jon Maloy
2024-05-21  5:51   ` David Gibson
2024-05-21 22:25     ` Jon Maloy
  -- strict thread matches above, loose matches on Subject: below --
2024-05-17 15:05 [PATCH v6 0/3] Support for SO_PEEK_OFF socket option Jon Maloy
2024-05-17 15:06 ` [PATCH v6 2/3] tcp: leverage support of SO_PEEK_OFF socket option when available Jon Maloy

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=ZksEpogbe2iV9XR0@zatzit \
    --to=david@gibson.dropbear.id.au \
    --cc=dgibson@redhat.com \
    --cc=jmaloy@redhat.com \
    --cc=lvivier@redhat.com \
    --cc=passt-dev@passt.top \
    --cc=sbrivio@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this public inbox

	https://passt.top/passt

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for IMAP folder(s).