On Mon, Dec 08, 2025 at 08:20:19AM +0100, Stefano Brivio wrote:
> ...instead of checking if the current sending buffer is less than
> SNDBUF_SMALL, because this isn't simply an optimisation to coalesce
> ACK segments: we rely on having enough data at once from the sender
> to make the buffer grow by means of the TCP buffer size tuning
> implemented in the Linux kernel.
> 
> This is important if we're trying to maximise throughput, but not
> desirable for interactive traffic, where we want to be as transparent
> as possible and avoid introducing unnecessary latency.
> 
> Use the tcpi_delivery_rate field reported by the Linux kernel, if
> available, to calculate the current bandwidth-delay product: if it's
> significantly smaller than the available sending buffer, conclude that
> we're not bandwidth-bound and this is likely to be interactive
> traffic, so acknowledge data only as it's acknowledged by the peer.
> 
> Conversely, if the bandwidth-delay product is comparable to the size
> of the sending buffer (more than 5%), we're probably bandwidth-bound
> or... bound to be: acknowledge everything in that case.
> 
> Signed-off-by: Stefano Brivio

Reviewed-by: David Gibson

> ---
>  tcp.c | 45 +++++++++++++++++++++++++++++++++------------
>  1 file changed, 33 insertions(+), 12 deletions(-)
> 
> diff --git a/tcp.c b/tcp.c
> index b2e4174..923c1f2 100644
> --- a/tcp.c
> +++ b/tcp.c
> @@ -353,6 +353,9 @@ enum {
>  #define LOW_RTT_TABLE_SIZE		8
>  #define LOW_RTT_THRESHOLD		10 /* us */
>  
> +/* Ratio of buffer to bandwidth * delay product implying interactive traffic */
> +#define SNDBUF_TO_BW_DELAY_INTERACTIVE	/* > */ 20 /* (i.e. < 5% of buffer) */
> +
>  #define ACK_IF_NEEDED	0		/* See tcp_send_flag() */
>  
>  #define CONN_IS_CLOSING(conn)						\
> @@ -426,11 +429,13 @@ socklen_t tcp_info_size;
>  	 sizeof(((struct tcp_info_linux *)NULL)->tcpi_##f_)) <= tcp_info_size)
>  
>  /* Kernel reports sending window in TCP_INFO (kernel commit 8f7baad7f035) */
> -#define snd_wnd_cap	tcp_info_cap(snd_wnd)
> +#define snd_wnd_cap		tcp_info_cap(snd_wnd)
>  /* Kernel reports bytes acked in TCP_INFO (kernel commit 0df48c26d84) */
> -#define bytes_acked_cap	tcp_info_cap(bytes_acked)
> +#define bytes_acked_cap		tcp_info_cap(bytes_acked)
>  /* Kernel reports minimum RTT in TCP_INFO (kernel commit cd9b266095f4) */
> -#define min_rtt_cap	tcp_info_cap(min_rtt)
> +#define min_rtt_cap		tcp_info_cap(min_rtt)
> +/* Kernel reports delivery rate in TCP_INFO (kernel commit eb8329e0a04d) */
> +#define delivery_rate_cap	tcp_info_cap(delivery_rate)
>  
>  /* sendmsg() to socket */
>  static struct iovec	tcp_iov			[UIO_MAXIOV];
> @@ -1050,6 +1055,7 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
>  	socklen_t sl = sizeof(*tinfo);
>  	struct tcp_info_linux tinfo_new;
>  	uint32_t new_wnd_to_tap = prev_wnd_to_tap;
> +	bool ack_everything = true;
>  	int s = conn->sock;
>  
>  	/* At this point we could ack all the data we've accepted for forwarding
> @@ -1059,7 +1065,8 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
>  	 * control behaviour.
>  	 *
>  	 * For it to be possible and worth it we need:
> -	 * - The TCP_INFO Linux extension which gives us the peer acked bytes
> +	 * - The TCP_INFO Linux extensions which give us the peer acked bytes
> +	 *   and the delivery rate (outbound bandwidth at receiver)
>  	 * - Not to be told not to (force_seq)
>  	 * - Not half-closed in the peer->guest direction
>  	 *   With no data coming from the peer, we might not get events which
> @@ -1069,19 +1076,36 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
>  	 *   Data goes from socket to socket, with nothing meaningfully "in
>  	 *   flight".
>  	 * - Not a pseudo-local connection (e.g. to a VM on the same host)
> -	 * - Large enough send buffer
> -	 *   In these cases, there's not enough in flight to bother.
> +	 *   If it is, there's not enough in flight to bother.
> +	 * - Sending buffer significantly larger than bandwidth * delay product
> +	 *   Meaning we're not bandwidth-bound and this is likely to be
> +	 *   interactive traffic where we want to preserve transparent
> +	 *   connection behaviour and latency.
> +	 *
> +	 * Otherwise, we probably want to maximise throughput, which needs
> +	 * sending buffer auto-tuning, triggered in turn by filling up the
> +	 * outbound socket queue.
>  	 */
> -	if (bytes_acked_cap && !force_seq &&
> +	if (bytes_acked_cap && delivery_rate_cap && !force_seq &&
>  	    !CONN_IS_CLOSING(conn) &&
> -	    !(conn->flags & LOCAL) && !tcp_rtt_dst_low(conn) &&
> -	    (unsigned)SNDBUF_GET(conn) >= SNDBUF_SMALL) {
> +	    !(conn->flags & LOCAL) && !tcp_rtt_dst_low(conn)) {
>  		if (!tinfo) {
>  			tinfo = &tinfo_new;
>  			if (getsockopt(s, SOL_TCP, TCP_INFO, tinfo, &sl))
>  				return 0;
>  		}
>  
> +		if ((unsigned)SNDBUF_GET(conn) > (long long)tinfo->tcpi_rtt *
> +						 tinfo->tcpi_delivery_rate /
> +						 1000 / 1000 *
> +						 SNDBUF_TO_BW_DELAY_INTERACTIVE)
> +			ack_everything = false;
> +	}
> +
> +	if (ack_everything) {
> +		/* Fall back to acknowledging everything we got */
> +		conn->seq_ack_to_tap = conn->seq_from_tap;
> +	} else {
>  		/* This trips a cppcheck bug in some versions, including
>  		 * cppcheck 2.18.3.
>  		 * https://sourceforge.net/p/cppcheck/discussion/general/thread/fecde59085/
> @@ -1089,9 +1113,6 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
>  		/* cppcheck-suppress [uninitvar,unmatchedSuppression] */
>  		conn->seq_ack_to_tap = tinfo->tcpi_bytes_acked +
>  				       conn->seq_init_from_tap;
> -	} else {
> -		/* Fall back to acknowledging everything we got */
> -		conn->seq_ack_to_tap = conn->seq_from_tap;
>  	}
>  
>  	/* It's occasionally possible for us to go from using the fallback above
> -- 
> 2.43.0
> 

-- 
David Gibson (he or they)	| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you, not the other way
				| around.
http://www.ozlabs.org/~dgibson
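
[Editorial aside, for readers following the patch outside the passt
tree: below is a minimal, self-contained sketch of the same
bandwidth-delay product check. It uses the plain struct tcp_info from
<linux/tcp.h> (kernel >= 4.9 for tcpi_delivery_rate) instead of
passt's struct tcp_info_linux, and the helper name
sock_looks_interactive() is made up for illustration; only the
threshold constant matches the patch.

#include <stdbool.h>
#include <stddef.h>		/* offsetof() */
#include <sys/socket.h>		/* getsockopt(), SOL_SOCKET, SO_SNDBUF */
#include <netinet/in.h>		/* IPPROTO_TCP */
#include <linux/tcp.h>		/* TCP_INFO, struct tcp_info */

/* Same ratio as in the patch: buffer > 20 * BDP implies interactive */
#define SNDBUF_TO_BW_DELAY_INTERACTIVE	20

/* Return true if the connected TCP socket s looks interactive, that
 * is, if the sending buffer is much larger than the bandwidth-delay
 * product, so the flow can't be bandwidth-bound
 */
static bool sock_looks_interactive(int s)
{
	struct tcp_info ti;
	socklen_t sl = sizeof(ti);
	int sndbuf;
	socklen_t bl = sizeof(sndbuf);
	unsigned long long bw_delay;

	/* Note: on Linux, SO_SNDBUF reports twice the requested size */
	if (getsockopt(s, IPPROTO_TCP, TCP_INFO, &ti, &sl) ||
	    getsockopt(s, SOL_SOCKET, SO_SNDBUF, &sndbuf, &bl))
		return false;	/* can't tell: assume bandwidth-bound */

	/* Older kernels return a shorter struct: check that the
	 * reported length actually covers tcpi_delivery_rate, which is
	 * what the delivery_rate_cap macro above does via tcp_info_size
	 */
	if (sl < offsetof(struct tcp_info, tcpi_delivery_rate) +
		 sizeof(ti.tcpi_delivery_rate))
		return false;

	/* tcpi_delivery_rate is in bytes per second, tcpi_rtt in
	 * microseconds: divide by 10^6 to get the product in bytes
	 */
	bw_delay = (unsigned long long)ti.tcpi_rtt *
		   ti.tcpi_delivery_rate / 1000 / 1000;

	return (unsigned long long)sndbuf >
	       bw_delay * SNDBUF_TO_BW_DELAY_INTERACTIVE;
}

As a worked example: at 100 Mbit/s, tcpi_delivery_rate is 12,500,000
bytes/s, and with tcpi_rtt at 20,000 us the product is 250,000 bytes,
likely more than 5% of the sending buffer, so everything gets
acknowledged. An ssh session on the same path might deliver a few kB/s,
for a product well under a kilobyte, so acks then track the peer's.]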