[PATCH v1] tcp: Handle errors from tcp_send

public inbox for passt-dev@passt.top
 help / color / mirror / code / Atom feed

* [PATCH v1] tcp: Handle errors from tcp_send_flag()
@ 2026-04-10  7:55 Anshu Kumari
  2026-04-13  6:22 ` David Gibson
  2026-04-15 19:38 ` Stefano Brivio
  0 siblings, 2 replies; 6+ messages in thread
From: Anshu Kumari @ 2026-04-10  7:55 UTC (permalink / raw)
  To: passt-dev, anskuma, sbrivio; +Cc: david, lvivier

tcp_send_flag() can return error codes from tcp_prepare_flags()
failing TCP_INFO, or from failure to collect buffers on the
vhost-user path. These errors indicate the connection requires
resetting.

Most callers of tcp_send_flag() were ignoring the error code and
carrying on as if nothing was wrong. Check the return value at
each call site and handle the error appropriately:
  - in tcp_data_from_tap(), return -1 so the caller resets
  - in tcp_tap_handler(), goto reset
  - in tcp_timer_handler()/tcp_sock_handler()/tcp_conn_from_sock_finish(),
    call tcp_rst() and return
  - in tcp_tap_conn_from_sock(), set CLOSING flag (flow not yet active)
  - in tcp_keepalive(), call tcp_rst() and continue the loop
  - in tcp_flow_migrate_target_ext(), goto fail

The call in tcp_rst_do() is left unchecked: we are already
resetting, and tcp_sock_rst() still needs to run regardless.

Bug: https://bugs.passt.top/show_bug.cgi?id=194
Signed-off-by: Anshu Kumari <anskuma@redhat.com>
---
 tcp.c | 59 ++++++++++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 44 insertions(+), 15 deletions(-)

diff --git a/tcp.c b/tcp.c
index 8ea9be8..9ce671a 100644
--- a/tcp.c
+++ b/tcp.c
@@ -1917,7 +1917,9 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn,
 				   "keep-alive sequence: %u, previous: %u",
 				   seq, conn->seq_from_tap);
 
-			tcp_send_flag(c, conn, ACK);
+			if (tcp_send_flag(c, conn, ACK))
+				return -1;
+
 			tcp_timer_ctl(c, conn);
 
 			if (setsockopt(conn->sock, SOL_SOCKET, SO_KEEPALIVE,
@@ -2043,14 +2045,16 @@ eintr:
 			 *   Then swiftly looked away and left.
 			 */
 			conn->seq_from_tap = seq_from_tap;
-			tcp_send_flag(c, conn, ACK);
+			if (tcp_send_flag(c, conn, ACK))
+				return -1;
 		}
 
 		if (errno == EINTR)
 			goto eintr;
 
 		if (errno == EAGAIN || errno == EWOULDBLOCK) {
-			tcp_send_flag(c, conn, ACK | DUP_ACK);
+			if (tcp_send_flag(c, conn, ACK | DUP_ACK))
+				return -1;
 			return p->count - idx;
 
 		}
@@ -2070,7 +2074,8 @@ out:
 		 */
 		if (conn->seq_dup_ack_approx != (conn->seq_from_tap & 0xff)) {
 			conn->seq_dup_ack_approx = conn->seq_from_tap & 0xff;
-			tcp_send_flag(c, conn, ACK | DUP_ACK);
+			if (tcp_send_flag(c, conn, ACK | DUP_ACK))
+				return -1;
 		}
 		return p->count - idx;
 	}
@@ -2084,7 +2089,8 @@ out:
 
 		conn_event(c, conn, TAP_FIN_RCVD);
 	} else {
-		tcp_send_flag(c, conn, ACK_IF_NEEDED);
+		if (tcp_send_flag(c, conn, ACK_IF_NEEDED))
+			return -1;
 	}
 
 	return p->count - idx;
@@ -2122,7 +2128,10 @@ static void tcp_conn_from_sock_finish(const struct ctx *c,
 		return;
 	}
 
-	tcp_send_flag(c, conn, ACK);
+	if (tcp_send_flag(c, conn, ACK)) {
+		tcp_rst(c, conn);
+		return;
+	}
 
 	/* The client might have sent data already, which we didn't
 	 * dequeue waiting for SYN,ACK from tap -- check now.
@@ -2308,7 +2317,9 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
 				goto reset;
 			}
 
-			tcp_send_flag(c, conn, ACK);
+			if (tcp_send_flag(c, conn, ACK))
+				goto reset;
+
 			conn_event(c, conn, SOCK_FIN_SENT);
 
 			return 1;
@@ -2388,7 +2399,9 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
 		}
 
 		conn_event(c, conn, SOCK_FIN_SENT);
-		tcp_send_flag(c, conn, ACK);
+		if (tcp_send_flag(c, conn, ACK))
+			goto reset;
+
 		ack_due = 0;
 
 		/* If we received a FIN, but the socket is in TCP_ESTABLISHED
@@ -2478,7 +2491,11 @@ static void tcp_tap_conn_from_sock(const struct ctx *c, union flow *flow,
 
 	conn->wnd_from_tap = WINDOW_DEFAULT;
 
-	tcp_send_flag(c, conn, SYN);
+	if (tcp_send_flag(c, conn, SYN)) {
+		conn_flag(c, conn, CLOSING);
+		return;
+	}
+
 	conn_flag(c, conn, ACK_FROM_TAP_DUE);
 
 	tcp_get_sndbuf(conn);
@@ -2585,7 +2602,10 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref)
 		return;
 
 	if (conn->flags & ACK_TO_TAP_DUE) {
-		tcp_send_flag(c, conn, ACK_IF_NEEDED);
+		if (tcp_send_flag(c, conn, ACK_IF_NEEDED)) {
+			tcp_rst(c, conn);
+			return;
+		}
 		tcp_timer_ctl(c, conn);
 	} else if (conn->flags & ACK_FROM_TAP_DUE) {
 		if (!(conn->events & ESTABLISHED)) {
@@ -2598,7 +2618,10 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref)
 				tcp_rst(c, conn);
 			} else {
 				flow_trace(conn, "SYN timeout, retry");
-				tcp_send_flag(c, conn, SYN);
+				if (tcp_send_flag(c, conn, SYN)) {
+					tcp_rst(c, conn);
+					return;
+				}
 				conn->retries++;
 				conn_flag(c, conn, SYN_RETRIED);
 				tcp_timer_ctl(c, conn);
@@ -2662,8 +2685,11 @@ void tcp_sock_handler(const struct ctx *c, union epoll_ref ref,
 			tcp_data_from_sock(c, conn);
 
 		if (events & EPOLLOUT) {
-			if (tcp_update_seqack_wnd(c, conn, false, NULL))
-				tcp_send_flag(c, conn, ACK);
+			if (tcp_update_seqack_wnd(c, conn, false, NULL) &&
+			    tcp_send_flag(c, conn, ACK)) {
+				tcp_rst(c, conn);
+				return;
+			}
 		}
 
 		return;
@@ -2903,7 +2929,8 @@ static void tcp_keepalive(struct ctx *c, const struct timespec *now)
 		if (conn->tap_inactive) {
 			flow_dbg(conn, "No tap activity for least %us, send keepalive",
 				 KEEPALIVE_INTERVAL);
-			tcp_send_flag(c, conn, KEEPALIVE);
+			if (tcp_send_flag(c, conn, KEEPALIVE))
+				tcp_rst(c, conn);
 		}
 
 		/* Ready to check fot next interval */
@@ -3926,7 +3953,9 @@ int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd
 	if (tcp_set_peek_offset(conn, peek_offset))
 		goto fail;
 
-	tcp_send_flag(c, conn, ACK);
+	if (tcp_send_flag(c, conn, ACK))
+		goto fail;
+
 	tcp_data_from_sock(c, conn);
 
 	if ((rc = tcp_epoll_ctl(conn))) {
-- 
2.53.0


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH v1] tcp: Handle errors from tcp_send_flag()
  2026-04-10  7:55 [PATCH v1] tcp: Handle errors from tcp_send_flag() Anshu Kumari
@ 2026-04-13  6:22 ` David Gibson
  2026-04-15 19:38 ` Stefano Brivio
  1 sibling, 0 replies; 6+ messages in thread
From: David Gibson @ 2026-04-13  6:22 UTC (permalink / raw)
  To: Anshu Kumari; +Cc: passt-dev, sbrivio, lvivier

[-- Attachment #1: Type: text/plain, Size: 6533 bytes --]

On Fri, Apr 10, 2026 at 01:25:39PM +0530, Anshu Kumari wrote:
> tcp_send_flag() can return error codes from tcp_prepare_flags()
> failing TCP_INFO, or from failure to collect buffers on the
> vhost-user path. These errors indicate the connection requires
> resetting.
> 
> Most callers of tcp_send_flag() were ignoring the error code and
> carrying on as if nothing was wrong. Check the return value at
> each call site and handle the error appropriately:
>   - in tcp_data_from_tap(), return -1 so the caller resets
>   - in tcp_tap_handler(), goto reset
>   - in tcp_timer_handler()/tcp_sock_handler()/tcp_conn_from_sock_finish(),
>     call tcp_rst() and return
>   - in tcp_tap_conn_from_sock(), set CLOSING flag (flow not yet active)
>   - in tcp_keepalive(), call tcp_rst() and continue the loop
>   - in tcp_flow_migrate_target_ext(), goto fail

Thanks for this detailed summary, it really helps a reviewer follow
the logic.

> The call in tcp_rst_do() is left unchecked: we are already
> resetting, and tcp_sock_rst() still needs to run regardless.
> 
> Bug: https://bugs.passt.top/show_bug.cgi?id=194
> Signed-off-by: Anshu Kumari <anskuma@redhat.com>

Reviewed-by: David Gibson <david@gibson.dropbear.id.au>

> ---
>  tcp.c | 59 ++++++++++++++++++++++++++++++++++++++++++++---------------
>  1 file changed, 44 insertions(+), 15 deletions(-)
> 
> diff --git a/tcp.c b/tcp.c
> index 8ea9be8..9ce671a 100644
> --- a/tcp.c
> +++ b/tcp.c
> @@ -1917,7 +1917,9 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn,
>  				   "keep-alive sequence: %u, previous: %u",
>  				   seq, conn->seq_from_tap);
>  
> -			tcp_send_flag(c, conn, ACK);
> +			if (tcp_send_flag(c, conn, ACK))
> +				return -1;
> +
>  			tcp_timer_ctl(c, conn);
>  
>  			if (setsockopt(conn->sock, SOL_SOCKET, SO_KEEPALIVE,
> @@ -2043,14 +2045,16 @@ eintr:
>  			 *   Then swiftly looked away and left.
>  			 */
>  			conn->seq_from_tap = seq_from_tap;
> -			tcp_send_flag(c, conn, ACK);
> +			if (tcp_send_flag(c, conn, ACK))
> +				return -1;
>  		}
>  
>  		if (errno == EINTR)
>  			goto eintr;
>  
>  		if (errno == EAGAIN || errno == EWOULDBLOCK) {
> -			tcp_send_flag(c, conn, ACK | DUP_ACK);
> +			if (tcp_send_flag(c, conn, ACK | DUP_ACK))
> +				return -1;
>  			return p->count - idx;
>  
>  		}
> @@ -2070,7 +2074,8 @@ out:
>  		 */
>  		if (conn->seq_dup_ack_approx != (conn->seq_from_tap & 0xff)) {
>  			conn->seq_dup_ack_approx = conn->seq_from_tap & 0xff;
> -			tcp_send_flag(c, conn, ACK | DUP_ACK);
> +			if (tcp_send_flag(c, conn, ACK | DUP_ACK))
> +				return -1;
>  		}
>  		return p->count - idx;
>  	}
> @@ -2084,7 +2089,8 @@ out:
>  
>  		conn_event(c, conn, TAP_FIN_RCVD);
>  	} else {
> -		tcp_send_flag(c, conn, ACK_IF_NEEDED);
> +		if (tcp_send_flag(c, conn, ACK_IF_NEEDED))
> +			return -1;
>  	}
>  
>  	return p->count - idx;
> @@ -2122,7 +2128,10 @@ static void tcp_conn_from_sock_finish(const struct ctx *c,
>  		return;
>  	}
>  
> -	tcp_send_flag(c, conn, ACK);
> +	if (tcp_send_flag(c, conn, ACK)) {
> +		tcp_rst(c, conn);
> +		return;
> +	}
>  
>  	/* The client might have sent data already, which we didn't
>  	 * dequeue waiting for SYN,ACK from tap -- check now.
> @@ -2308,7 +2317,9 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
>  				goto reset;
>  			}
>  
> -			tcp_send_flag(c, conn, ACK);
> +			if (tcp_send_flag(c, conn, ACK))
> +				goto reset;
> +
>  			conn_event(c, conn, SOCK_FIN_SENT);
>  
>  			return 1;
> @@ -2388,7 +2399,9 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
>  		}
>  
>  		conn_event(c, conn, SOCK_FIN_SENT);
> -		tcp_send_flag(c, conn, ACK);
> +		if (tcp_send_flag(c, conn, ACK))
> +			goto reset;
> +
>  		ack_due = 0;
>  
>  		/* If we received a FIN, but the socket is in TCP_ESTABLISHED
> @@ -2478,7 +2491,11 @@ static void tcp_tap_conn_from_sock(const struct ctx *c, union flow *flow,
>  
>  	conn->wnd_from_tap = WINDOW_DEFAULT;
>  
> -	tcp_send_flag(c, conn, SYN);
> +	if (tcp_send_flag(c, conn, SYN)) {
> +		conn_flag(c, conn, CLOSING);
> +		return;
> +	}
> +
>  	conn_flag(c, conn, ACK_FROM_TAP_DUE);
>  
>  	tcp_get_sndbuf(conn);
> @@ -2585,7 +2602,10 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref)
>  		return;
>  
>  	if (conn->flags & ACK_TO_TAP_DUE) {
> -		tcp_send_flag(c, conn, ACK_IF_NEEDED);
> +		if (tcp_send_flag(c, conn, ACK_IF_NEEDED)) {
> +			tcp_rst(c, conn);
> +			return;
> +		}
>  		tcp_timer_ctl(c, conn);
>  	} else if (conn->flags & ACK_FROM_TAP_DUE) {
>  		if (!(conn->events & ESTABLISHED)) {
> @@ -2598,7 +2618,10 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref)
>  				tcp_rst(c, conn);
>  			} else {
>  				flow_trace(conn, "SYN timeout, retry");
> -				tcp_send_flag(c, conn, SYN);
> +				if (tcp_send_flag(c, conn, SYN)) {
> +					tcp_rst(c, conn);
> +					return;
> +				}
>  				conn->retries++;
>  				conn_flag(c, conn, SYN_RETRIED);
>  				tcp_timer_ctl(c, conn);
> @@ -2662,8 +2685,11 @@ void tcp_sock_handler(const struct ctx *c, union epoll_ref ref,
>  			tcp_data_from_sock(c, conn);
>  
>  		if (events & EPOLLOUT) {
> -			if (tcp_update_seqack_wnd(c, conn, false, NULL))
> -				tcp_send_flag(c, conn, ACK);
> +			if (tcp_update_seqack_wnd(c, conn, false, NULL) &&
> +			    tcp_send_flag(c, conn, ACK)) {
> +				tcp_rst(c, conn);
> +				return;
> +			}
>  		}
>  
>  		return;
> @@ -2903,7 +2929,8 @@ static void tcp_keepalive(struct ctx *c, const struct timespec *now)
>  		if (conn->tap_inactive) {
>  			flow_dbg(conn, "No tap activity for least %us, send keepalive",
>  				 KEEPALIVE_INTERVAL);
> -			tcp_send_flag(c, conn, KEEPALIVE);
> +			if (tcp_send_flag(c, conn, KEEPALIVE))
> +				tcp_rst(c, conn);
>  		}
>  
>  		/* Ready to check fot next interval */
> @@ -3926,7 +3953,9 @@ int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd
>  	if (tcp_set_peek_offset(conn, peek_offset))
>  		goto fail;
>  
> -	tcp_send_flag(c, conn, ACK);
> +	if (tcp_send_flag(c, conn, ACK))
> +		goto fail;
> +
>  	tcp_data_from_sock(c, conn);
>  
>  	if ((rc = tcp_epoll_ctl(conn))) {
> -- 
> 2.53.0
> 

-- 
David Gibson (he or they)	| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you, not the other way
				| around.
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH v1] tcp: Handle errors from tcp_send_flag()
  2026-04-10  7:55 [PATCH v1] tcp: Handle errors from tcp_send_flag() Anshu Kumari
  2026-04-13  6:22 ` David Gibson
@ 2026-04-15 19:38 ` Stefano Brivio
  2026-04-20  2:47   ` David Gibson
  1 sibling, 1 reply; 6+ messages in thread
From: Stefano Brivio @ 2026-04-15 19:38 UTC (permalink / raw)
  To: Anshu Kumari, David Gibson; +Cc: passt-dev, Laurent Vivier

Nit: v1 in the subject tag is not necessary (not harmful either): if
there are no version tags, it's implicit we're talking about version 1.

On Fri, 10 Apr 2026 13:25:39 +0530
Anshu Kumari <anskuma@redhat.com> wrote:

> tcp_send_flag() can return error codes from tcp_prepare_flags()
> failing TCP_INFO, or from failure to collect buffers on the
> vhost-user path. These errors indicate the connection requires
> resetting.
> 
> Most callers of tcp_send_flag() were ignoring the error code and
> carrying on as if nothing was wrong. Check the return value at
> each call site and handle the error appropriately:
>   - in tcp_data_from_tap(), return -1 so the caller resets
>   - in tcp_tap_handler(), goto reset
>   - in tcp_timer_handler()/tcp_sock_handler()/tcp_conn_from_sock_finish(),
>     call tcp_rst() and return
>   - in tcp_tap_conn_from_sock(), set CLOSING flag (flow not yet active)
>   - in tcp_keepalive(), call tcp_rst() and continue the loop
>   - in tcp_flow_migrate_target_ext(), goto fail
> 
> The call in tcp_rst_do() is left unchecked: we are already
> resetting, and tcp_sock_rst() still needs to run regardless.
> 
> Bug: https://bugs.passt.top/show_bug.cgi?id=194

Nit: we always use Link: tags (CONTRIBUTING.md uses the plural which
might be a bit confusing, I guess we should fix that), rationale:

  https://archives.passt.top/passt-dev/20230704132104.48106368@elisabeth/
  https://archives.passt.top/passt-dev/20251105163137.424a6537@elisabeth/

But I fix up these tags on merge anyway, no need to re-send (in
general).

> Signed-off-by: Anshu Kumari <anskuma@redhat.com>
> ---
>  tcp.c | 59 ++++++++++++++++++++++++++++++++++++++++++++---------------
>  1 file changed, 44 insertions(+), 15 deletions(-)
> 
> diff --git a/tcp.c b/tcp.c
> index 8ea9be8..9ce671a 100644
> --- a/tcp.c
> +++ b/tcp.c
> @@ -1917,7 +1917,9 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn,
>  				   "keep-alive sequence: %u, previous: %u",
>  				   seq, conn->seq_from_tap);
>  
> -			tcp_send_flag(c, conn, ACK);
> +			if (tcp_send_flag(c, conn, ACK))
> +				return -1;

A general comment: in _some_ of these cases where we fail to send ACK
segments, I intentionally didn't check for errors and let the
connection live on, because that looked like the most graceful failure
handling to me.

After all, ACK segments without data are not assumed to be reliably
transmitted (RFC 9293, 3.8.4), so, given that failing to send some
should have a similar outcome as the peer missing some, I guess we're
always expected to recover from a situation like that.

This doesn't apply to other occurrences below where we fail to send a
SYN segment or where failure to send ACK segments might mean we are in
some expected state (including a connection that might get stuck
forever).

But reading David's description of bug #194, I wonder if he had
something else in mind. That is, I don't have a strong preference
against resetting the connection whenever we fail to prepare buffers,
but in many of these cases we don't really _have to_ reset the
connection. David, do you see this differently?

> +
>  			tcp_timer_ctl(c, conn);
>  
>  			if (setsockopt(conn->sock, SOL_SOCKET, SO_KEEPALIVE,
> @@ -2043,14 +2045,16 @@ eintr:
>  			 *   Then swiftly looked away and left.
>  			 */
>  			conn->seq_from_tap = seq_from_tap;
> -			tcp_send_flag(c, conn, ACK);
> +			if (tcp_send_flag(c, conn, ACK))
> +				return -1;
>  		}
>  
>  		if (errno == EINTR)
>  			goto eintr;
>  
>  		if (errno == EAGAIN || errno == EWOULDBLOCK) {
> -			tcp_send_flag(c, conn, ACK | DUP_ACK);
> +			if (tcp_send_flag(c, conn, ACK | DUP_ACK))
> +				return -1;
>  			return p->count - idx;
>  
>  		}
> @@ -2070,7 +2074,8 @@ out:
>  		 */
>  		if (conn->seq_dup_ack_approx != (conn->seq_from_tap & 0xff)) {
>  			conn->seq_dup_ack_approx = conn->seq_from_tap & 0xff;
> -			tcp_send_flag(c, conn, ACK | DUP_ACK);
> +			if (tcp_send_flag(c, conn, ACK | DUP_ACK))
> +				return -1;
>  		}
>  		return p->count - idx;
>  	}
> @@ -2084,7 +2089,8 @@ out:
>  
>  		conn_event(c, conn, TAP_FIN_RCVD);
>  	} else {
> -		tcp_send_flag(c, conn, ACK_IF_NEEDED);
> +		if (tcp_send_flag(c, conn, ACK_IF_NEEDED))
> +			return -1;
>  	}
>  
>  	return p->count - idx;
> @@ -2122,7 +2128,10 @@ static void tcp_conn_from_sock_finish(const struct ctx *c,
>  		return;
>  	}
>  
> -	tcp_send_flag(c, conn, ACK);
> +	if (tcp_send_flag(c, conn, ACK)) {
> +		tcp_rst(c, conn);
> +		return;
> +	}
>  
>  	/* The client might have sent data already, which we didn't
>  	 * dequeue waiting for SYN,ACK from tap -- check now.
> @@ -2308,7 +2317,9 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
>  				goto reset;
>  			}
>  
> -			tcp_send_flag(c, conn, ACK);
> +			if (tcp_send_flag(c, conn, ACK))
> +				goto reset;
> +
>  			conn_event(c, conn, SOCK_FIN_SENT);
>  
>  			return 1;
> @@ -2388,7 +2399,9 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
>  		}
>  
>  		conn_event(c, conn, SOCK_FIN_SENT);
> -		tcp_send_flag(c, conn, ACK);
> +		if (tcp_send_flag(c, conn, ACK))
> +			goto reset;
> +
>  		ack_due = 0;
>  
>  		/* If we received a FIN, but the socket is in TCP_ESTABLISHED
> @@ -2478,7 +2491,11 @@ static void tcp_tap_conn_from_sock(const struct ctx *c, union flow *flow,
>  
>  	conn->wnd_from_tap = WINDOW_DEFAULT;
>  
> -	tcp_send_flag(c, conn, SYN);
> +	if (tcp_send_flag(c, conn, SYN)) {
> +		conn_flag(c, conn, CLOSING);

I would wait for David to confirm, but I'm fairly sure that this needs
FLOW_ACTIVATE(conn); before returning, just like in the other error path
of this function, because otherwise we'll leave the newly created flow
in an "incomplete" state.

Due to flow table restrictions we adopted to keep the implementation
simple (see "Theory of Operation - allocating and freeing flow entries"
in flow.c), quoting from the documentation to enum flow_state in
flow.h:

 *        Caveats:
 *            - At most one entry may be NEW, INI, TGT or TYPED at a time, so
 *              it's unsafe to use flow_alloc() again until this entry moves to
 *              ACTIVE or FREE

so, if we create a second connection within the same epoll cycle (for
example by calling tcp_tap_conn_from_sock() again), we'll now have two
entries in state TYPED, which breaks this assumption, and things will

David, I think this isn't documented very obviously, even though it's
all there in flow.h. This just occurred to me because of commit
52419a64f2df ("migrate, tcp: Don't flow_alloc_cancel() during incoming
migration") but we can't expect others to know about past commits.

I wonder if you could think of a quick way to make this more prominent...
should we perhaps state return conditions in functions, like you already
added for isolation.c?

> +		return;
> +	}
> +
>  	conn_flag(c, conn, ACK_FROM_TAP_DUE);
>  
>  	tcp_get_sndbuf(conn);
> @@ -2585,7 +2602,10 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref)
>  		return;
>  
>  	if (conn->flags & ACK_TO_TAP_DUE) {
> -		tcp_send_flag(c, conn, ACK_IF_NEEDED);
> +		if (tcp_send_flag(c, conn, ACK_IF_NEEDED)) {
> +			tcp_rst(c, conn);
> +			return;
> +		}
>  		tcp_timer_ctl(c, conn);
>  	} else if (conn->flags & ACK_FROM_TAP_DUE) {
>  		if (!(conn->events & ESTABLISHED)) {
> @@ -2598,7 +2618,10 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref)
>  				tcp_rst(c, conn);
>  			} else {
>  				flow_trace(conn, "SYN timeout, retry");
> -				tcp_send_flag(c, conn, SYN);
> +				if (tcp_send_flag(c, conn, SYN)) {
> +					tcp_rst(c, conn);
> +					return;
> +				}
>  				conn->retries++;
>  				conn_flag(c, conn, SYN_RETRIED);
>  				tcp_timer_ctl(c, conn);
> @@ -2662,8 +2685,11 @@ void tcp_sock_handler(const struct ctx *c, union epoll_ref ref,
>  			tcp_data_from_sock(c, conn);
>  
>  		if (events & EPOLLOUT) {
> -			if (tcp_update_seqack_wnd(c, conn, false, NULL))
> -				tcp_send_flag(c, conn, ACK);
> +			if (tcp_update_seqack_wnd(c, conn, false, NULL) &&
> +			    tcp_send_flag(c, conn, ACK)) {
> +				tcp_rst(c, conn);
> +				return;
> +			}
>  		}
>  
>  		return;
> @@ -2903,7 +2929,8 @@ static void tcp_keepalive(struct ctx *c, const struct timespec *now)
>  		if (conn->tap_inactive) {
>  			flow_dbg(conn, "No tap activity for least %us, send keepalive",
>  				 KEEPALIVE_INTERVAL);
> -			tcp_send_flag(c, conn, KEEPALIVE);
> +			if (tcp_send_flag(c, conn, KEEPALIVE))
> +				tcp_rst(c, conn);
>  		}
>  
>  		/* Ready to check fot next interval */
> @@ -3926,7 +3953,9 @@ int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd
>  	if (tcp_set_peek_offset(conn, peek_offset))
>  		goto fail;
>  
> -	tcp_send_flag(c, conn, ACK);
> +	if (tcp_send_flag(c, conn, ACK))
> +		goto fail;
> +
>  	tcp_data_from_sock(c, conn);
>  
>  	if ((rc = tcp_epoll_ctl(conn))) {

-- 
Stefano


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH v1] tcp: Handle errors from tcp_send_flag()
  2026-04-15 19:38 ` Stefano Brivio
@ 2026-04-20  2:47   ` David Gibson
  2026-04-20 23:41     ` Stefano Brivio
  0 siblings, 1 reply; 6+ messages in thread
From: David Gibson @ 2026-04-20  2:47 UTC (permalink / raw)
  To: Stefano Brivio; +Cc: Anshu Kumari, passt-dev, Laurent Vivier

[-- Attachment #1: Type: text/plain, Size: 12490 bytes --]

On Wed, Apr 15, 2026 at 09:38:28PM +0200, Stefano Brivio wrote:
> Nit: v1 in the subject tag is not necessary (not harmful either): if
> there are no version tags, it's implicit we're talking about version 1.
> 
> On Fri, 10 Apr 2026 13:25:39 +0530
> Anshu Kumari <anskuma@redhat.com> wrote:
> 
> > tcp_send_flag() can return error codes from tcp_prepare_flags()
> > failing TCP_INFO, or from failure to collect buffers on the
> > vhost-user path. These errors indicate the connection requires
> > resetting.
> > 
> > Most callers of tcp_send_flag() were ignoring the error code and
> > carrying on as if nothing was wrong. Check the return value at
> > each call site and handle the error appropriately:
> >   - in tcp_data_from_tap(), return -1 so the caller resets
> >   - in tcp_tap_handler(), goto reset
> >   - in tcp_timer_handler()/tcp_sock_handler()/tcp_conn_from_sock_finish(),
> >     call tcp_rst() and return
> >   - in tcp_tap_conn_from_sock(), set CLOSING flag (flow not yet active)
> >   - in tcp_keepalive(), call tcp_rst() and continue the loop
> >   - in tcp_flow_migrate_target_ext(), goto fail
> > 
> > The call in tcp_rst_do() is left unchecked: we are already
> > resetting, and tcp_sock_rst() still needs to run regardless.
> > 
> > Bug: https://bugs.passt.top/show_bug.cgi?id=194
> 
> Nit: we always use Link: tags (CONTRIBUTING.md uses the plural which
> might be a bit confusing, I guess we should fix that), rationale:
> 
>   https://archives.passt.top/passt-dev/20230704132104.48106368@elisabeth/
>   https://archives.passt.top/passt-dev/20251105163137.424a6537@elisabeth/
> 
> But I fix up these tags on merge anyway, no need to re-send (in
> general).
> 
> > Signed-off-by: Anshu Kumari <anskuma@redhat.com>
> > ---
> >  tcp.c | 59 ++++++++++++++++++++++++++++++++++++++++++++---------------
> >  1 file changed, 44 insertions(+), 15 deletions(-)
> > 
> > diff --git a/tcp.c b/tcp.c
> > index 8ea9be8..9ce671a 100644
> > --- a/tcp.c
> > +++ b/tcp.c
> > @@ -1917,7 +1917,9 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn,
> >  				   "keep-alive sequence: %u, previous: %u",
> >  				   seq, conn->seq_from_tap);
> >  
> > -			tcp_send_flag(c, conn, ACK);
> > +			if (tcp_send_flag(c, conn, ACK))
> > +				return -1;
> 
> A general comment: in _some_ of these cases where we fail to send ACK
> segments, I intentionally didn't check for errors and let the
> connection live on, because that looked like the most graceful failure
> handling to me.
> 
> After all, ACK segments without data are not assumed to be reliably
> transmitted (RFC 9293, 3.8.4), so, given that failing to send some
> should have a similar outcome as the peer missing some, I guess we're
> always expected to recover from a situation like that.
> 
> This doesn't apply to other occurrences below where we fail to send a
> SYN segment or where failure to send ACK segments might mean we are in
> some expected state (including a connection that might get stuck
> forever).
> 
> But reading David's description of bug #194, I wonder if he had
> something else in mind. That is, I don't have a strong preference
> against resetting the connection whenever we fail to prepare buffers,
> but in many of these cases we don't really _have to_ reset the
> connection. David, do you see this differently?

That's a good point, which I didn't think about when I reviewed.  I
don't recall if I'd considered it when I filed the bug.

So, in principle, you're right; failing to send an ack is equivalent
to losing an ack en route, which generally shouldn't cause an
immediate reset of the connection in most cases.

But... that's complicated by the details of what errors can actually
occur here.  AFAICT there are only three cases:

 1) tcp_vu_send_flag() returns -1 because vu_collect() returned 0

IIUC this means we ran out of buffers.  We probably shouldn't reset in
this case.

 2) tcp_vu_send_flag() returns -1 because vu_collect() returned > 1

This means the guest buffer layout wasn't what we expected.  We
probably shouldn't reset in this case, but I believe it will go away
with Laurent's pending multi-buffer vu patches anyway.  So we can
probably ignore this one.

[Also, we according to docs and other usage we should return an errno
 in both these cases, not a bare -1]

 3) tcp_prepare_flags() returns -ECONNRESET

This only happens if TCP_INFO fails.  In this case we *should* reset -
in fact we already set CLOSED in the connection events.  Essentially,
we've concluded the socket side is irretrievably broken, so we should
shut down the tap side as well.

So I think we have two options here:

A) tcp_send_flag() reports only "fatal" errors

Simpler to do, but possibly confusing.  We'd need to

    A1) Change tcp_vu_send_flag() not to report an error if
        vu_collect() returns 0.  AIUI, this case is more or less
	equivalent to filling the tap socket queue, and we don't
	report that case as an error in tcp_buf_send_flag().
	tcp_payload_flush() reverts sequences in this case, but
	doesn't report the error any further.

    A2) That leaves only the TCP_INFO failure case, so we can and
        should reset on any failure from tcp_send_flag(), just like
	this draft

B) tcp_send_flag() reports both transient and fatal errors

More complex, but maybe less surprising semantics?  tcp_send_flag()
would need to use different return codes for the different cases
(maybe ECONNRESET vs. EBUSY?)

    B1) For consistency we should propagate failures from
        tcp_payload_flush(), through tcp_buf_send_flag() to its
	callers,
    B2) We should only reset if tcp_send_flag() returns -ECONNRESET

> > +
> >  			tcp_timer_ctl(c, conn);
> >  
> >  			if (setsockopt(conn->sock, SOL_SOCKET, SO_KEEPALIVE,
> > @@ -2043,14 +2045,16 @@ eintr:
> >  			 *   Then swiftly looked away and left.
> >  			 */
> >  			conn->seq_from_tap = seq_from_tap;
> > -			tcp_send_flag(c, conn, ACK);
> > +			if (tcp_send_flag(c, conn, ACK))
> > +				return -1;
> >  		}
> >  
> >  		if (errno == EINTR)
> >  			goto eintr;
> >  
> >  		if (errno == EAGAIN || errno == EWOULDBLOCK) {
> > -			tcp_send_flag(c, conn, ACK | DUP_ACK);
> > +			if (tcp_send_flag(c, conn, ACK | DUP_ACK))
> > +				return -1;
> >  			return p->count - idx;
> >  
> >  		}
> > @@ -2070,7 +2074,8 @@ out:
> >  		 */
> >  		if (conn->seq_dup_ack_approx != (conn->seq_from_tap & 0xff)) {
> >  			conn->seq_dup_ack_approx = conn->seq_from_tap & 0xff;
> > -			tcp_send_flag(c, conn, ACK | DUP_ACK);
> > +			if (tcp_send_flag(c, conn, ACK | DUP_ACK))
> > +				return -1;
> >  		}
> >  		return p->count - idx;
> >  	}
> > @@ -2084,7 +2089,8 @@ out:
> >  
> >  		conn_event(c, conn, TAP_FIN_RCVD);
> >  	} else {
> > -		tcp_send_flag(c, conn, ACK_IF_NEEDED);
> > +		if (tcp_send_flag(c, conn, ACK_IF_NEEDED))
> > +			return -1;
> >  	}
> >  
> >  	return p->count - idx;
> > @@ -2122,7 +2128,10 @@ static void tcp_conn_from_sock_finish(const struct ctx *c,
> >  		return;
> >  	}
> >  
> > -	tcp_send_flag(c, conn, ACK);
> > +	if (tcp_send_flag(c, conn, ACK)) {
> > +		tcp_rst(c, conn);
> > +		return;
> > +	}
> >  
> >  	/* The client might have sent data already, which we didn't
> >  	 * dequeue waiting for SYN,ACK from tap -- check now.
> > @@ -2308,7 +2317,9 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
> >  				goto reset;
> >  			}
> >  
> > -			tcp_send_flag(c, conn, ACK);
> > +			if (tcp_send_flag(c, conn, ACK))
> > +				goto reset;
> > +
> >  			conn_event(c, conn, SOCK_FIN_SENT);
> >  
> >  			return 1;
> > @@ -2388,7 +2399,9 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
> >  		}
> >  
> >  		conn_event(c, conn, SOCK_FIN_SENT);
> > -		tcp_send_flag(c, conn, ACK);
> > +		if (tcp_send_flag(c, conn, ACK))
> > +			goto reset;
> > +
> >  		ack_due = 0;
> >  
> >  		/* If we received a FIN, but the socket is in TCP_ESTABLISHED
> > @@ -2478,7 +2491,11 @@ static void tcp_tap_conn_from_sock(const struct ctx *c, union flow *flow,
> >  
> >  	conn->wnd_from_tap = WINDOW_DEFAULT;
> >  
> > -	tcp_send_flag(c, conn, SYN);
> > +	if (tcp_send_flag(c, conn, SYN)) {
> > +		conn_flag(c, conn, CLOSING);
> 
> I would wait for David to confirm, but I'm fairly sure that this needs
> FLOW_ACTIVATE(conn); before returning, just like in the other error path
> of this function, because otherwise we'll leave the newly created flow
> in an "incomplete" state.

Ah, yes, I missed that.

> Due to flow table restrictions we adopted to keep the implementation
> simple (see "Theory of Operation - allocating and freeing flow entries"
> in flow.c), quoting from the documentation to enum flow_state in
> flow.h:
> 
>  *        Caveats:
>  *            - At most one entry may be NEW, INI, TGT or TYPED at a time, so
>  *              it's unsafe to use flow_alloc() again until this entry moves to
>  *              ACTIVE or FREE
> 
> so, if we create a second connection within the same epoll cycle (for
> example by calling tcp_tap_conn_from_sock() again), we'll now have two
> entries in state TYPED, which breaks this assumption, and things will

Exactly right.

> David, I think this isn't documented very obviously, even though it's
> all there in flow.h. This just occurred to me because of commit
> 52419a64f2df ("migrate, tcp: Don't flow_alloc_cancel() during incoming
> migration") but we can't expect others to know about past commits.

I agree it's not as clear as I'd like...

> I wonder if you could think of a quick way to make this more prominent...
> should we perhaps state return conditions in functions, like you already
> added for isolation.c?

... but I'm not really sure how to make it more prominent in a useful
way.  Maybe a note in the function header would do the trick?  I'm not
sure.

> 
> > +		return;
> > +	}
> > +
> >  	conn_flag(c, conn, ACK_FROM_TAP_DUE);
> >  
> >  	tcp_get_sndbuf(conn);
> > @@ -2585,7 +2602,10 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref)
> >  		return;
> >  
> >  	if (conn->flags & ACK_TO_TAP_DUE) {
> > -		tcp_send_flag(c, conn, ACK_IF_NEEDED);
> > +		if (tcp_send_flag(c, conn, ACK_IF_NEEDED)) {
> > +			tcp_rst(c, conn);
> > +			return;
> > +		}
> >  		tcp_timer_ctl(c, conn);
> >  	} else if (conn->flags & ACK_FROM_TAP_DUE) {
> >  		if (!(conn->events & ESTABLISHED)) {
> > @@ -2598,7 +2618,10 @@ void tcp_timer_handler(const struct ctx *c, union epoll_ref ref)
> >  				tcp_rst(c, conn);
> >  			} else {
> >  				flow_trace(conn, "SYN timeout, retry");
> > -				tcp_send_flag(c, conn, SYN);
> > +				if (tcp_send_flag(c, conn, SYN)) {
> > +					tcp_rst(c, conn);
> > +					return;
> > +				}
> >  				conn->retries++;
> >  				conn_flag(c, conn, SYN_RETRIED);
> >  				tcp_timer_ctl(c, conn);
> > @@ -2662,8 +2685,11 @@ void tcp_sock_handler(const struct ctx *c, union epoll_ref ref,
> >  			tcp_data_from_sock(c, conn);
> >  
> >  		if (events & EPOLLOUT) {
> > -			if (tcp_update_seqack_wnd(c, conn, false, NULL))
> > -				tcp_send_flag(c, conn, ACK);
> > +			if (tcp_update_seqack_wnd(c, conn, false, NULL) &&
> > +			    tcp_send_flag(c, conn, ACK)) {
> > +				tcp_rst(c, conn);
> > +				return;
> > +			}
> >  		}
> >  
> >  		return;
> > @@ -2903,7 +2929,8 @@ static void tcp_keepalive(struct ctx *c, const struct timespec *now)
> >  		if (conn->tap_inactive) {
> >  			flow_dbg(conn, "No tap activity for least %us, send keepalive",
> >  				 KEEPALIVE_INTERVAL);
> > -			tcp_send_flag(c, conn, KEEPALIVE);
> > +			if (tcp_send_flag(c, conn, KEEPALIVE))
> > +				tcp_rst(c, conn);
> >  		}
> >  
> >  		/* Ready to check fot next interval */
> > @@ -3926,7 +3953,9 @@ int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd
> >  	if (tcp_set_peek_offset(conn, peek_offset))
> >  		goto fail;
> >  
> > -	tcp_send_flag(c, conn, ACK);
> > +	if (tcp_send_flag(c, conn, ACK))
> > +		goto fail;
> > +
> >  	tcp_data_from_sock(c, conn);
> >  
> >  	if ((rc = tcp_epoll_ctl(conn))) {
> 
> -- 
> Stefano
> 

-- 
David Gibson (he or they)	| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you, not the other way
				| around.
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH v1] tcp: Handle errors from tcp_send_flag()
  2026-04-20  2:47   ` David Gibson
@ 2026-04-20 23:41     ` Stefano Brivio
  2026-04-21  4:23       ` David Gibson
  0 siblings, 1 reply; 6+ messages in thread
From: Stefano Brivio @ 2026-04-20 23:41 UTC (permalink / raw)
  To: David Gibson; +Cc: Anshu Kumari, passt-dev, Laurent Vivier

On Mon, 20 Apr 2026 12:47:58 +1000
David Gibson <david@gibson.dropbear.id.au> wrote:

> On Wed, Apr 15, 2026 at 09:38:28PM +0200, Stefano Brivio wrote:
> > Nit: v1 in the subject tag is not necessary (not harmful either): if
> > there are no version tags, it's implicit we're talking about version 1.
> > 
> > On Fri, 10 Apr 2026 13:25:39 +0530
> > Anshu Kumari <anskuma@redhat.com> wrote:
> >   
> > > tcp_send_flag() can return error codes from tcp_prepare_flags()
> > > failing TCP_INFO, or from failure to collect buffers on the
> > > vhost-user path. These errors indicate the connection requires
> > > resetting.
> > > 
> > > Most callers of tcp_send_flag() were ignoring the error code and
> > > carrying on as if nothing was wrong. Check the return value at
> > > each call site and handle the error appropriately:
> > >   - in tcp_data_from_tap(), return -1 so the caller resets
> > >   - in tcp_tap_handler(), goto reset
> > >   - in tcp_timer_handler()/tcp_sock_handler()/tcp_conn_from_sock_finish(),
> > >     call tcp_rst() and return
> > >   - in tcp_tap_conn_from_sock(), set CLOSING flag (flow not yet active)
> > >   - in tcp_keepalive(), call tcp_rst() and continue the loop
> > >   - in tcp_flow_migrate_target_ext(), goto fail
> > > 
> > > The call in tcp_rst_do() is left unchecked: we are already
> > > resetting, and tcp_sock_rst() still needs to run regardless.
> > > 
> > > Bug: https://bugs.passt.top/show_bug.cgi?id=194  
> > 
> > Nit: we always use Link: tags (CONTRIBUTING.md uses the plural which
> > might be a bit confusing, I guess we should fix that), rationale:
> > 
> >   https://archives.passt.top/passt-dev/20230704132104.48106368@elisabeth/
> >   https://archives.passt.top/passt-dev/20251105163137.424a6537@elisabeth/
> > 
> > But I fix up these tags on merge anyway, no need to re-send (in
> > general).
> >   
> > > Signed-off-by: Anshu Kumari <anskuma@redhat.com>
> > > ---
> > >  tcp.c | 59 ++++++++++++++++++++++++++++++++++++++++++++---------------
> > >  1 file changed, 44 insertions(+), 15 deletions(-)
> > > 
> > > diff --git a/tcp.c b/tcp.c
> > > index 8ea9be8..9ce671a 100644
> > > --- a/tcp.c
> > > +++ b/tcp.c
> > > @@ -1917,7 +1917,9 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn,
> > >  				   "keep-alive sequence: %u, previous: %u",
> > >  				   seq, conn->seq_from_tap);
> > >  
> > > -			tcp_send_flag(c, conn, ACK);
> > > +			if (tcp_send_flag(c, conn, ACK))
> > > +				return -1;  
> > 
> > A general comment: in _some_ of these cases where we fail to send ACK
> > segments, I intentionally didn't check for errors and let the
> > connection live on, because that looked like the most graceful failure
> > handling to me.
> > 
> > After all, ACK segments without data are not assumed to be reliably
> > transmitted (RFC 9293, 3.8.4), so, given that failing to send some
> > should have a similar outcome as the peer missing some, I guess we're
> > always expected to recover from a situation like that.
> > 
> > This doesn't apply to other occurrences below where we fail to send a
> > SYN segment or where failure to send ACK segments might mean we are in
> > some expected state (including a connection that might get stuck
> > forever).
> > 
> > But reading David's description of bug #194, I wonder if he had
> > something else in mind. That is, I don't have a strong preference
> > against resetting the connection whenever we fail to prepare buffers,
> > but in many of these cases we don't really _have to_ reset the
> > connection. David, do you see this differently?  
> 
> That's a good point, which I didn't think about when I reviewed.  I
> don't recall if I'd considered it when I filed the bug.
> 
> So, in principle, you're right; failing to send an ack is equivalent
> to losing an ack en route, which generally shouldn't cause an
> immediate reset of the connection in most cases.
> 
> But... that's complicated by the details of what errors can actually
> occur here.  AFAICT there are only three cases:
> 
>  1) tcp_vu_send_flag() returns -1 because vu_collect() returned 0
> 
> IIUC this means we ran out of buffers.  We probably shouldn't reset in
> this case.

Right.

>  2) tcp_vu_send_flag() returns -1 because vu_collect() returned > 1
> 
> This means the guest buffer layout wasn't what we expected.  We
> probably shouldn't reset in this case, but I believe it will go away
> with Laurent's pending multi-buffer vu patches anyway.  So we can
> probably ignore this one.

Yes, that will go away in a bit, so we can ignore it.

> [Also, we according to docs and other usage we should return an errno
>  in both these cases, not a bare -1]
> 
>  3) tcp_prepare_flags() returns -ECONNRESET
> 
> This only happens if TCP_INFO fails.  In this case we *should* reset -
> in fact we already set CLOSED in the connection events.  Essentially,
> we've concluded the socket side is irretrievably broken, so we should
> shut down the tap side as well.

Agreed.

> So I think we have two options here:
> 
> A) tcp_send_flag() reports only "fatal" errors
> 
> Simpler to do, but possibly confusing.  We'd need to

Rather confusing for me indeed.

>     A1) Change tcp_vu_send_flag() not to report an error if
>         vu_collect() returns 0.  AIUI, this case is more or less
> 	equivalent to filling the tap socket queue, and we don't
> 	report that case as an error in tcp_buf_send_flag().
> 	tcp_payload_flush() reverts sequences in this case, but
> 	doesn't report the error any further.
> 
>     A2) That leaves only the TCP_INFO failure case, so we can and
>         should reset on any failure from tcp_send_flag(), just like
> 	this draft

This looks like a stretch and almost a layering violation to me, that
is, we would adapt the semantics of tcp_send_flag() to the caller
instead of making it return what naturally makes sense.

> B) tcp_send_flag() reports both transient and fatal errors
> 
> More complex, but maybe less surprising semantics?

Definitely less surprising I think.

> tcp_send_flag()
> would need to use different return codes for the different cases
> (maybe ECONNRESET vs. EBUSY?)

Or... if we run out of buffers, and we're sending an ACK, "try again":
-EAGAIN.

I was assuming that if we're sending a SYN that's a different case and
we should reset right away... but actually delivery of a SYN isn't
reliable either, and we have retries implemented with proper handling
of the case where we exceed the number of retries.

I think in general we should give ENOBUFS / -ENOBUFS (depending on the
specific convention of the given function) for functions that failed to
provide / reserve buffers, and -EAGAIN if we were trying to send an
ACK... or even just a SYN.

>     B1) For consistency we should propagate failures from
>         tcp_payload_flush(), through tcp_buf_send_flag() to its
> 	callers,
>     B2) We should only reset if tcp_send_flag() returns -ECONNRESET

Or, at this point, on anything that's not -EAGAIN?

> > > +
> > >  			tcp_timer_ctl(c, conn);
> > >  
> > >  			if (setsockopt(conn->sock, SOL_SOCKET, SO_KEEPALIVE,
> > > @@ -2043,14 +2045,16 @@ eintr:
> > >  			 *   Then swiftly looked away and left.
> > >  			 */
> > >  			conn->seq_from_tap = seq_from_tap;
> > > -			tcp_send_flag(c, conn, ACK);
> > > +			if (tcp_send_flag(c, conn, ACK))
> > > +				return -1;
> > >  		}
> > >  
> > >  		if (errno == EINTR)
> > >  			goto eintr;
> > >  
> > >  		if (errno == EAGAIN || errno == EWOULDBLOCK) {
> > > -			tcp_send_flag(c, conn, ACK | DUP_ACK);
> > > +			if (tcp_send_flag(c, conn, ACK | DUP_ACK))
> > > +				return -1;
> > >  			return p->count - idx;
> > >  
> > >  		}
> > > @@ -2070,7 +2074,8 @@ out:
> > >  		 */
> > >  		if (conn->seq_dup_ack_approx != (conn->seq_from_tap & 0xff)) {
> > >  			conn->seq_dup_ack_approx = conn->seq_from_tap & 0xff;
> > > -			tcp_send_flag(c, conn, ACK | DUP_ACK);
> > > +			if (tcp_send_flag(c, conn, ACK | DUP_ACK))
> > > +				return -1;
> > >  		}
> > >  		return p->count - idx;
> > >  	}
> > > @@ -2084,7 +2089,8 @@ out:
> > >  
> > >  		conn_event(c, conn, TAP_FIN_RCVD);
> > >  	} else {
> > > -		tcp_send_flag(c, conn, ACK_IF_NEEDED);
> > > +		if (tcp_send_flag(c, conn, ACK_IF_NEEDED))
> > > +			return -1;
> > >  	}
> > >  
> > >  	return p->count - idx;
> > > @@ -2122,7 +2128,10 @@ static void tcp_conn_from_sock_finish(const struct ctx *c,
> > >  		return;
> > >  	}
> > >  
> > > -	tcp_send_flag(c, conn, ACK);
> > > +	if (tcp_send_flag(c, conn, ACK)) {
> > > +		tcp_rst(c, conn);
> > > +		return;
> > > +	}
> > >  
> > >  	/* The client might have sent data already, which we didn't
> > >  	 * dequeue waiting for SYN,ACK from tap -- check now.
> > > @@ -2308,7 +2317,9 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
> > >  				goto reset;
> > >  			}
> > >  
> > > -			tcp_send_flag(c, conn, ACK);
> > > +			if (tcp_send_flag(c, conn, ACK))
> > > +				goto reset;
> > > +
> > >  			conn_event(c, conn, SOCK_FIN_SENT);
> > >  
> > >  			return 1;
> > > @@ -2388,7 +2399,9 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
> > >  		}
> > >  
> > >  		conn_event(c, conn, SOCK_FIN_SENT);
> > > -		tcp_send_flag(c, conn, ACK);
> > > +		if (tcp_send_flag(c, conn, ACK))
> > > +			goto reset;
> > > +
> > >  		ack_due = 0;
> > >  
> > >  		/* If we received a FIN, but the socket is in TCP_ESTABLISHED
> > > @@ -2478,7 +2491,11 @@ static void tcp_tap_conn_from_sock(const struct ctx *c, union flow *flow,
> > >  
> > >  	conn->wnd_from_tap = WINDOW_DEFAULT;
> > >  
> > > -	tcp_send_flag(c, conn, SYN);
> > > +	if (tcp_send_flag(c, conn, SYN)) {
> > > +		conn_flag(c, conn, CLOSING);  
> > 
> > I would wait for David to confirm, but I'm fairly sure that this needs
> > FLOW_ACTIVATE(conn); before returning, just like in the other error path
> > of this function, because otherwise we'll leave the newly created flow
> > in an "incomplete" state.  
> 
> Ah, yes, I missed that.
> 
> > Due to flow table restrictions we adopted to keep the implementation
> > simple (see "Theory of Operation - allocating and freeing flow entries"
> > in flow.c), quoting from the documentation to enum flow_state in
> > flow.h:
> > 
> >  *        Caveats:
> >  *            - At most one entry may be NEW, INI, TGT or TYPED at a time, so
> >  *              it's unsafe to use flow_alloc() again until this entry moves to
> >  *              ACTIVE or FREE
> > 
> > so, if we create a second connection within the same epoll cycle (for
> > example by calling tcp_tap_conn_from_sock() again), we'll now have two
> > entries in state TYPED, which breaks this assumption, and things will  
> 
> Exactly right.
> 
> > David, I think this isn't documented very obviously, even though it's
> > all there in flow.h. This just occurred to me because of commit
> > 52419a64f2df ("migrate, tcp: Don't flow_alloc_cancel() during incoming
> > migration") but we can't expect others to know about past commits.  
> 
> I agree it's not as clear as I'd like...
> 
> > I wonder if you could think of a quick way to make this more prominent...
> > should we perhaps state return conditions in functions, like you already
> > added for isolation.c?  
> 
> ... but I'm not really sure how to make it more prominent in a useful
> way.  Maybe a note in the function header would do the trick?  I'm not
> sure.

I don't think that's particularly helpful, because we have other
functions that would need that, and we might write more without
noticing.

I think we should, eventually, one day, have another look at the
documentation in flow.c and clearly state manipulation rules that are
otherwise only listed as comments to enums in flow.h.

That is, perhaps a new section with just those caveats, and in more
practical terms: instead of "at most one entry ...", something like
"make sure you call FLOW_ACTIVATE() after ...".

But regardless of that I don't really think it's a priority right now.
Let's keep this for later.

-- 
Stefano


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH v1] tcp: Handle errors from tcp_send_flag()
  2026-04-20 23:41     ` Stefano Brivio
@ 2026-04-21  4:23       ` David Gibson
  0 siblings, 0 replies; 6+ messages in thread
From: David Gibson @ 2026-04-21  4:23 UTC (permalink / raw)
  To: Stefano Brivio; +Cc: Anshu Kumari, passt-dev, Laurent Vivier

[-- Attachment #1: Type: text/plain, Size: 13753 bytes --]

On Tue, Apr 21, 2026 at 01:41:03AM +0200, Stefano Brivio wrote:
> On Mon, 20 Apr 2026 12:47:58 +1000
> David Gibson <david@gibson.dropbear.id.au> wrote:
> 
> > On Wed, Apr 15, 2026 at 09:38:28PM +0200, Stefano Brivio wrote:
> > > Nit: v1 in the subject tag is not necessary (not harmful either): if
> > > there are no version tags, it's implicit we're talking about version 1.
> > > 
> > > On Fri, 10 Apr 2026 13:25:39 +0530
> > > Anshu Kumari <anskuma@redhat.com> wrote:
> > >   
> > > > tcp_send_flag() can return error codes from tcp_prepare_flags()
> > > > failing TCP_INFO, or from failure to collect buffers on the
> > > > vhost-user path. These errors indicate the connection requires
> > > > resetting.
> > > > 
> > > > Most callers of tcp_send_flag() were ignoring the error code and
> > > > carrying on as if nothing was wrong. Check the return value at
> > > > each call site and handle the error appropriately:
> > > >   - in tcp_data_from_tap(), return -1 so the caller resets
> > > >   - in tcp_tap_handler(), goto reset
> > > >   - in tcp_timer_handler()/tcp_sock_handler()/tcp_conn_from_sock_finish(),
> > > >     call tcp_rst() and return
> > > >   - in tcp_tap_conn_from_sock(), set CLOSING flag (flow not yet active)
> > > >   - in tcp_keepalive(), call tcp_rst() and continue the loop
> > > >   - in tcp_flow_migrate_target_ext(), goto fail
> > > > 
> > > > The call in tcp_rst_do() is left unchecked: we are already
> > > > resetting, and tcp_sock_rst() still needs to run regardless.
> > > > 
> > > > Bug: https://bugs.passt.top/show_bug.cgi?id=194  
> > > 
> > > Nit: we always use Link: tags (CONTRIBUTING.md uses the plural which
> > > might be a bit confusing, I guess we should fix that), rationale:
> > > 
> > >   https://archives.passt.top/passt-dev/20230704132104.48106368@elisabeth/
> > >   https://archives.passt.top/passt-dev/20251105163137.424a6537@elisabeth/
> > > 
> > > But I fix up these tags on merge anyway, no need to re-send (in
> > > general).
> > >   
> > > > Signed-off-by: Anshu Kumari <anskuma@redhat.com>
> > > > ---
> > > >  tcp.c | 59 ++++++++++++++++++++++++++++++++++++++++++++---------------
> > > >  1 file changed, 44 insertions(+), 15 deletions(-)
> > > > 
> > > > diff --git a/tcp.c b/tcp.c
> > > > index 8ea9be8..9ce671a 100644
> > > > --- a/tcp.c
> > > > +++ b/tcp.c
> > > > @@ -1917,7 +1917,9 @@ static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn,
> > > >  				   "keep-alive sequence: %u, previous: %u",
> > > >  				   seq, conn->seq_from_tap);
> > > >  
> > > > -			tcp_send_flag(c, conn, ACK);
> > > > +			if (tcp_send_flag(c, conn, ACK))
> > > > +				return -1;  
> > > 
> > > A general comment: in _some_ of these cases where we fail to send ACK
> > > segments, I intentionally didn't check for errors and let the
> > > connection live on, because that looked like the most graceful failure
> > > handling to me.
> > > 
> > > After all, ACK segments without data are not assumed to be reliably
> > > transmitted (RFC 9293, 3.8.4), so, given that failing to send some
> > > should have a similar outcome as the peer missing some, I guess we're
> > > always expected to recover from a situation like that.
> > > 
> > > This doesn't apply to other occurrences below where we fail to send a
> > > SYN segment or where failure to send ACK segments might mean we are in
> > > some expected state (including a connection that might get stuck
> > > forever).
> > > 
> > > But reading David's description of bug #194, I wonder if he had
> > > something else in mind. That is, I don't have a strong preference
> > > against resetting the connection whenever we fail to prepare buffers,
> > > but in many of these cases we don't really _have to_ reset the
> > > connection. David, do you see this differently?  
> > 
> > That's a good point, which I didn't think about when I reviewed.  I
> > don't recall if I'd considered it when I filed the bug.
> > 
> > So, in principle, you're right; failing to send an ack is equivalent
> > to losing an ack en route, which generally shouldn't cause an
> > immediate reset of the connection in most cases.
> > 
> > But... that's complicated by the details of what errors can actually
> > occur here.  AFAICT there are only three cases:
> > 
> >  1) tcp_vu_send_flag() returns -1 because vu_collect() returned 0
> > 
> > IIUC this means we ran out of buffers.  We probably shouldn't reset in
> > this case.
> 
> Right.
> 
> >  2) tcp_vu_send_flag() returns -1 because vu_collect() returned > 1
> > 
> > This means the guest buffer layout wasn't what we expected.  We
> > probably shouldn't reset in this case, but I believe it will go away
> > with Laurent's pending multi-buffer vu patches anyway.  So we can
> > probably ignore this one.
> 
> Yes, that will go away in a bit, so we can ignore it.
> 
> > [Also, we according to docs and other usage we should return an errno
> >  in both these cases, not a bare -1]
> > 
> >  3) tcp_prepare_flags() returns -ECONNRESET
> > 
> > This only happens if TCP_INFO fails.  In this case we *should* reset -
> > in fact we already set CLOSED in the connection events.  Essentially,
> > we've concluded the socket side is irretrievably broken, so we should
> > shut down the tap side as well.
> 
> Agreed.
> 
> > So I think we have two options here:
> > 
> > A) tcp_send_flag() reports only "fatal" errors
> > 
> > Simpler to do, but possibly confusing.  We'd need to
> 
> Rather confusing for me indeed.
> 
> >     A1) Change tcp_vu_send_flag() not to report an error if
> >         vu_collect() returns 0.  AIUI, this case is more or less
> > 	equivalent to filling the tap socket queue, and we don't
> > 	report that case as an error in tcp_buf_send_flag().
> > 	tcp_payload_flush() reverts sequences in this case, but
> > 	doesn't report the error any further.
> > 
> >     A2) That leaves only the TCP_INFO failure case, so we can and
> >         should reset on any failure from tcp_send_flag(), just like
> > 	this draft
> 
> This looks like a stretch and almost a layering violation to me, that
> is, we would adapt the semantics of tcp_send_flag() to the caller
> instead of making it return what naturally makes sense.

Yeah, maybe.

> > B) tcp_send_flag() reports both transient and fatal errors
> > 
> > More complex, but maybe less surprising semantics?
> 
> Definitely less surprising I think.
> 
> > tcp_send_flag()
> > would need to use different return codes for the different cases
> > (maybe ECONNRESET vs. EBUSY?)
> 
> Or... if we run out of buffers, and we're sending an ACK, "try again":
> -EAGAIN.

Ah, yes EAGAIN is a better choice than EBUSY.

> I was assuming that if we're sending a SYN that's a different case and
> we should reset right away... but actually delivery of a SYN isn't
> reliable either, and we have retries implemented with proper handling
> of the case where we exceed the number of retries.
> 
> I think in general we should give ENOBUFS / -ENOBUFS (depending on the
> specific convention of the given function) for functions that failed to
> provide / reserve buffers, and -EAGAIN if we were trying to send an
> ACK... or even just a SYN.

It seems like a layering violation to me to peer into the contents of
what we're sending to alter the return code.  But I don't think we
need to; I think it makes more sense to always send EAGAIN if the
problem is something we expect to be transient - in practice, running
out of space on the tap side, whether that's buffers in the VU ring or
the tap socket's send buffer.

If a caller really can't tolerate a delay in sending, it still has the
option of aborting on an EAGAIN, but I don't see any reason we'd
actually want that.

> >     B1) For consistency we should propagate failures from
> >         tcp_payload_flush(), through tcp_buf_send_flag() to its
> > 	callers,
> >     B2) We should only reset if tcp_send_flag() returns -ECONNRESET
> 
> Or, at this point, on anything that's not -EAGAIN?

Right.

> > > > +
> > > >  			tcp_timer_ctl(c, conn);
> > > >  
> > > >  			if (setsockopt(conn->sock, SOL_SOCKET, SO_KEEPALIVE,
> > > > @@ -2043,14 +2045,16 @@ eintr:
> > > >  			 *   Then swiftly looked away and left.
> > > >  			 */
> > > >  			conn->seq_from_tap = seq_from_tap;
> > > > -			tcp_send_flag(c, conn, ACK);
> > > > +			if (tcp_send_flag(c, conn, ACK))
> > > > +				return -1;
> > > >  		}
> > > >  
> > > >  		if (errno == EINTR)
> > > >  			goto eintr;
> > > >  
> > > >  		if (errno == EAGAIN || errno == EWOULDBLOCK) {
> > > > -			tcp_send_flag(c, conn, ACK | DUP_ACK);
> > > > +			if (tcp_send_flag(c, conn, ACK | DUP_ACK))
> > > > +				return -1;
> > > >  			return p->count - idx;
> > > >  
> > > >  		}
> > > > @@ -2070,7 +2074,8 @@ out:
> > > >  		 */
> > > >  		if (conn->seq_dup_ack_approx != (conn->seq_from_tap & 0xff)) {
> > > >  			conn->seq_dup_ack_approx = conn->seq_from_tap & 0xff;
> > > > -			tcp_send_flag(c, conn, ACK | DUP_ACK);
> > > > +			if (tcp_send_flag(c, conn, ACK | DUP_ACK))
> > > > +				return -1;
> > > >  		}
> > > >  		return p->count - idx;
> > > >  	}
> > > > @@ -2084,7 +2089,8 @@ out:
> > > >  
> > > >  		conn_event(c, conn, TAP_FIN_RCVD);
> > > >  	} else {
> > > > -		tcp_send_flag(c, conn, ACK_IF_NEEDED);
> > > > +		if (tcp_send_flag(c, conn, ACK_IF_NEEDED))
> > > > +			return -1;
> > > >  	}
> > > >  
> > > >  	return p->count - idx;
> > > > @@ -2122,7 +2128,10 @@ static void tcp_conn_from_sock_finish(const struct ctx *c,
> > > >  		return;
> > > >  	}
> > > >  
> > > > -	tcp_send_flag(c, conn, ACK);
> > > > +	if (tcp_send_flag(c, conn, ACK)) {
> > > > +		tcp_rst(c, conn);
> > > > +		return;
> > > > +	}
> > > >  
> > > >  	/* The client might have sent data already, which we didn't
> > > >  	 * dequeue waiting for SYN,ACK from tap -- check now.
> > > > @@ -2308,7 +2317,9 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
> > > >  				goto reset;
> > > >  			}
> > > >  
> > > > -			tcp_send_flag(c, conn, ACK);
> > > > +			if (tcp_send_flag(c, conn, ACK))
> > > > +				goto reset;
> > > > +
> > > >  			conn_event(c, conn, SOCK_FIN_SENT);
> > > >  
> > > >  			return 1;
> > > > @@ -2388,7 +2399,9 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
> > > >  		}
> > > >  
> > > >  		conn_event(c, conn, SOCK_FIN_SENT);
> > > > -		tcp_send_flag(c, conn, ACK);
> > > > +		if (tcp_send_flag(c, conn, ACK))
> > > > +			goto reset;
> > > > +
> > > >  		ack_due = 0;
> > > >  
> > > >  		/* If we received a FIN, but the socket is in TCP_ESTABLISHED
> > > > @@ -2478,7 +2491,11 @@ static void tcp_tap_conn_from_sock(const struct ctx *c, union flow *flow,
> > > >  
> > > >  	conn->wnd_from_tap = WINDOW_DEFAULT;
> > > >  
> > > > -	tcp_send_flag(c, conn, SYN);
> > > > +	if (tcp_send_flag(c, conn, SYN)) {
> > > > +		conn_flag(c, conn, CLOSING);  
> > > 
> > > I would wait for David to confirm, but I'm fairly sure that this needs
> > > FLOW_ACTIVATE(conn); before returning, just like in the other error path
> > > of this function, because otherwise we'll leave the newly created flow
> > > in an "incomplete" state.  
> > 
> > Ah, yes, I missed that.
> > 
> > > Due to flow table restrictions we adopted to keep the implementation
> > > simple (see "Theory of Operation - allocating and freeing flow entries"
> > > in flow.c), quoting from the documentation to enum flow_state in
> > > flow.h:
> > > 
> > >  *        Caveats:
> > >  *            - At most one entry may be NEW, INI, TGT or TYPED at a time, so
> > >  *              it's unsafe to use flow_alloc() again until this entry moves to
> > >  *              ACTIVE or FREE
> > > 
> > > so, if we create a second connection within the same epoll cycle (for
> > > example by calling tcp_tap_conn_from_sock() again), we'll now have two
> > > entries in state TYPED, which breaks this assumption, and things will  
> > 
> > Exactly right.
> > 
> > > David, I think this isn't documented very obviously, even though it's
> > > all there in flow.h. This just occurred to me because of commit
> > > 52419a64f2df ("migrate, tcp: Don't flow_alloc_cancel() during incoming
> > > migration") but we can't expect others to know about past commits.  
> > 
> > I agree it's not as clear as I'd like...
> > 
> > > I wonder if you could think of a quick way to make this more prominent...
> > > should we perhaps state return conditions in functions, like you already
> > > added for isolation.c?  
> > 
> > ... but I'm not really sure how to make it more prominent in a useful
> > way.  Maybe a note in the function header would do the trick?  I'm not
> > sure.
> 
> I don't think that's particularly helpful, because we have other
> functions that would need that, and we might write more without
> noticing.
> 
> I think we should, eventually, one day, have another look at the
> documentation in flow.c and clearly state manipulation rules that are
> otherwise only listed as comments to enums in flow.h.
> 
> That is, perhaps a new section with just those caveats, and in more
> practical terms: instead of "at most one entry ...", something like
> "make sure you call FLOW_ACTIVATE() after ...".
> 
> But regardless of that I don't really think it's a priority right now.
> Let's keep this for later.

I concur.

-- 
David Gibson (he or they)	| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you, not the other way
				| around.
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2026-04-21  4:23 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2026-04-10  7:55 [PATCH v1] tcp: Handle errors from tcp_send_flag() Anshu Kumari
2026-04-13  6:22 ` David Gibson
2026-04-15 19:38 ` Stefano Brivio
2026-04-20  2:47   ` David Gibson
2026-04-20 23:41     ` Stefano Brivio
2026-04-21  4:23       ` David Gibson

Code repositories for project(s) associated with this public inbox

	https://passt.top/passt

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for IMAP folder(s).