From mboxrd@z Thu Jan  1 00:00:00 1970
Received: by passt.top (Postfix, from userid 1000)
	id 5554D5A0283; Wed, 04 Jun 2025 18:37:46 +0200 (CEST)
From: Stefano Brivio <sbrivio@redhat.com>
To: passt-dev@passt.top
Subject: [PATCH] flow, repair: Proper error handling for missing passt-repair helper on target
Date: Wed,  4 Jun 2025 18:37:46 +0200
Message-ID: <20250604163746.240348-1-sbrivio@redhat.com>
X-Mailer: git-send-email 2.43.0
MIME-Version: 1.0
Content-Transfer-Encoding: 8bit
Message-ID-Hash: ZHLERFCVJJ2IIRWLMM3GB7P3ZKH5YI6G
X-Message-ID-Hash: ZHLERFCVJJ2IIRWLMM3GB7P3ZKH5YI6G
X-MailFrom: sbrivio@passt.top
X-Mailman-Rule-Misses: dmarc-mitigation; no-senders; approved; emergency; loop; banned-address; member-moderation; nonmember-moderation; administrivia; implicit-dest; max-recipients; max-size; news-moderation; no-subject; digests; suspicious-header
X-Mailman-Version: 3.3.8
Precedence: list
List-Id: Development discussion and patches for passt <passt-dev.passt.top>
Archived-At: <https://archives.passt.top/passt-dev/20250604163746.240348-1-sbrivio@redhat.com/>
Archived-At: <https://passt.top/hyperkitty/list/passt-dev@passt.top/message/ZHLERFCVJJ2IIRWLMM3GB7P3ZKH5YI6G/>
List-Archive: <https://archives.passt.top/passt-dev/>
List-Archive: <https://passt.top/hyperkitty/list/passt-dev@passt.top/>
List-Help: <mailto:passt-dev-request@passt.top?subject=help>
List-Owner: <mailto:passt-dev-owner@passt.top>
List-Post: <mailto:passt-dev@passt.top>
List-Subscribe: <mailto:passt-dev-join@passt.top>
List-Unsubscribe: <mailto:passt-dev-leave@passt.top>

If the target doesn't have a connected passt-repair helper,
repair_wait() sets a timeout and forcibly calls accept4(), but if the
timeout expires, we don't report any error and proceed as if we had a
connected helper.

Later, in repair_flush(), sendmsg() will fail for each flow we try to
migrate with EBADF, and we'll handle that gracefully, but it makes no
sense to even try, and it also leads to noise in the logs.

Similarly to how we handle a missing repair helper in
flow_migrate_repair_all(), propagate timeout and other errors through
repair_wait(), and don't attempt to migrate flows if repair_wait()
failed.

Fixes: c8b520c0625b ("flow, repair: Wait for a short while for passt-repair to connect")
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 flow.c   |  3 ++-
 repair.c | 38 ++++++++++++++++++++++++++++----------
 repair.h |  4 ++--
 3 files changed, 32 insertions(+), 13 deletions(-)

diff --git a/flow.c b/flow.c
index cc05033..da5c813 100644
--- a/flow.c
+++ b/flow.c
@@ -1130,7 +1130,8 @@ int flow_migrate_target(struct ctx *c, const struct migrate_stage *stage,
 	if (!count)
 		return 0;
 
-	repair_wait(c);
+	if ((rc = repair_wait(c)))
+		return -rc;
 
 	if ((rc = flow_migrate_repair_all(c, true)))
 		return -rc;
diff --git a/repair.c b/repair.c
index 149fe51..f6b1bf3 100644
--- a/repair.c
+++ b/repair.c
@@ -68,18 +68,21 @@ void repair_sock_init(const struct ctx *c)
  * repair_listen_handler() - Handle events on TCP_REPAIR helper listening socket
  * @c:		Execution context
  * @events:	epoll events
+ *
+ * Return: 0 on valid event with new connected socket, error code on failure
  */
-void repair_listen_handler(struct ctx *c, uint32_t events)
+int repair_listen_handler(struct ctx *c, uint32_t events)
 {
 	union epoll_ref ref = { .type = EPOLL_TYPE_REPAIR };
 	struct epoll_event ev = { 0 };
 	struct ucred ucred;
 	socklen_t len;
+	int rc;
 
 	if (events != EPOLLIN) {
 		debug("Spurious event 0x%04x on TCP_REPAIR helper socket",
 		      events);
-		return;
+		return EINVAL;
 	}
 
 	len = sizeof(ucred);
@@ -90,18 +93,19 @@ void repair_listen_handler(struct ctx *c, uint32_t events)
 				      SOCK_NONBLOCK);
 
 		if (discard == -1)
-			return;
+			return errno;
 
 		if (!getsockopt(discard, SOL_SOCKET, SO_PEERCRED, &ucred, &len))
 			info("Discarding TCP_REPAIR helper, PID %i", ucred.pid);
 
 		close(discard);
-		return;
+		return EEXIST;
 	}
 
 	if ((c->fd_repair = accept4(c->fd_repair_listen, NULL, NULL, 0)) < 0) {
+		rc = errno;
 		debug_perror("accept4() on TCP_REPAIR helper listening socket");
-		return;
+		return rc;
 	}
 
 	if (!getsockopt(c->fd_repair, SOL_SOCKET, SO_PEERCRED, &ucred, &len))
@@ -111,10 +115,14 @@ void repair_listen_handler(struct ctx *c, uint32_t events)
 	ev.events = EPOLLHUP | EPOLLET;
 	ev.data.u64 = ref.u64;
 	if (epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_repair, &ev)) {
+		rc = errno;
 		debug_perror("epoll_ctl() on TCP_REPAIR helper socket");
 		close(c->fd_repair);
 		c->fd_repair = -1;
+		return rc;
 	}
+
+	return 0;
 }
 
 /**
@@ -145,29 +153,39 @@ void repair_handler(struct ctx *c, uint32_t events)
 /**
  * repair_wait() - Wait (with timeout) for TCP_REPAIR helper to connect
  * @c:		Execution context
+ *
+ * Return: 0 on success or if already connected, error code on failure
  */
-void repair_wait(struct ctx *c)
+int repair_wait(struct ctx *c)
 {
 	struct timeval tv = { .tv_sec = 0,
 			      .tv_usec = (long)(REPAIR_ACCEPT_TIMEOUT_US) };
+	int rc;
+
 	static_assert(REPAIR_ACCEPT_TIMEOUT_US < 1000 * 1000,
 		      ".tv_usec is greater than 1000 * 1000");
 
-	if (c->fd_repair >= 0 || c->fd_repair_listen == -1)
-		return;
+	if (c->fd_repair >= 0)
+		return 0;
+
+	if (c->fd_repair_listen == -1)
+		return ENOENT;
 
 	if (setsockopt(c->fd_repair_listen, SOL_SOCKET, SO_RCVTIMEO,
 		       &tv, sizeof(tv))) {
+		rc = errno;
 		err_perror("Set timeout on TCP_REPAIR listening socket");
-		return;
+		return rc;
 	}
 
-	repair_listen_handler(c, EPOLLIN);
+	rc = repair_listen_handler(c, EPOLLIN);
 
 	tv.tv_usec = 0;
 	if (setsockopt(c->fd_repair_listen, SOL_SOCKET, SO_RCVTIMEO,
 		       &tv, sizeof(tv)))
 		err_perror("Clear timeout on TCP_REPAIR listening socket");
+
+	return rc;
 }
 
 /**
diff --git a/repair.h b/repair.h
index 1d37922..ab27e67 100644
--- a/repair.h
+++ b/repair.h
@@ -7,10 +7,10 @@
 #define REPAIR_H
 
 void repair_sock_init(const struct ctx *c);
-void repair_listen_handler(struct ctx *c, uint32_t events);
+int repair_listen_handler(struct ctx *c, uint32_t events);
 void repair_handler(struct ctx *c, uint32_t events);
 void repair_close(struct ctx *c);
-void repair_wait(struct ctx *c);
+int repair_wait(struct ctx *c);
 int repair_flush(struct ctx *c);
 int repair_set(struct ctx *c, int s, int cmd);
 
-- 
2.43.0