public inbox for passt-dev@passt.top
 help / color / mirror / code / Atom feed
* [PATCH] treewide: By default, don't quit source after migration, keep sockets open
@ 2025-07-21 22:12 Stefano Brivio
  2025-07-22  0:33 ` David Gibson
  0 siblings, 1 reply; 6+ messages in thread
From: Stefano Brivio @ 2025-07-21 22:12 UTC (permalink / raw)
  To: passt-dev; +Cc: David Gibson, Nir Dothan

We are hitting an issue in the KubeVirt integration where some data is
still sent to the source instance even after migration is complete. As
we exit, the kernel closes our sockets and resets connections. The
resulting RST segments are sent to peers, effectively terminating
connections that were meanwhile migrated.

At the moment, this traffic reaches the source unintentionally, but in
the future KubeVirt might enable OVN-Kubernetes features where source
and destination nodes explicitly get mirrored traffic for a while, in
order to decrease migration downtime.

By default, don't quit after migration is completed on the source: the
previous behaviour can be enabled with the new --migrate-exit option.

Also, by default, keep migrated TCP sockets open (in repair mode) as
long as we're running, and ignore events on any epoll descriptor
representing data channels. The previous (and, strictly speaking,
correct) behaviour can be enabled with the new --migrate-no-linger
option.

Reported-by: Nir Dothan <ndothan@redhat.com>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 conf.c         | 23 +++++++++++++++++++++++
 epoll_type.h   | 12 ++++++++----
 flow.c         |  2 +-
 passt.1        | 14 ++++++++++++++
 passt.c        |  6 +++++-
 passt.h        |  6 ++++++
 tcp.c          |  7 +++++--
 tcp_conn.h     |  3 ++-
 test/lib/setup |  4 ++--
 vhost_user.c   | 10 ++++++++--
 10 files changed, 74 insertions(+), 13 deletions(-)

diff --git a/conf.c b/conf.c
index 6c747aa..5e69014 100644
--- a/conf.c
+++ b/conf.c
@@ -864,6 +864,12 @@ static void usage(const char *name, FILE *f, int status)
 		FPRINTF(f,
 			"  --repair-path PATH	path for passt-repair(1)\n"
 			"    default: append '.repair' to UNIX domain path\n");
+		FPRINTF(f,
+			"  --migrate-exit	source quits after migration\n"
+			"    default: source keeps running after migration\n");
+		FPRINTF(f,
+			"  --migrate-no-linger	close sockets on migration\n"
+			"    default: keep sockets open, ignore data events\n");
 	}
 
 	FPRINTF(f,
@@ -1470,6 +1476,8 @@ void conf(struct ctx *c, int argc, char **argv)
 		{"socket-path",	required_argument,	NULL,		's' },
 		{"fqdn",	required_argument,	NULL,		27 },
 		{"repair-path",	required_argument,	NULL,		28 },
+		{"migrate-exit", no_argument,		NULL,		29 },
+		{"migrate-no-linger", no_argument,	NULL,		30 },
 		{ 0 },
 	};
 	const char *optstring = "+dqfel:hs:F:I:p:P:m:a:n:M:g:i:o:D:S:H:461t:u:T:U:";
@@ -1495,6 +1503,9 @@ void conf(struct ctx *c, int argc, char **argv)
 		fwd_default = FWD_AUTO;
 	}
 
+	if (c->mode == MODE_VU)
+		c->migrate_linger = true;
+
 	if (tap_l2_max_len(c) - ETH_HLEN < max_mtu)
 		max_mtu = tap_l2_max_len(c) - ETH_HLEN;
 	c->mtu = ROUND_DOWN(max_mtu, sizeof(uint32_t));
@@ -1686,6 +1697,18 @@ void conf(struct ctx *c, int argc, char **argv)
 					   optarg))
 				die("Invalid passt-repair path: %s", optarg);
 
+			break;
+		case 29:
+			if (c->mode != MODE_VU)
+				die("--migrate-exit is for vhost-user mode only");
+			c->migrate_exit = true;
+
+			break;
+		case 30:
+			if (c->mode != MODE_VU)
+				die("--migrate-no-linger is for vhost-user mode only");
+			c->migrate_linger = false;
+
 			break;
 		case 'd':
 			c->debug = 1;
diff --git a/epoll_type.h b/epoll_type.h
index 12ac59b..f2991b6 100644
--- a/epoll_type.h
+++ b/epoll_type.h
@@ -12,6 +12,7 @@
 enum epoll_type {
 	/* Special value to indicate an invalid type */
 	EPOLL_TYPE_NONE = 0,
+
 	/* Connected TCP sockets */
 	EPOLL_TYPE_TCP,
 	/* Connected TCP sockets (spliced) */
@@ -26,16 +27,19 @@ enum epoll_type {
 	EPOLL_TYPE_UDP,
 	/* ICMP/ICMPv6 ping sockets */
 	EPOLL_TYPE_PING,
-	/* inotify fd watching for end of netns (pasta) */
-	EPOLL_TYPE_NSQUIT_INOTIFY,
-	/* timer fd watching for end of netns, fallback for inotify (pasta) */
-	EPOLL_TYPE_NSQUIT_TIMER,
 	/* tuntap character device */
 	EPOLL_TYPE_TAP_PASTA,
 	/* socket connected to qemu  */
 	EPOLL_TYPE_TAP_PASST,
 	/* socket listening for qemu socket connections */
 	EPOLL_TYPE_TAP_LISTEN,
+	/* End of event types involving data transfers or connections */
+	EPOLL_TYPE_DATA_MAX = EPOLL_TYPE_TAP_LISTEN,
+
+	/* inotify fd watching for end of netns (pasta) */
+	EPOLL_TYPE_NSQUIT_INOTIFY,
+	/* timer fd watching for end of netns, fallback for inotify (pasta) */
+	EPOLL_TYPE_NSQUIT_TIMER,
 	/* vhost-user command socket */
 	EPOLL_TYPE_VHOST_CMD,
 	/* vhost-user kick event socket */
diff --git a/flow.c b/flow.c
index 00885f6..feefda3 100644
--- a/flow.c
+++ b/flow.c
@@ -1091,7 +1091,7 @@ int flow_migrate_source(struct ctx *c, const struct migrate_stage *stage,
 	 * as EIO).
 	 */
 	foreach_established_tcp_flow(flow) {
-		rc = tcp_flow_migrate_source_ext(fd, &flow->tcp);
+		rc = tcp_flow_migrate_source_ext(c, fd, &flow->tcp);
 		if (rc) {
 			flow_err(flow, "Can't send extended data: %s",
 				 strerror_(-rc));
diff --git a/passt.1 b/passt.1
index 60066c2..b85aaa0 100644
--- a/passt.1
+++ b/passt.1
@@ -439,6 +439,20 @@ Default, for \-\-vhost-user mode only, is to append \fI.repair\fR to the path
 chosen for the hypervisor UNIX domain socket. No socket is created if not in
 \-\-vhost-user mode.
 
+.TP
+.BR \-\-migrate-exit
+Exit after a completed migration as source. By default, \fBpasst\fR keeps
+running and the migrated guest can continue using its connection, or a new guest
+can connect.
+
+.TP
+.BR \-\-migrate-no-linger
+Close TCP sockets on the source instance once migration completes.
+
+By default, sockets are kept open and events on data sockets are ignored, so
+that any data reaching those sockets after the source has migrated is silently
+ignored instead of causing connection resets.
+
 .TP
 .BR \-F ", " \-\-fd " " \fIFD
 Pass a pre-opened, connected socket to \fBpasst\fR. Usually the socket is opened
diff --git a/passt.c b/passt.c
index 388d10f..f4c108f 100644
--- a/passt.c
+++ b/passt.c
@@ -308,6 +308,9 @@ loop:
 		      c.mode == MODE_PASTA ? "pasta" : "passt",
 		      EPOLL_TYPE_STR(ref.type), ref.fd, eventmask);
 
+		if (c.ignore_data_events && ref.type <= EPOLL_TYPE_DATA_MAX)
+			continue;
+
 		switch (ref.type) {
 		case EPOLL_TYPE_TAP_PASTA:
 			tap_handler_pasta(&c, eventmask, &now);
@@ -363,7 +366,8 @@ loop:
 		}
 	}
 
-	post_handler(&c, &now);
+	if (!c.ignore_data_events)
+		post_handler(&c, &now);
 
 	migrate_handler(&c);
 
diff --git a/passt.h b/passt.h
index 8693794..636b3c3 100644
--- a/passt.h
+++ b/passt.h
@@ -241,6 +241,9 @@ struct ip6_ctx {
  * @device_state_fd:	Device state migration channel
  * @device_state_result: Device state migration result
  * @migrate_target:	Are we the target, on the next migration request?
+ * @migrate_linger:	Keep sockets open after migration, ignore data events
+ * @migrate_exit:	Exit (on source) once migration is complete
+ * @ignore_data_events:	Ignore data events (for migration, source instance)
  */
 struct ctx {
 	enum passt_modes mode;
@@ -318,6 +321,9 @@ struct ctx {
 	int device_state_fd;
 	int device_state_result;
 	bool migrate_target;
+	bool migrate_linger;
+	bool migrate_exit;
+	bool ignore_data_events;
 };
 
 void proto_update_l2_buf(const unsigned char *eth_d,
diff --git a/tcp.c b/tcp.c
index 2b88466..1f7a6ab 100644
--- a/tcp.c
+++ b/tcp.c
@@ -3286,12 +3286,14 @@ int tcp_flow_migrate_source(int fd, struct tcp_tap_conn *conn)
 
 /**
  * tcp_flow_migrate_source_ext() - Dump queues, close sockets, send final data
+ * @c:		Execution context
  * @fd:		Descriptor for state migration
  * @conn:	Pointer to the TCP connection structure
  *
  * Return: 0 on success, negative (not -EIO) on failure, -EIO on sending failure
  */
-int tcp_flow_migrate_source_ext(int fd, const struct tcp_tap_conn *conn)
+int tcp_flow_migrate_source_ext(struct ctx *c,
+				int fd, const struct tcp_tap_conn *conn)
 {
 	uint32_t peek_offset = conn->seq_to_tap - conn->seq_ack_from_tap;
 	struct tcp_tap_transfer_ext *t = &migrate_ext[FLOW_IDX(conn)];
@@ -3336,7 +3338,8 @@ int tcp_flow_migrate_source_ext(int fd, const struct tcp_tap_conn *conn)
 	if ((rc = tcp_flow_dump_seq(conn, &t->seq_rcv)))
 		goto fail;
 
-	close(s);
+	if (!c->migrate_linger)
+		close(s);
 
 	/* Adjustments unrelated to FIN segments: sequence numbers we dumped are
 	 * based on the end of the queues.
diff --git a/tcp_conn.h b/tcp_conn.h
index 35d813d..d49ae88 100644
--- a/tcp_conn.h
+++ b/tcp_conn.h
@@ -236,7 +236,8 @@ int tcp_flow_repair_on(struct ctx *c, const struct tcp_tap_conn *conn);
 int tcp_flow_repair_off(struct ctx *c, const struct tcp_tap_conn *conn);
 
 int tcp_flow_migrate_source(int fd, struct tcp_tap_conn *conn);
-int tcp_flow_migrate_source_ext(int fd, const struct tcp_tap_conn *conn);
+int tcp_flow_migrate_source_ext(struct ctx *c, int fd,
+				const struct tcp_tap_conn *conn);
 
 int tcp_flow_migrate_target(struct ctx *c, int fd);
 int tcp_flow_migrate_target_ext(struct ctx *c, struct tcp_tap_conn *conn, int fd);
diff --git a/test/lib/setup b/test/lib/setup
index 575bc21..5994598 100755
--- a/test/lib/setup
+++ b/test/lib/setup
@@ -350,7 +350,7 @@ setup_migrate() {
 
 	sleep 1
 
-	__opts="--vhost-user"
+	__opts="--vhost-user --migrate-exit --migrate-no-linger"
 	[ ${PCAP} -eq 1 ] && __opts="${__opts} -p ${LOGDIR}/passt_1.pcap"
 	[ ${DEBUG} -eq 1 ] && __opts="${__opts} -d"
 	[ ${TRACE} -eq 1 ] && __opts="${__opts} --trace"
@@ -360,7 +360,7 @@ setup_migrate() {
 
 	context_run_bg passt_repair_1 "./passt-repair ${STATESETUP}/passt_1.socket.repair"
 
-	__opts="--vhost-user"
+	__opts="--vhost-user --migrate-exit --migrate-no-linger"
 	[ ${PCAP} -eq 1 ] && __opts="${__opts} -p ${LOGDIR}/passt_2.pcap"
 	[ ${DEBUG} -eq 1 ] && __opts="${__opts} -d"
 	[ ${TRACE} -eq 1 ] && __opts="${__opts} --trace"
diff --git a/vhost_user.c b/vhost_user.c
index e7fb049..7fd27dc 100644
--- a/vhost_user.c
+++ b/vhost_user.c
@@ -1207,7 +1207,13 @@ void vu_control_handler(struct vu_dev *vdev, int fd, uint32_t events)
 	if (vmsg.hdr.request == VHOST_USER_CHECK_DEVICE_STATE &&
 	    vdev->context->device_state_result == 0 &&
 	    !vdev->context->migrate_target) {
-		info("Migration complete, exiting");
-		_exit(EXIT_SUCCESS);
+		if (vdev->context->migrate_exit) {
+			info("Migration complete, exiting");
+			_exit(EXIT_SUCCESS);
+		}
+
+		info("Migration complete");
+		if (vdev->context->migrate_linger)
+			vdev->context->ignore_data_events = true;
 	}
 }
-- 
@@ -1207,7 +1207,13 @@ void vu_control_handler(struct vu_dev *vdev, int fd, uint32_t events)
 	if (vmsg.hdr.request == VHOST_USER_CHECK_DEVICE_STATE &&
 	    vdev->context->device_state_result == 0 &&
 	    !vdev->context->migrate_target) {
-		info("Migration complete, exiting");
-		_exit(EXIT_SUCCESS);
+		if (vdev->context->migrate_exit) {
+			info("Migration complete, exiting");
+			_exit(EXIT_SUCCESS);
+		}
+
+		info("Migration complete");
+		if (vdev->context->migrate_linger)
+			vdev->context->ignore_data_events = true;
 	}
 }
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2025-07-24  1:50 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-07-21 22:12 [PATCH] treewide: By default, don't quit source after migration, keep sockets open Stefano Brivio
2025-07-22  0:33 ` David Gibson
2025-07-22 21:12   ` Stefano Brivio
2025-07-23  0:27     ` David Gibson
2025-07-23  9:17       ` Stefano Brivio
2025-07-24  1:48         ` David Gibson

Code repositories for project(s) associated with this public inbox

	https://passt.top/passt

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for IMAP folder(s).