From: Stefano Brivio <sbrivio@redhat.com>
To: passt-dev@passt.top
Cc: Laurent Vivier <lvivier@redhat.com>,
	David Gibson <david@gibson.dropbear.id.au>
Subject: [PATCH v3 20/20] Implement target side of migration
Date: Fri, 31 Jan 2025 20:39:53 +0100
Message-ID: <20250131193953.3034031-21-sbrivio@redhat.com>
In-Reply-To: <20250131193953.3034031-1-sbrivio@redhat.com>

It's draft quality, with a number of hacks, and it will need a partial
rewrite. Add:

- flow_migrate_target_post(), to open target-side sockets and bind
  them, switch them to repair mode, connect them, and make them leave
  repair mode again (a condensed sketch of that ordering follows this
  list)

- copies of flow table, 'flow_first_free' pointer, related hash table,
  and hash secret. The copy of the hash secret shows that the current
  declarative approach to data sections has some drawbacks
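
The ordering from the first item above, condensed into a sketch: error
handling, the sequence and option restore steps, and the passt-repair
mediation of TCP_REPAIR (the helper introduced earlier in this series)
are all left out, and restore_one() is just an illustrative name:

#include <sys/socket.h>
#include <netinet/in.h>		/* IPPROTO_TCP */
#include <linux/tcp.h>		/* TCP_REPAIR, TCP_REPAIR_ON, TCP_REPAIR_OFF */

static int restore_one(const struct sockaddr *local,
		       const struct sockaddr *peer, socklen_t alen)
{
	int one = 1, on = TCP_REPAIR_ON, off = TCP_REPAIR_OFF;
	int s = socket(local->sa_family, SOCK_STREAM, IPPROTO_TCP);

	if (s < 0)
		return -1;

	/* The source socket on the same host might be in TIME_WAIT */
	setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one));

	bind(s, local, alen);
	setsockopt(s, IPPROTO_TCP, TCP_REPAIR, &on, sizeof(on));
	/* ...restore queue sequences here, then: */
	connect(s, peer, alen);		/* no handshake in repair mode */
	/* ...restore TCP options, add to epoll, arm timers, then: */
	setsockopt(s, IPPROTO_TCP, TCP_REPAIR, &off, sizeof(off));

	return s;
}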

Change tcp_flow_dump_seq() into tcp_flow_repair_seq(), which can
restore sequences as well as dump them (restoring happens on the
target, before sockets are connected).
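
The dump/restore itself boils down to selecting a queue with
TCP_REPAIR_QUEUE and then reading or writing TCP_QUEUE_SEQ. A minimal,
illustrative version (not the exact code in this patch; it assumes
<linux/tcp.h> for the repair constants):

#include <sys/socket.h>
#include <netinet/in.h>		/* IPPROTO_TCP */
#include <linux/tcp.h>		/* TCP_REPAIR_QUEUE, TCP_QUEUE_SEQ, ... */
#include <errno.h>
#include <stdint.h>

/* The socket must already be in repair mode */
static int queue_seq(int s, int queue, uint32_t *seq, int set)
{
	socklen_t len = sizeof(*seq);

	/* Select which queue TCP_QUEUE_SEQ refers to: send or receive */
	if (setsockopt(s, IPPROTO_TCP, TCP_REPAIR_QUEUE, &queue,
		       sizeof(queue)))
		return -errno;

	if (set)	/* target: set the sequence before connect() */
		return setsockopt(s, IPPROTO_TCP, TCP_QUEUE_SEQ, seq,
				  len) ? -errno : 0;

	/* source: dump the sequence from the frozen connection */
	return getsockopt(s, IPPROTO_TCP, TCP_QUEUE_SEQ, seq,
			  &len) ? -errno : 0;
}

Calling this with TCP_SEND_QUEUE and TCP_RECV_QUEUE in turn is all the
source needs to record, and the target to replay, the two sequence
numbers.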

Once we connect sockets, and before we take them out of repair mode,
we need to restore the MSS and window scaling information that would
normally be determined by TCP options during the handshake. I'm using
hardcoded values for now, as we don't yet have a way to transfer these
bits of socket-side information.
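
In isolation, that step looks roughly like the sketch below. The
struct tcp_repair_opt and TCP_REPAIR_OPTIONS definitions come from
<linux/tcp.h>; the TCPOPT_* kinds are spelled out explicitly here so
the fragment doesn't also need <netinet/tcp.h>:

#include <sys/socket.h>
#include <netinet/in.h>		/* IPPROTO_TCP */
#include <linux/tcp.h>		/* TCP_REPAIR_OPTIONS, struct tcp_repair_opt */

#ifndef TCPOPT_MAXSEG
#define TCPOPT_MAXSEG	2	/* TCP option kind: maximum segment size */
#endif
#ifndef TCPOPT_WINDOW
#define TCPOPT_WINDOW	3	/* TCP option kind: window scale */
#endif

/* Socket 's' is connected but still in repair mode */
static int repair_options(int s)
{
	struct tcp_repair_opt opts[2] = {
		{ TCPOPT_WINDOW, 8 + (8 << 16) },  /* scale 8, both ways */
		{ TCPOPT_MAXSEG, 65495 },	   /* typical loopback MSS */
	};

	return setsockopt(s, IPPROTO_TCP, TCP_REPAIR_OPTIONS,
			  opts, sizeof(opts)) ? -1 : 0;
}

On top of this, a seamless window restore also needs TCP_REPAIR_WINDOW
with a dumped struct tcp_repair_window, as the FIXME in the code below
notes.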

Before we turn repair mode off, add sockets to the epoll list and set
up per-socket timerfd descriptors, with initial timer scheduling.
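
Stripped of passt's own wrappers (tcp_epoll_ctl() and the timer
handling), the per-socket timer mechanism is just timerfd plus epoll,
roughly:

#include <sys/timerfd.h>
#include <sys/epoll.h>
#include <time.h>
#include <unistd.h>

/* Sketch only: arm a one-shot timer for a connection and watch it via
 * epoll; 'conn_ref' stands for whatever reference the event loop wants
 * back, and the 2s expiry is an arbitrary example.
 */
static int arm_conn_timer(int epollfd, void *conn_ref)
{
	struct itimerspec it = { { 0, 0 }, { 2, 0 } };	/* no interval, 2s */
	struct epoll_event ev = { .events = EPOLLIN };
	int tfd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK);

	if (tfd < 0)
		return -1;

	ev.data.ptr = conn_ref;

	if (timerfd_settime(tfd, 0, &it, NULL) ||
	    epoll_ctl(epollfd, EPOLL_CTL_ADD, tfd, &ev)) {
		close(tfd);
		return -1;
	}

	return tfd;
}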

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 flow.c     |  46 ++++++++++++++++++-
 flow.h     |   1 +
 migrate.c  |   9 ++++
 passt.c    |   4 ++
 passt.h    |   2 +
 tcp.c      | 132 +++++++++++++++++++++++++++++++++++++++++++++++++----
 tcp_conn.h |   4 +-
 7 files changed, 187 insertions(+), 11 deletions(-)

diff --git a/flow.c b/flow.c
index 506cbac..8fcf8c4 100644
--- a/flow.c
+++ b/flow.c
@@ -907,12 +907,56 @@ int flow_migrate_source_pre(struct ctx *c, struct migrate_meta *m)
 			i += flow->free.n - 1;
 		else if (flow->f.state == FLOW_STATE_ACTIVE &&
 			 flow->f.type == FLOW_TCP)
-			tcp_flow_dump_seq(c, &flow->tcp);
+			tcp_flow_repair_seq(c, &flow->tcp, false);
 	}
 
 	return 0;
 }
 
+/**
+ * flow_migrate_target_post() - Restore all flows after migration
+ * @c:		Execution context
+ * @m:		Migration metadata
+ *
+ * Return: 0 on success
+ */
+int flow_migrate_target_post(struct ctx *c, struct migrate_meta *m)
+{
+	unsigned i;
+	int rc = 0;
+
+	(void)m;
+
+	for (i = 0; i < FLOW_MAX; i++) {	/* TODO: iterator with skip */
+		union flow *flow = &flowtab[i];
+
+		if (flow->f.state == FLOW_STATE_FREE)
+			i += flow->free.n - 1;
+		else if (flow->f.state == FLOW_STATE_ACTIVE &&
+			 flow->f.type == FLOW_TCP)
+			rc = tcp_flow_repair_socket(c, &flow->tcp);
+
+		if (rc)
+			return rc;		/* TODO: rollback */
+	}
+
+	repair_flush(c);			/* TODO: move to TCP logic */
+
+	for (i = 0; i < FLOW_MAX; i++) {	/* TODO: iterator with skip */
+		union flow *flow = &flowtab[i];
+
+		if (flow->f.state == FLOW_STATE_FREE)
+			i += flow->free.n - 1;
+		else if (flow->f.state == FLOW_STATE_ACTIVE &&
+			 flow->f.type == FLOW_TCP)
+			tcp_flow_repair_connect(c, &flow->tcp);
+	}
+
+	repair_flush(c);			/* TODO: move to TCP logic */
+
+	return 0;
+}
+
 /**
  * flow_init() - Initialise flow related data structures
  */
diff --git a/flow.h b/flow.h
index ff390a6..43fb507 100644
--- a/flow.h
+++ b/flow.h
@@ -256,6 +256,7 @@ union flow;
 void flow_init(void);
 void flow_defer_handler(const struct ctx *c, const struct timespec *now);
 int flow_migrate_source_pre(struct ctx *c, struct migrate_meta *m);
+int flow_migrate_target_post(struct ctx *c, struct migrate_meta *m);
 
 void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...)
 	__attribute__((format(printf, 3, 4)));
diff --git a/migrate.c b/migrate.c
index faa7841..d47c44b 100644
--- a/migrate.c
+++ b/migrate.c
@@ -50,6 +50,12 @@ static union migrate_header header = {
 
 /* Data sections for version 1 */
 static struct iovec sections_v1[] = {
+	{ &header,		sizeof(header) },
+	{ &flow_first_free,	sizeof(flow_first_free) },
+	{ flowtab,		sizeof(flowtab) },
+	{ flow_hashtab,		sizeof(flow_hashtab) },
+	{ g_hash_secret,	sizeof(g_hash_secret) },
+	{ 0 },
 };
 
 /* Set of data versions */
@@ -78,6 +84,7 @@ struct migrate_handler handlers_target_pre_v1[] = {
 
 /* Handlers to call in target after receiving data with version 1 */
 struct migrate_handler handlers_target_post_v1[] = {
+	{ flow_migrate_target_post },
 	{ 0 },
 };
 
@@ -292,6 +299,8 @@ static void migrate_target_post(struct ctx *c, struct migrate_meta *m)
 	struct migrate_target_handlers *th;
 	struct migrate_handler *h;
 
+	memcpy(c->hash_secret, g_hash_secret, sizeof(g_hash_secret));
+
 	for (th = target_handlers; th->v != m->v && th->v; th++);
 
 	for (h = th->post; h->fn; h++)
diff --git a/passt.c b/passt.c
index 1938290..65e9126 100644
--- a/passt.c
+++ b/passt.c
@@ -119,6 +119,8 @@ static void post_handler(struct ctx *c, const struct timespec *now)
 		ndp_timer(c, now);
 }
 
+uint64_t g_hash_secret[2];
+
 /**
  * random_init() - Initialise things based on random data
  * @c:		Execution context
@@ -130,6 +132,8 @@ static void random_init(struct ctx *c)
 	/* Create secret value for SipHash calculations */
 	raw_random(&c->hash_secret, sizeof(c->hash_secret));
 
+	memcpy(g_hash_secret, c->hash_secret, sizeof(g_hash_secret));
+
 	/* Seed pseudo-RNG for things that need non-cryptographic random */
 	raw_random(&seed, sizeof(seed));
 	srandom(seed);
diff --git a/passt.h b/passt.h
index 4189a4a..6010f92 100644
--- a/passt.h
+++ b/passt.h
@@ -317,6 +317,8 @@ struct ctx {
 	bool migrate_target;
 };
 
+extern uint64_t g_hash_secret[2];
+
 void proto_update_l2_buf(const unsigned char *eth_d,
 			 const unsigned char *eth_s);
 
diff --git a/tcp.c b/tcp.c
index 4fd405b..d45edaf 100644
--- a/tcp.c
+++ b/tcp.c
@@ -887,13 +887,31 @@ int tcp_flow_repair_on(struct ctx *c, const struct tcp_tap_conn *conn)
 }
 
 /**
- * tcp_flow_dump_seq() - Dump sequences for send and receive queues
+ * tcp_flow_repair_off() - Clear repair mode for a single TCP flow
  * @c:		Execution context
  * @conn:	Pointer to the TCP connection structure
  *
  * Return: 0 on success, negative error code on failure
  */
-int tcp_flow_dump_seq(struct ctx *c, struct tcp_tap_conn *conn)
+static int tcp_flow_repair_off(struct ctx *c, const struct tcp_tap_conn *conn)
+{
+	int rc = 0;
+
+	if ((rc = repair_set(c, conn->sock, TCP_REPAIR_OFF)))
+		err("Failed to clear TCP_REPAIR");
+
+	return rc;
+}
+
+/**
+ * tcp_flow_repair_seq() - Dump or set sequences for socket queues
+ * @c:		Execution context
+ * @conn:	Pointer to the TCP connection structure
+ * @set:	Set if true, dump if false
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+int tcp_flow_repair_seq(struct ctx *c, struct tcp_tap_conn *conn, bool set)
 {
 	int v, s = conn->sock;
 	socklen_t vlen;
@@ -902,28 +920,124 @@ int tcp_flow_dump_seq(struct ctx *c, struct tcp_tap_conn *conn)
 
 	vlen = sizeof(v);
 
-	v = TCP_SEND_QUEUE;
 	/* TODO: proper error management and prints */
-	if (setsockopt(s, SOL_TCP, TCP_REPAIR_QUEUE, &v, vlen))
-		return -errno;
 
-	if (getsockopt(s, SOL_TCP, TCP_QUEUE_SEQ, &conn->sock_seq_snd, &vlen))
+	v = TCP_SEND_QUEUE;
+	if (setsockopt(s, SOL_TCP, TCP_REPAIR_QUEUE, &v, vlen))
 		return -errno;
 
-	debug("Send queue sequence %u for socket %i", conn->sock_seq_snd, s);
+	if (set) {
+		if (setsockopt(s, SOL_TCP, TCP_QUEUE_SEQ, &conn->sock_seq_snd,
+			       vlen))
+			return -errno;
+		debug("Set send queue sequence for socket %i to %u",
+		      s, conn->sock_seq_snd);
+	} else {
+		if (getsockopt(s, SOL_TCP, TCP_QUEUE_SEQ, &conn->sock_seq_snd,
+			       &vlen))
+			return -errno;
+		debug("Dumped send queue sequence for socket %i: %u",
+		      s, conn->sock_seq_snd);
+	}
 
 	v = TCP_RECV_QUEUE;
 	if (setsockopt(s, SOL_TCP, TCP_REPAIR_QUEUE, &v, vlen))
 		return -errno;
 
-	if (getsockopt(s, SOL_TCP, TCP_QUEUE_SEQ, &conn->sock_seq_rcv, &vlen))
+	if (set) {
+		if (setsockopt(s, SOL_TCP, TCP_QUEUE_SEQ, &conn->sock_seq_rcv,
+			       vlen))
+			return -errno;
+		debug("Set receive queue sequence for socket %i to %u",
+		      s, conn->sock_seq_rcv);
+	} else {
+		if (getsockopt(s, SOL_TCP, TCP_QUEUE_SEQ, &conn->sock_seq_rcv,
+			       &vlen))
+			return -errno;
+		debug("Dumped receive queue sequence for socket %i: %u",
+		      s, conn->sock_seq_rcv);
+	}
+
+	return 0;
+}
+
+/**
+ * tcp_flow_repair_socket() - Open and bind socket, request repair mode
+ * @c:		Execution context
+ * @conn:	Pointer to the TCP connection structure
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+int tcp_flow_repair_socket(struct ctx *c, struct tcp_tap_conn *conn)
+{
+	sa_family_t af = CONN_V4(conn) ? AF_INET : AF_INET6;
+	const struct flowside *sockside = HOSTFLOW(conn);
+	struct sockaddr_in a;
+	int rc;
+
+	a = (struct sockaddr_in){ af, htons(sockside->oport), { 0 }, { 0 } };
+
+	if ((conn->sock = socket(af, SOCK_STREAM, IPPROTO_TCP)) < 0)
+		return -errno;
+
+	/* On the same host, source socket can be in TIME_WAIT */
+	setsockopt(conn->sock, SOL_SOCKET, SO_REUSEADDR,
+		   &((int){ 1 }), sizeof(int));
+
+	if (bind(conn->sock, (struct sockaddr *)&a, sizeof(a)) < 0) {
+		close(conn->sock);
+		conn->sock = -1;
 		return -errno;
+	}
 
-	debug("Receive queue sequence %u for socket %i", conn->sock_seq_rcv, s);
+	rc = tcp_flow_repair_on(c, conn);
+	if (rc) {
+		close(conn->sock);
+		conn->sock = -1;
+		return rc;
+	}
 
 	return 0;
 }
 
+/**
+ * tcp_flow_repair_connect() - Connect sockets in repair mode, then turn it off
+ * @c:		Execution context
+ * @conn:	Pointer to the TCP connection structure
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+int tcp_flow_repair_connect(struct ctx *c, struct tcp_tap_conn *conn)
+{
+	struct flowside *tgt = &conn->f.side[TGTSIDE];
+	struct tcp_repair_opt opts[2];
+
+	tcp_flow_repair_seq(c, conn, true);
+
+	flowside_connect(c, conn->sock, PIF_HOST, tgt);
+
+	/* FIXME: Fetch those with TCP_REPAIR_OPTIONS and store in migration
+	 * data. These hardcoded values just happen to be good enough.
+	 *
+	 * On top of these, to seamlessly restore the window, we also need to
+	 * dump and restore struct tcp_repair_window via TCP_REPAIR_WINDOW.
+	 */
+	opts[0].opt_code = TCPOPT_WINDOW;
+	opts[0].opt_val = 8 + (8 << 16);
+
+	opts[1].opt_code = TCPOPT_MAXSEG;
+	opts[1].opt_val = 65495;
+
+	setsockopt(conn->sock, SOL_TCP, TCP_REPAIR_OPTIONS,
+		   opts, 2 * sizeof(struct tcp_repair_opt));
+
+	conn->in_epoll = 0;
+	conn->timer = -1;
+	tcp_epoll_ctl(c, conn);
+
+	return tcp_flow_repair_off(c, conn);
+}
+
 /**
  * tcp_fill_header() - Fill the TCP header fields for a given TCP segment.
  *
diff --git a/tcp_conn.h b/tcp_conn.h
index 0c3e197..3bf8837 100644
--- a/tcp_conn.h
+++ b/tcp_conn.h
@@ -144,7 +144,9 @@ extern int init_sock_pool6	[TCP_SOCK_POOL_SIZE];
 
 bool tcp_flow_defer(const struct tcp_tap_conn *conn);
 int tcp_flow_repair_on(struct ctx *c, const struct tcp_tap_conn *conn);
-int tcp_flow_dump_seq(struct ctx *c, struct tcp_tap_conn *conn);
+int tcp_flow_repair_seq(struct ctx *c, struct tcp_tap_conn *conn, bool set);
+int tcp_flow_repair_socket(struct ctx *c, struct tcp_tap_conn *conn);
+int tcp_flow_repair_connect(struct ctx *c, struct tcp_tap_conn *conn);
 bool tcp_splice_flow_defer(struct tcp_splice_conn *conn);
 void tcp_splice_timer(const struct ctx *c, struct tcp_splice_conn *conn);
 int tcp_conn_pool_sock(int pool[]);
-- 
2.43.0


Thread overview: 21+ messages
2025-01-31 19:39 [PATCH v3 00/20] Draft, incomplete series introducing state migration Stefano Brivio
2025-01-31 19:39 ` [PATCH v3 01/20] tcp: Always pass NULL event with EPOLL_CTL_DEL Stefano Brivio
2025-01-31 19:39 ` [PATCH v3 02/20] util: Rename and make global vu_remove_watch() Stefano Brivio
2025-01-31 19:39 ` [PATCH v3 03/20] icmp, udp: Pad time_t timestamp to 64-bit to ease state migration Stefano Brivio
2025-01-31 19:39 ` [PATCH v3 04/20] flow, flow_table: Pad flow table entries to 128 bytes, hash entries to 32 bits Stefano Brivio
2025-01-31 19:39 ` [PATCH v3 05/20] flow_table: Use size in extern declaration for flowtab Stefano Brivio
2025-01-31 19:39 ` [PATCH v3 06/20] util: Add read_remainder() and read_all_buf() Stefano Brivio
2025-01-31 19:39 ` [PATCH v3 07/20] Introduce facilities for guest migration on top of vhost-user infrastructure Stefano Brivio
2025-01-31 19:39 ` [PATCH v3 08/20] Introduce passt-repair Stefano Brivio
2025-01-31 19:39 ` [PATCH v3 09/20] Add interfaces and configuration bits for passt-repair Stefano Brivio
2025-01-31 19:39 ` [PATCH v3 10/20] flow, tcp: Basic pre-migration source handler to dump sequence numbers Stefano Brivio
2025-01-31 19:39 ` [PATCH v3 11/20] migrate: vu_migrate_{source,target}() aren't actually vu speciic Stefano Brivio
2025-01-31 19:39 ` [PATCH v3 12/20] migrate: Move repair_sock_init() to vu_init() Stefano Brivio
2025-01-31 19:39 ` [PATCH v3 13/20] migrate: Make more handling common rather than vhost-user specific Stefano Brivio
2025-01-31 19:39 ` [PATCH v3 14/20] migrate: Don't handle the migration channel through epoll Stefano Brivio
2025-01-31 19:39 ` [PATCH v3 15/20] flow, flow_table: Export declaration of hash table Stefano Brivio
2025-01-31 19:39 ` [PATCH v3 16/20] vhost_user: Turn vhost-user message reports to trace() Stefano Brivio
2025-01-31 19:39 ` [PATCH v3 17/20] vhost_user: Make source quit after reporting migration state Stefano Brivio
2025-01-31 19:39 ` [PATCH v3 18/20] tcp: Get our socket port using getsockname() when connecting from guest Stefano Brivio
2025-01-31 19:39 ` [PATCH v3 19/20] tcp: Add HOSTSIDE(x), HOSTFLOW(x) macros Stefano Brivio
2025-01-31 19:39 ` [PATCH v3 20/20] Implement target side of migration Stefano Brivio [this message]
