From: Stefano Brivio <sbrivio@redhat.com>
To: passt-dev@passt.top
Cc: David Gibson <david@gibson.dropbear.id.au>
Subject: [PATCH v4 8/8] Implement target side of migration
Date: Tue,  4 Feb 2025 01:47:45 +0100
Message-ID: <20250204004745.97854-9-sbrivio@redhat.com>
In-Reply-To: <20250204004745.97854-1-sbrivio@redhat.com>

It's draft quality, with a number of hacks, and it will need a partial
rewrite. Add:

- flow_migrate_target_post(), to open target-side sockets and bind
  them, switch them to repair mode, connect them, and make them leave
  repair mode again

- copies of the flow table, the 'flow_first_free' pointer, the related
  hash table, and the hash secret. The copy of the hash secret shows
  that the current declarative approach to data sections has some
  drawbacks

Change tcp_flow_dump_seq() into tcp_flow_repair_seq(), which can dump
as well as restore sequence numbers (restoring them is needed before
connecting the sockets).
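
For reference, the kernel interface this relies on boils down to
selecting a queue with TCP_REPAIR_QUEUE, then reading or writing its
sequence number via TCP_QUEUE_SEQ, on a socket that is already in
repair mode. A distilled, stand-alone sketch (not part of this patch,
helper name is made up, error prints omitted):

#include <errno.h>
#include <stdbool.h>
#include <stdint.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>	/* TCP_REPAIR_QUEUE, TCP_QUEUE_SEQ, ... */

/* repair_queue_seq() - Dump or set the sequence number of one socket queue
 * @s:		Socket already switched to TCP_REPAIR mode
 * @queue:	TCP_SEND_QUEUE or TCP_RECV_QUEUE
 * @seq:	Sequence to set, or location where the dumped one is stored
 * @set:	Set if true, dump if false
 *
 * Return: 0 on success, negative error code on failure
 */
static int repair_queue_seq(int s, int queue, uint32_t *seq, bool set)
{
	socklen_t len = sizeof(*seq);

	if (setsockopt(s, IPPROTO_TCP, TCP_REPAIR_QUEUE, &queue, sizeof(queue)))
		return -errno;

	if (set)
		return setsockopt(s, IPPROTO_TCP, TCP_QUEUE_SEQ, seq, len) ?
		       -errno : 0;

	return getsockopt(s, IPPROTO_TCP, TCP_QUEUE_SEQ, seq, &len) ?
	       -errno : 0;
}

tcp_flow_repair_seq() below does exactly this for both queues, plus
debug prints.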

Once we connect sockets, and before we take them out of repair mode,
we need to restore the MSS and window scaling information that would
otherwise be negotiated via TCP options during the handshake (with
repair mode on, connect() sets up the association without an actual
handshake). I'm using hardcoded values for now, as we don't yet have a
way to transfer these bits of socket-side information.
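
One way to eventually dump these on the source side could be reading
the MSS with TCP_MAXSEG and the window scaling factors from TCP_INFO,
then carrying them in the migration data. A sketch under that
assumption (helper name is made up, not part of this patch):

#include <errno.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>

/* tcp_dump_handshake_opts() - Dump MSS and window scaling from a socket
 * @s:		Connected source-side socket
 * @mss:	Storage for current MSS
 * @snd_ws:	Storage for send window scaling factor
 * @rcv_ws:	Storage for receive window scaling factor
 *
 * Return: 0 on success, negative error code on failure
 */
static int tcp_dump_handshake_opts(int s, int *mss, int *snd_ws, int *rcv_ws)
{
	struct tcp_info info;
	socklen_t len = sizeof(*mss);

	if (getsockopt(s, IPPROTO_TCP, TCP_MAXSEG, mss, &len))
		return -errno;

	len = sizeof(info);
	if (getsockopt(s, IPPROTO_TCP, TCP_INFO, &info, &len))
		return -errno;

	*snd_ws = info.tcpi_snd_wscale;	/* 4-bit fields in struct tcp_info */
	*rcv_ws = info.tcpi_rcv_wscale;

	return 0;
}

On the target, those values would then replace the hardcoded opt_val
fields passed to TCP_REPAIR_OPTIONS in tcp_flow_repair_connect(): for
TCPOPT_WINDOW, the kernel takes the send scaling factor from the low
16 bits of opt_val and the receive one from the high 16 bits, which is
what the hardcoded 8 + (8 << 16) encodes.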

Before we turn repair mode off, add sockets to the epoll list and set
up per-socket timerfd descriptors, with initial timer scheduling.
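
For completeness, a generic sketch of the kernel calls behind that
step, that is, registering a socket with epoll and arming a per-socket
timerfd (not passt code: flags, timeouts and epoll references differ
in the actual implementation):

#include <errno.h>
#include <stdint.h>
#include <sys/epoll.h>
#include <sys/timerfd.h>
#include <time.h>

/* watch_sock() - Add socket to epoll, create and arm a per-socket timer
 * @epollfd:	epoll instance
 * @sock:	Connected socket to watch
 * @ref:	Opaque epoll reference for the socket (ref + 1 for the timer)
 *
 * Return: timer descriptor on success, negative error code on failure
 */
static int watch_sock(int epollfd, int sock, uint64_t ref)
{
	struct epoll_event ev = { .events = EPOLLIN | EPOLLRDHUP,
				  .data.u64 = ref };
	struct epoll_event tev = { .events = EPOLLIN, .data.u64 = ref + 1 };
	struct itimerspec it = { .it_value = { .tv_sec = 2 } };
	int tfd;

	if (epoll_ctl(epollfd, EPOLL_CTL_ADD, sock, &ev))
		return -errno;

	tfd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK | TFD_CLOEXEC);
	if (tfd < 0)
		return -errno;

	if (epoll_ctl(epollfd, EPOLL_CTL_ADD, tfd, &tev) ||
	    timerfd_settime(tfd, 0, &it, NULL))
		return -errno;

	return tfd;
}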

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 flow.c     |  51 ++++++++++++++++++++-
 flow.h     |   2 +
 migrate.c  |   1 +
 passt.c    |   4 ++
 passt.h    |   2 +
 tcp.c      | 132 +++++++++++++++++++++++++++++++++++++++++++++++++----
 tcp_conn.h |   4 +-
 7 files changed, 185 insertions(+), 11 deletions(-)

diff --git a/flow.c b/flow.c
index da738eb..aaf200f 100644
--- a/flow.c
+++ b/flow.c
@@ -913,12 +913,61 @@ int flow_migrate_source_pre(struct ctx *c, struct migrate_meta *m,
 			i += flow->free.n - 1;
 		else if (flow->f.state == FLOW_STATE_ACTIVE &&
 			 flow->f.type == FLOW_TCP)
-			tcp_flow_dump_seq(c, &flow->tcp);
+			tcp_flow_repair_seq(c, &flow->tcp, false);
 	}
 
 	return 0;
 }
 
+/**
+ * flow_migrate_target_post() - Restore all flows after migration
+ * @c:		Execution context
+ * @m:		Migration metadata
+ * @stage:	Migration stage information (unused)
+ * @fd:		Migration fd (unused)
+ *
+ * Return: 0 on success
+ */
+int flow_migrate_target_post(struct ctx *c, struct migrate_meta *m,
+			     const struct migrate_stage *stage, int fd)
+{
+	unsigned i;
+	int rc = 0;
+
+	(void)m;
+	(void)stage;
+	(void)fd;
+
+	for (i = 0; i < FLOW_MAX; i++) {	/* TODO: iterator with skip */
+		union flow *flow = &flowtab[i];
+
+		if (flow->f.state == FLOW_STATE_FREE)
+			i += flow->free.n - 1;
+		else if (flow->f.state == FLOW_STATE_ACTIVE &&
+			 flow->f.type == FLOW_TCP)
+			rc = tcp_flow_repair_socket(c, &flow->tcp);
+
+		if (rc)
+			return rc;		/* TODO: rollback */
+	}
+
+	repair_flush(c);			/* TODO: move to TCP logic */
+
+	for (i = 0; i < FLOW_MAX; i++) {	/* TODO: iterator with skip */
+		union flow *flow = &flowtab[i];
+
+		if (flow->f.state == FLOW_STATE_FREE)
+			i += flow->free.n - 1;
+		else if (flow->f.state == FLOW_STATE_ACTIVE &&
+			 flow->f.type == FLOW_TCP)
+			tcp_flow_repair_connect(c, &flow->tcp);
+	}
+
+	repair_flush(c);			/* TODO: move to TCP logic */
+
+	return 0;
+}
+
 /**
  * flow_init() - Initialise flow related data structures
  */
diff --git a/flow.h b/flow.h
index c526938..fec124f 100644
--- a/flow.h
+++ b/flow.h
@@ -251,6 +251,8 @@ void flow_init(void);
 void flow_defer_handler(const struct ctx *c, const struct timespec *now);
 int flow_migrate_source_pre(struct ctx *c, struct migrate_meta *m,
 			    const struct migrate_stage *stage, int fd);
+int flow_migrate_target_post(struct ctx *c, struct migrate_meta *m,
+			     const struct migrate_stage *stage, int fd);
 
 void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...)
 	__attribute__((format(printf, 3, 4)));
diff --git a/migrate.c b/migrate.c
index b42b12f..2af7f64 100644
--- a/migrate.c
+++ b/migrate.c
@@ -113,6 +113,7 @@ static const struct migrate_stage stages_v1[] = {
 	{
 		.name = "flow post",
 		.source = NULL,
+		.target = flow_migrate_target_post,
 	},
 };
 
diff --git a/passt.c b/passt.c
index 1938290..65e9126 100644
--- a/passt.c
+++ b/passt.c
@@ -119,6 +119,8 @@ static void post_handler(struct ctx *c, const struct timespec *now)
 		ndp_timer(c, now);
 }
 
+uint64_t g_hash_secret[2];
+
 /**
  * random_init() - Initialise things based on random data
  * @c:		Execution context
@@ -130,6 +132,8 @@ static void random_init(struct ctx *c)
 	/* Create secret value for SipHash calculations */
 	raw_random(&c->hash_secret, sizeof(c->hash_secret));
 
+	memcpy(g_hash_secret, c->hash_secret, sizeof(g_hash_secret));
+
 	/* Seed pseudo-RNG for things that need non-cryptographic random */
 	raw_random(&seed, sizeof(seed));
 	srandom(seed);
diff --git a/passt.h b/passt.h
index 4189a4a..6010f92 100644
--- a/passt.h
+++ b/passt.h
@@ -317,6 +317,8 @@ struct ctx {
 	bool migrate_target;
 };
 
+extern uint64_t g_hash_secret[2];
+
 void proto_update_l2_buf(const unsigned char *eth_d,
 			 const unsigned char *eth_s);
 
diff --git a/tcp.c b/tcp.c
index 96d7649..3760e67 100644
--- a/tcp.c
+++ b/tcp.c
@@ -887,13 +887,31 @@ int tcp_flow_repair_on(struct ctx *c, const struct tcp_tap_conn *conn)
 }
 
 /**
- * tcp_flow_dump_seq() - Dump sequences for send and receive queues
+ * tcp_flow_repair_off() - Clear repair mode for a single TCP flow
  * @c:		Execution context
  * @conn:	Pointer to the TCP connection structure
  *
  * Return: 0 on success, negative error code on failure
  */
-int tcp_flow_dump_seq(struct ctx *c, struct tcp_tap_conn *conn)
+static int tcp_flow_repair_off(struct ctx *c, const struct tcp_tap_conn *conn)
+{
+	int rc = 0;
+
+	if ((rc = repair_set(c, conn->sock, TCP_REPAIR_OFF)))
+		err("Failed to clear TCP_REPAIR");
+
+	return rc;
+}
+
+/**
+ * tcp_flow_repair_seq() - Dump or set sequences for socket queues
+ * @c:		Execution context
+ * @conn:	Pointer to the TCP connection structure
+ * @set:	Set if true, dump if false
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+int tcp_flow_repair_seq(struct ctx *c, struct tcp_tap_conn *conn, bool set)
 {
 	int v, s = conn->sock;
 	socklen_t vlen;
@@ -902,28 +920,124 @@ int tcp_flow_dump_seq(struct ctx *c, struct tcp_tap_conn *conn)
 
 	vlen = sizeof(v);
 
-	v = TCP_SEND_QUEUE;
 	/* TODO: proper error management and prints */
-	if (setsockopt(s, SOL_TCP, TCP_REPAIR_QUEUE, &v, vlen))
-		return -errno;
 
-	if (getsockopt(s, SOL_TCP, TCP_QUEUE_SEQ, &conn->sock_seq_snd, &vlen))
+	v = TCP_SEND_QUEUE;
+	if (setsockopt(s, SOL_TCP, TCP_REPAIR_QUEUE, &v, vlen))
 		return -errno;
 
-	debug("Send queue sequence %u for socket %i", conn->sock_seq_snd, s);
+	if (set) {
+		if (setsockopt(s, SOL_TCP, TCP_QUEUE_SEQ, &conn->sock_seq_snd,
+			       vlen))
+			return -errno;
+		debug("Set send queue sequence for socket %i to %u",
+		      s, conn->sock_seq_snd);
+	} else {
+		if (getsockopt(s, SOL_TCP, TCP_QUEUE_SEQ, &conn->sock_seq_snd,
+			       &vlen))
+			return -errno;
+		debug("Dumped send queue sequence for socket %i: %u",
+		      s, conn->sock_seq_snd);
+	}
 
 	v = TCP_RECV_QUEUE;
 	if (setsockopt(s, SOL_TCP, TCP_REPAIR_QUEUE, &v, vlen))
 		return -errno;
 
-	if (getsockopt(s, SOL_TCP, TCP_QUEUE_SEQ, &conn->sock_seq_rcv, &vlen))
+	if (set) {
+		if (setsockopt(s, SOL_TCP, TCP_QUEUE_SEQ, &conn->sock_seq_rcv,
+			       vlen))
+			return -errno;
+		debug("Set receive queue sequence for socket %i to %u",
+		      s, conn->sock_seq_rcv);
+	} else {
+		if (getsockopt(s, SOL_TCP, TCP_QUEUE_SEQ, &conn->sock_seq_rcv,
+			       &vlen))
+			return -errno;
+		debug("Dumped receive queue sequence for socket %i: %u",
+		      s, conn->sock_seq_rcv);
+	}
+
+	return 0;
+}
+
+/**
+ * tcp_flow_repair_socket() - Open and bind socket, request repair mode
+ * @c:		Execution context
+ * @conn:	Pointer to the TCP connection structure
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+int tcp_flow_repair_socket(struct ctx *c, struct tcp_tap_conn *conn)
+{
+	sa_family_t af = CONN_V4(conn) ? AF_INET : AF_INET6;
+	const struct flowside *sockside = HOSTFLOW(conn);
+	struct sockaddr_in a;
+	int rc;
+
+	a = (struct sockaddr_in){ af, htons(sockside->oport), { 0 }, { 0 } };
+
+	if ((conn->sock = socket(af, SOCK_STREAM, IPPROTO_TCP)) < 0)
+		return -errno;
+
+	/* On the same host, source socket can be in TIME_WAIT */
+	setsockopt(conn->sock, SOL_SOCKET, SO_REUSEADDR,
+		   &((int){ 1 }), sizeof(int));
+
+	if (bind(conn->sock, (struct sockaddr *)&a, sizeof(a)) < 0) {
+		close(conn->sock);
+		conn->sock = -1;
 		return -errno;
+	}
 
-	debug("Receive queue sequence %u for socket %i", conn->sock_seq_rcv, s);
+	rc = tcp_flow_repair_on(c, conn);
+	if (rc) {
+		close(conn->sock);
+		conn->sock = -1;
+		return rc;
+	}
 
 	return 0;
 }
 
+/**
+ * tcp_flow_repair_connect() - Connect sockets in repair mode, then turn it off
+ * @c:		Execution context
+ * @conn:	Pointer to the TCP connection structure
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+int tcp_flow_repair_connect(struct ctx *c, struct tcp_tap_conn *conn)
+{
+	struct flowside *tgt = &conn->f.side[TGTSIDE];
+	struct tcp_repair_opt opts[2];
+
+	tcp_flow_repair_seq(c, conn, true);
+
+	flowside_connect(c, conn->sock, PIF_HOST, tgt);
+
+	/* FIXME: Fetch those with TCP_REPAIR_OPTIONS and store in migration
+	 * data. These hardcoded values just happen to be good enough.
+	 *
+	 * On top of these, to seamlessly restore the window, we also need to
+	 * dump and restore struct tcp_repair_window via TCP_REPAIR_WINDOW.
+	 */
+	opts[0].opt_code = TCPOPT_WINDOW;
+	opts[0].opt_val = 8 + (8 << 16);
+
+	opts[1].opt_code = TCPOPT_MAXSEG;
+	opts[1].opt_val = 65495;
+
+	setsockopt(conn->sock, SOL_TCP, TCP_REPAIR_OPTIONS,
+		   opts, 2 * sizeof(struct tcp_repair_opt));
+
+	conn->in_epoll = 0;
+	conn->timer = -1;
+	tcp_epoll_ctl(c, conn);
+
+	return tcp_flow_repair_off(c, conn);
+}
+
 /**
  * tcp_fill_header() - Fill the TCP header fields for a given TCP segment.
  *
diff --git a/tcp_conn.h b/tcp_conn.h
index 0c3e197..3bf8837 100644
--- a/tcp_conn.h
+++ b/tcp_conn.h
@@ -144,7 +144,9 @@ extern int init_sock_pool6	[TCP_SOCK_POOL_SIZE];
 
 bool tcp_flow_defer(const struct tcp_tap_conn *conn);
 int tcp_flow_repair_on(struct ctx *c, const struct tcp_tap_conn *conn);
-int tcp_flow_dump_seq(struct ctx *c, struct tcp_tap_conn *conn);
+int tcp_flow_repair_seq(struct ctx *c, struct tcp_tap_conn *conn, bool set);
+int tcp_flow_repair_socket(struct ctx *c, struct tcp_tap_conn *conn);
+int tcp_flow_repair_connect(struct ctx *c, struct tcp_tap_conn *conn);
 bool tcp_splice_flow_defer(struct tcp_splice_conn *conn);
 void tcp_splice_timer(const struct ctx *c, struct tcp_splice_conn *conn);
 int tcp_conn_pool_sock(int pool[]);
-- 
2.43.0


Thread overview: 14+ messages
2025-02-04  0:47 [PATCH v4 0/8] Draft, incomplete series introducing state migration Stefano Brivio
2025-02-04  0:47 ` [PATCH v4 1/8] flow_table: Use size in extern declaration for flowtab, export hash table Stefano Brivio
2025-02-04  0:47 ` [PATCH v4 2/8] Introduce facilities for guest migration on top of vhost-user infrastructure Stefano Brivio
2025-02-04  0:47 ` [PATCH v4 3/8] migrate: Make more handling common rather than vhost-user specific Stefano Brivio
2025-02-04  0:47 ` [PATCH v4 4/8] migrate: Don't handle the migration channel through epoll Stefano Brivio
2025-02-04  0:47 ` [PATCH v4 5/8] Add interfaces and configuration bits for passt-repair Stefano Brivio
2025-02-04  0:47 ` [PATCH v4 6/8] flow, tcp: Basic pre-migration source handler to dump sequence numbers Stefano Brivio
2025-02-04  3:43   ` David Gibson
2025-02-04  6:44     ` Stefano Brivio
2025-02-05  0:58       ` David Gibson
2025-02-04  0:47 ` [PATCH v4 7/8] vhost_user: Make source quit after reporting migration state Stefano Brivio
2025-02-04  0:47 ` Stefano Brivio [this message]
2025-02-04  6:01 ` [PATCH v4 0/8] Draft, incomplete series introducing state migration David Gibson
2025-02-04  6:48   ` Stefano Brivio
