From: Stefano Brivio <sbrivio@redhat.com>
To: passt-dev@passt.top
Cc: David Gibson <david@gibson.dropbear.id.au>
Subject: [PATCH v13 5/6] migrate: Migrate TCP flows
Date: Sun, 9 Feb 2025 23:20:04 +0100 [thread overview]
Message-ID: <20250209222005.1640077-6-sbrivio@redhat.com> (raw)
In-Reply-To: <20250209222005.1640077-1-sbrivio@redhat.com>
This implements flow preparation on the source, transfer of data with
a format roughly inspired by struct tcp_tap_conn, and flow insertion
on the target, with all the appropriate window options, window
scaling, MSS, etc.
The target side is rather convoluted because we first need to create
sockets and switch them to repair mode, before we can apply options
that are *not* stored in the flow table. However, we don't want to
request repair mode for sockets one by one. So we need to do this in
several steps.
[dwg: Assorted cleanups]
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
flow.c | 248 +++++++++++++++++
flow.h | 8 +
migrate.c | 19 ++
passt.c | 6 +-
repair.c | 1 -
tcp.c | 789 +++++++++++++++++++++++++++++++++++++++++++++++++++++
tcp_conn.h | 95 +++++++
7 files changed, 1162 insertions(+), 4 deletions(-)
diff --git a/flow.c b/flow.c
index a6fe6d1..51f8c62 100644
--- a/flow.c
+++ b/flow.c
@@ -19,6 +19,7 @@
#include "inany.h"
#include "flow.h"
#include "flow_table.h"
+#include "repair.h"
const char *flow_state_str[] = {
[FLOW_STATE_FREE] = "FREE",
@@ -52,6 +53,26 @@ const uint8_t flow_proto[] = {
static_assert(ARRAY_SIZE(flow_proto) == FLOW_NUM_TYPES,
"flow_proto[] doesn't match enum flow_type");
+#define foreach_flow(i, flow, bound) \
+ for ((i) = 0, (flow) = &flowtab[(i)]; \
+ (i) < (bound); \
+ (i)++, (flow) = &flowtab[(i)]) \
+ if ((flow)->f.state == FLOW_STATE_FREE) \
+ (i) += (flow)->free.n - 1; \
+ else
+
+#define foreach_active_flow(i, flow, bound) \
+ foreach_flow((i), (flow), (bound)) \
+ if ((flow)->f.state != FLOW_STATE_ACTIVE) \
+ continue; \
+ else
+
+#define foreach_tcp_flow(i, flow, bound) \
+ foreach_active_flow((i), (flow), (bound)) \
+ if ((flow)->f.type != FLOW_TCP) \
+ continue; \
+ else
+
/* Global Flow Table */
/**
@@ -874,6 +895,233 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now)
*last_next = FLOW_MAX;
}
+/**
+ * flow_migrate_source_rollback() - Disable repair mode, return failure
+ * @c: Execution context
+ * @max_flow: Maximum index of affected flows
+ * @ret: Negative error code
+ *
+ * Return: @ret
+ */
+static int flow_migrate_source_rollback(struct ctx *c, unsigned max_flow,
+ int ret)
+{
+ union flow *flow;
+ unsigned i;
+
+ debug("...roll back migration");
+
+ foreach_tcp_flow(i, flow, max_flow)
+ tcp_flow_repair_off(c, &flow->tcp);
+
+ repair_flush(c);
+
+ return ret;
+}
+
+/**
+ * flow_migrate_repair_all() - Turn repair mode on or off for all flows
+ * @c: Execution context
+ * @enable: Switch repair mode on if set, off otherwise
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int flow_migrate_repair_all(struct ctx *c, bool enable)
+{
+ union flow *flow;
+ unsigned i;
+ int rc;
+
+ foreach_tcp_flow(i, flow, FLOW_MAX) {
+ if (enable)
+ rc = tcp_flow_repair_on(c, &flow->tcp);
+ else
+ rc = tcp_flow_repair_off(c, &flow->tcp);
+
+ if (rc) {
+ debug("Can't %s repair mode: %s",
+ enable ? "enable" : "disable", strerror_(-rc));
+ return flow_migrate_source_rollback(c, i, rc);
+ }
+ }
+
+ if ((rc = repair_flush(c))) {
+ debug("Can't %s repair mode: %s",
+ enable ? "enable" : "disable", strerror_(-rc));
+ return flow_migrate_source_rollback(c, i, rc);
+ }
+
+ return 0;
+}
+
+/**
+ * flow_migrate_source_early() - Early tasks: shrink (RFC 7323 2.2) TCP windows
+ * @c: Execution context
+ * @stage: Migration stage information, unused
+ * @fd: Migration file descriptor, unused
+ *
+ * Return: 0 on success, positive error code on failure
+ */
+int flow_migrate_source_early(struct ctx *c, const struct migrate_stage *stage,
+ int fd)
+{
+ union flow *flow;
+ unsigned i;
+ int rc;
+
+ (void)stage;
+ (void)fd;
+
+ /* We need repair mode to dump and set (some) window parameters */
+ if ((rc = flow_migrate_repair_all(c, true)))
+ return -rc;
+
+ foreach_tcp_flow(i, flow, FLOW_MAX) {
+ if ((rc = tcp_flow_migrate_shrink_window(i, &flow->tcp))) {
+ err("Shrinking window, flow %u: %s", i, strerror_(-rc));
+ return flow_migrate_source_rollback(c, i, -rc);
+ }
+ }
+
+ /* Now send window updates. We'll flip repair mode back on in a bit */
+ if ((rc = flow_migrate_repair_all(c, false)))
+ return -rc;
+
+ return 0;
+}
+
+/**
+ * flow_migrate_source_pre() - Prepare flows for migration: enable repair mode
+ * @c: Execution context
+ * @stage: Migration stage information (unused)
+ * @fd: Migration file descriptor (unused)
+ *
+ * Return: 0 on success, positive error code on failure
+ */
+int flow_migrate_source_pre(struct ctx *c, const struct migrate_stage *stage,
+ int fd)
+{
+ int rc;
+
+ (void)stage;
+ (void)fd;
+
+ if ((rc = flow_migrate_repair_all(c, true)))
+ return -rc;
+
+ return 0;
+}
+
+/**
+ * flow_migrate_source() - Dump all the remaining information and send data
+ * @c: Execution context (unused)
+ * @stage: Migration stage information (unused)
+ * @fd: Migration file descriptor
+ *
+ * Return: 0 on success, positive error code on failure
+ */
+int flow_migrate_source(struct ctx *c, const struct migrate_stage *stage,
+ int fd)
+{
+ uint32_t count = 0;
+ union flow *flow;
+ unsigned i;
+ int rc;
+
+ (void)c;
+ (void)stage;
+
+ foreach_tcp_flow(i, flow, FLOW_MAX)
+ count++;
+
+ count = htonl(count);
+ if ((rc = write_all_buf(fd, &count, sizeof(count)))) {
+ rc = errno;
+ err_perror("Can't send flow count (%u)", ntohl(count));
+ return flow_migrate_source_rollback(c, FLOW_MAX, rc);
+ }
+
+ debug("Sending %u flows", ntohl(count));
+
+ /* Dump and send information that can be stored in the flow table */
+ foreach_tcp_flow(i, flow, FLOW_MAX) {
+ if ((rc = tcp_flow_migrate_source(fd, &flow->tcp))) {
+ err("Can't send data, flow %u: %s", i, strerror_(-rc));
+ return flow_migrate_source_rollback(c, FLOW_MAX, -rc);
+ }
+ }
+
+ /* And then "extended" data (including window data we saved previously):
+ * the target needs to set repair mode on sockets before it can set
+ * this stuff, but it needs sockets (and flows) for that.
+ *
+ * This also closes sockets so that the target can start connecting
+ * theirs: you can't sendmsg() to queues (using the socket) if the
+ * socket is not connected (EPIPE), not even in repair mode. And the
+ * target needs to restore queues now because we're sending the data.
+ *
+ * So, no rollback here, just try as hard as we can.
+ */
+ foreach_tcp_flow(i, flow, FLOW_MAX) {
+ if ((rc = tcp_flow_migrate_source_ext(fd, i, &flow->tcp)))
+ err("Extended data for flow %u: %s", i, strerror_(-rc));
+ }
+
+ return 0;
+}
+
+/**
+ * flow_migrate_target() - Receive flows and insert in flow table
+ * @c: Execution context
+ * @stage: Migration stage information (unused)
+ * @fd: Migration file descriptor
+ *
+ * Return: 0 on success, positive error code on failure
+ */
+int flow_migrate_target(struct ctx *c, const struct migrate_stage *stage,
+ int fd)
+{
+ uint32_t count;
+ unsigned i;
+ int rc;
+
+ (void)stage;
+
+ if (read_all_buf(fd, &count, sizeof(count)))
+ return errno;
+
+ count = ntohl(count);
+ debug("Receiving %u flows", count);
+
+ if ((rc = flow_migrate_repair_all(c, true)))
+ return -rc;
+
+ repair_flush(c);
+
+ /* TODO: flow header with type, instead? */
+ for (i = 0; i < count; i++) {
+ rc = tcp_flow_migrate_target(c, fd);
+ if (rc) {
+ debug("Bad target data for flow %u: %s, abort",
+ i, strerror_(-rc));
+ return -rc;
+ }
+ }
+
+ repair_flush(c);
+
+ for (i = 0; i < count; i++) {
+ rc = tcp_flow_migrate_target_ext(c, flowtab + i, fd);
+ if (rc) {
+ debug("Bad target extended data for flow %u: %s, abort",
+ i, strerror_(-rc));
+ return -rc;
+ }
+ }
+
+ return 0;
+}
+
/**
* flow_init() - Initialise flow related data structures
*/
diff --git a/flow.h b/flow.h
index 24ba3ef..675726e 100644
--- a/flow.h
+++ b/flow.h
@@ -249,6 +249,14 @@ union flow;
void flow_init(void);
void flow_defer_handler(const struct ctx *c, const struct timespec *now);
+int flow_migrate_source_early(struct ctx *c, const struct migrate_stage *stage,
+ int fd);
+int flow_migrate_source_pre(struct ctx *c, const struct migrate_stage *stage,
+ int fd);
+int flow_migrate_source(struct ctx *c, const struct migrate_stage *stage,
+ int fd);
+int flow_migrate_target(struct ctx *c, const struct migrate_stage *stage,
+ int fd);
void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...)
__attribute__((format(printf, 3, 4)));
diff --git a/migrate.c b/migrate.c
index 1c59016..c5c6663 100644
--- a/migrate.c
+++ b/migrate.c
@@ -98,11 +98,30 @@ static int seen_addrs_target_v1(struct ctx *c,
/* Stages for version 1 */
static const struct migrate_stage stages_v1[] = {
+ /* FIXME: With this step, close() in tcp_flow_migrate_source_ext()
+ * *sometimes* closes the connection for real.
+ */
+/* {
+ .name = "shrink TCP windows",
+ .source = flow_migrate_source_early,
+ .target = NULL,
+ },
+*/
{
.name = "observed addresses",
.source = seen_addrs_source_v1,
.target = seen_addrs_target_v1,
},
+ {
+ .name = "prepare flows",
+ .source = flow_migrate_source_pre,
+ .target = NULL,
+ },
+ {
+ .name = "transfer flows",
+ .source = flow_migrate_source,
+ .target = flow_migrate_target,
+ },
{ 0 },
};
diff --git a/passt.c b/passt.c
index 6f9fb4d..68d1a28 100644
--- a/passt.c
+++ b/passt.c
@@ -223,9 +223,6 @@ int main(int argc, char **argv)
if (sigaction(SIGCHLD, &sa, NULL))
die_perror("Couldn't install signal handlers");
- if (signal(SIGPIPE, SIG_IGN) == SIG_ERR)
- die_perror("Couldn't set disposition for SIGPIPE");
-
c.mode = MODE_PASTA;
} else if (strstr(name, "passt")) {
c.mode = MODE_PASST;
@@ -233,6 +230,9 @@ int main(int argc, char **argv)
_exit(EXIT_FAILURE);
}
+ if (signal(SIGPIPE, SIG_IGN) == SIG_ERR)
+ die_perror("Couldn't set disposition for SIGPIPE");
+
madvise(pkt_buf, TAP_BUF_BYTES, MADV_HUGEPAGE);
c.epollfd = epoll_create1(EPOLL_CLOEXEC);
diff --git a/repair.c b/repair.c
index 784b994..da85edb 100644
--- a/repair.c
+++ b/repair.c
@@ -190,7 +190,6 @@ int repair_flush(struct ctx *c)
*
* Return: 0 on success, negative error code on failure
*/
-/* cppcheck-suppress unusedFunction */
int repair_set(struct ctx *c, int s, int cmd)
{
int rc;
diff --git a/tcp.c b/tcp.c
index af6bd95..78db64f 100644
--- a/tcp.c
+++ b/tcp.c
@@ -280,6 +280,7 @@
#include <stddef.h>
#include <string.h>
#include <sys/epoll.h>
+#include <sys/ioctl.h>
#include <sys/socket.h>
#include <sys/timerfd.h>
#include <sys/types.h>
@@ -287,6 +288,8 @@
#include <time.h>
#include <arpa/inet.h>
+#include <linux/sockios.h>
+
#include "checksum.h"
#include "util.h"
#include "iov.h"
@@ -299,6 +302,7 @@
#include "log.h"
#include "inany.h"
#include "flow.h"
+#include "repair.h"
#include "linux_dep.h"
#include "flow_table.h"
@@ -326,6 +330,19 @@
((conn)->events & (SOCK_FIN_RCVD | TAP_FIN_RCVD)))
#define CONN_HAS(conn, set) (((conn)->events & (set)) == (set))
+/* Buffers to migrate pending data from send and receive queues. No, they don't
+ * use memory if we don't use them. And we're going away after this, so splurge.
+ */
+#define TCP_MIGRATE_SND_QUEUE_MAX (64 << 20)
+#define TCP_MIGRATE_RCV_QUEUE_MAX (64 << 20)
+uint8_t tcp_migrate_snd_queue [TCP_MIGRATE_SND_QUEUE_MAX];
+uint8_t tcp_migrate_rcv_queue [TCP_MIGRATE_RCV_QUEUE_MAX];
+
+#define TCP_MIGRATE_RESTORE_CHUNK_MIN 1024 /* Try smaller when above this */
+
+/* "Extended" data (not stored in the flow table) for TCP flow migration */
+static struct tcp_tap_transfer_ext migrate_ext[FLOW_MAX];
+
static const char *tcp_event_str[] __attribute((__unused__)) = {
"SOCK_ACCEPTED", "TAP_SYN_RCVD", "ESTABLISHED", "TAP_SYN_ACK_SENT",
@@ -2645,3 +2662,775 @@ void tcp_timer(struct ctx *c, const struct timespec *now)
if (c->mode == MODE_PASTA)
tcp_splice_refill(c);
}
+
+/**
+ * tcp_flow_repair_on() - Enable repair mode for a single TCP flow
+ * @c: Execution context
+ * @conn: Pointer to the TCP connection structure
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+int tcp_flow_repair_on(struct ctx *c, const struct tcp_tap_conn *conn)
+{
+ int rc = 0;
+
+ if ((rc = repair_set(c, conn->sock, TCP_REPAIR_ON)))
+ err("Failed to set TCP_REPAIR");
+
+ return rc;
+}
+
+/**
+ * tcp_flow_repair_off() - Clear repair mode for a single TCP flow
+ * @c: Execution context
+ * @conn: Pointer to the TCP connection structure
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+int tcp_flow_repair_off(struct ctx *c, const struct tcp_tap_conn *conn)
+{
+ int rc = 0;
+
+ if ((rc = repair_set(c, conn->sock, TCP_REPAIR_OFF)))
+ err("Failed to clear TCP_REPAIR");
+
+ return rc;
+}
+
+/**
+ * tcp_flow_repair_queues() - Read or write socket queues, or send unsent data
+ * @s: Socket
+ * @sndbuf: Send queue buffer read or written/sent depending on @set
+ * @sndlen: Length of send queue buffer to set, network order
+ * @notsentlen: Length of not sent data, non-zero to actually _send_
+ * @rcvbuf: Receive queue buffer, read or written depending on @set
+ * @rcvlen: Length of receive queue buffer to set, network order
+ * @set: Set or send (unsent data only) if true, dump if false
+ *
+ * Return: 0 on success, negative error code on failure
+ *
+ * #syscalls:vu ioctl
+ */
+static int tcp_flow_repair_queues(int s,
+ uint8_t *sndbuf,
+ uint32_t *sndlen, uint32_t *notsentlen,
+ uint8_t *rcvbuf, uint32_t *rcvlen,
+ bool set)
+{
+ ssize_t rc;
+ int v;
+
+ if (set && rcvbuf) { /* FIXME: can't check notsentlen, rework this */
+ v = TCP_SEND_QUEUE;
+ if (setsockopt(s, SOL_TCP, TCP_REPAIR_QUEUE, &v, sizeof(v))) {
+ rc = -errno;
+ err_perror("Selecting TCP_SEND_QUEUE on socket %i", s);
+ return rc;
+ }
+ }
+
+ if (set) {
+ size_t chunk;
+ uint8_t *p;
+
+ *sndlen = ntohl(*sndlen);
+ debug("Writing socket %i send queue: %u bytes", s, *sndlen);
+ p = sndbuf;
+ chunk = *sndlen;
+ while (*sndlen > 0) {
+ rc = send(s, p, MIN(*sndlen, chunk), 0);
+
+ if (rc < 0) {
+ if ((errno == ENOBUFS || errno == ENOMEM) &&
+ chunk >= TCP_MIGRATE_RESTORE_CHUNK_MIN) {
+ chunk /= 2;
+ continue;
+ }
+
+ rc = -errno;
+ err_perror("Can't write socket %i send queue",
+ s);
+ return rc;
+ }
+
+ *sndlen -= rc;
+ p += rc;
+ }
+
+ *notsentlen = ntohl(*notsentlen);
+ debug("Sending socket %i unsent queue: %u bytes", s,
+ *notsentlen);
+ p = sndbuf;
+ chunk = *notsentlen;
+ while (*notsentlen > 0) {
+ rc = send(s, p, MIN(*notsentlen, chunk), 0);
+
+ if (rc < 0) {
+ if ((errno == ENOBUFS || errno == ENOMEM) &&
+ chunk >= TCP_MIGRATE_RESTORE_CHUNK_MIN) {
+ chunk /= 2;
+ continue;
+ }
+
+ rc = -errno;
+ err_perror("Can't send socket %i unsent queue",
+ s);
+ return rc;
+ }
+
+ *notsentlen -= rc;
+ p += rc;
+ }
+ } else {
+ rc = ioctl(s, SIOCOUTQ, sndlen);
+ if (rc < 0) {
+ rc = -errno;
+ err_perror("Getting send queue size for socket %i", s);
+ return rc;
+ }
+
+ rc = ioctl(s, SIOCOUTQNSD, notsentlen);
+ if (rc < 0) {
+ rc = -errno;
+ err_perror("Getting not sent count for socket %i", s);
+ return rc;
+ }
+
+ /* TODO: Skip "FIN" byte in queue if present */
+
+ rc = recv(s, sndbuf, MIN(*sndlen, TCP_MIGRATE_SND_QUEUE_MAX),
+ MSG_PEEK);
+ if (rc < 0 && errno != EAGAIN) { /* EAGAIN means empty */
+ rc = -errno;
+ err_perror("Can't read send queue for socket %i", s);
+ return rc;
+ }
+
+ rc = MAX(0, rc);
+ debug("Read socket %i send queue: %zi (%u not sent)", s, rc,
+ *notsentlen);
+
+ *sndlen = htonl(rc);
+ *notsentlen = htonl(*notsentlen);
+ }
+
+ if (!rcvbuf)
+ return 0;
+
+ v = TCP_RECV_QUEUE;
+ if (setsockopt(s, SOL_TCP, TCP_REPAIR_QUEUE, &v, sizeof(v))) {
+ rc = -errno;
+ err_perror("Selecting TCP_RECV_QUEUE for socket %i", s);
+ return rc;
+ }
+
+ if (set) {
+ uint8_t *p;
+ size_t chunk;
+
+ *rcvlen = ntohl(*rcvlen);
+ debug("Writing socket %i receive queue: %u bytes", s, *rcvlen);
+ p = rcvbuf;
+ chunk = *rcvlen;
+ while (*rcvlen > 0) {
+ rc = send(s, p, MIN(*rcvlen, chunk), 0);
+ if (rc < 0) {
+ if ((errno == ENOBUFS || errno == ENOMEM) &&
+ chunk >= TCP_MIGRATE_RESTORE_CHUNK_MIN) {
+ chunk /= 2;
+ continue;
+ }
+
+ rc = -errno;
+ err_perror("Can't send socket %i receive queue",
+ s);
+ return rc;
+ }
+
+ *rcvlen -= rc;
+ p += rc;
+ }
+ } else {
+ if (ioctl(s, SIOCINQ, rcvlen) < 0) {
+ rc = -errno;
+ err_perror("Get receive queue size for socket %i", s);
+ return rc;
+ }
+
+ /* TODO: Skip "FIN" byte in queue if present */
+
+ rc = recv(s, rcvbuf, MIN(*rcvlen, TCP_MIGRATE_RCV_QUEUE_MAX),
+ MSG_PEEK);
+ if (rc < 0 && errno != EAGAIN) { /* EAGAIN means empty */
+ rc = -errno;
+ err_perror("Can't read receive queue for socket %i", s);
+ return rc;
+ }
+
+ rc = MAX(0, rc);
+ *rcvlen = htonl(rc);
+ debug("Read socket %i receive queue: %zi bytes", s, rc);
+ }
+
+ return 0;
+}
+
+/**
+ * tcp_flow_repair_seq() - Dump or set sequences
+ * @s: Socket
+ * @snd_seq: Send sequence, set on return if @set == false, network order
+ * @rcv_seq: Receive sequence, set on return if @set == false, network order
+ * @set: Set if true, dump if false
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int tcp_flow_repair_seq(int s, uint32_t *snd_seq, uint32_t *rcv_seq,
+ bool set)
+{
+ socklen_t vlen = sizeof(uint32_t);
+ ssize_t rc;
+ int v;
+
+ v = TCP_SEND_QUEUE;
+ if (setsockopt(s, SOL_TCP, TCP_REPAIR_QUEUE, &v, sizeof(v))) {
+ rc = -errno;
+ err_perror("Selecting TCP_SEND_QUEUE on socket %i", s);
+ return rc;
+ }
+
+ if (set) {
+ *snd_seq = ntohl(*snd_seq);
+ if (setsockopt(s, SOL_TCP, TCP_QUEUE_SEQ, snd_seq, vlen)) {
+ rc = -errno;
+ err_perror("Setting send sequence for socket %i", s);
+ return rc;
+ }
+ debug("Set send sequence for socket %i to %u", s, *snd_seq);
+ } else {
+ if (getsockopt(s, SOL_TCP, TCP_QUEUE_SEQ, snd_seq, &vlen)) {
+ rc = -errno;
+ err_perror("Dumping send sequence for socket %i", s);
+ return rc;
+ }
+ debug("Dumped send sequence for socket %i: %u", s, *snd_seq);
+ *snd_seq = htonl(*snd_seq);
+ }
+
+ v = TCP_RECV_QUEUE;
+ if (setsockopt(s, SOL_TCP, TCP_REPAIR_QUEUE, &v, sizeof(v))) {
+ rc = -errno;
+ err_perror("Selecting TCP_RECV_QUEUE for socket %i", s);
+ return rc;
+ }
+
+ if (set) {
+ *rcv_seq = ntohl(*rcv_seq);
+ if (setsockopt(s, SOL_TCP, TCP_QUEUE_SEQ, rcv_seq, vlen)) {
+ rc = -errno;
+ err_perror("Setting receive sequence %u for socket %i",
+ *rcv_seq, s);
+ return rc;
+ }
+ debug("Set receive sequence for socket %i to %u", s, *rcv_seq);
+ } else {
+ if (getsockopt(s, SOL_TCP, TCP_QUEUE_SEQ, rcv_seq, &vlen)) {
+ rc = -errno;
+ err_perror("Dumping receive sequence for socket %i", s);
+ return rc;
+ }
+ debug("Dumped receive sequence for socket %i: %u", s, *rcv_seq);
+ *rcv_seq = htonl(*rcv_seq);
+ }
+
+ return 0;
+}
+
+/**
+ * tcp_flow_repair_opt() - Dump or set repair "options" (MSS and window scale)
+ * @s: Socket
+ * @snd_wscale: Window scaling factor, send, network order
+ * @rcv_wscale: Window scaling factor, receive, network order
+ * @mss: Maximum Segment Size, socket side, network order
+ * @set: Set if true, dump if false
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+int tcp_flow_repair_opt(int s, uint8_t *snd_wscale, uint8_t *rcv_wscale,
+ uint32_t *mss, bool set)
+{
+ struct tcp_info_linux tinfo;
+ struct tcp_repair_opt opts[2];
+ socklen_t sl;
+ int rc;
+
+ opts[0].opt_code = TCPOPT_WINDOW;
+ opts[1].opt_code = TCPOPT_MAXSEG;
+
+ if (set) {
+ *mss = ntohl(*mss);
+ debug("Setting repair options for socket %i:", s);
+ opts[0].opt_val = *snd_wscale + (*rcv_wscale << 16);
+ opts[1].opt_val = *mss;
+ debug(" window scale send %u, receive %u, MSS: %u",
+ *snd_wscale, *rcv_wscale, *mss);
+
+ sl = sizeof(opts);
+ if (setsockopt(s, SOL_TCP, TCP_REPAIR_OPTIONS, opts, sl)) {
+ rc = -errno;
+ err_perror("Setting repair options for socket %i", s);
+ return rc;
+ }
+
+ sl = sizeof(*mss);
+ if (setsockopt(s, SOL_TCP, TCP_MAXSEG, mss, sl))
+ debug_perror("Setting MSS for socket %i", s);
+ } else {
+ sl = sizeof(tinfo);
+ if (getsockopt(s, SOL_TCP, TCP_INFO, &tinfo, &sl)) {
+ rc = -errno;
+ err_perror("Querying TCP_INFO for socket %i", s);
+ return rc;
+ }
+
+ *snd_wscale = tinfo.tcpi_snd_wscale;
+ *rcv_wscale = tinfo.tcpi_rcv_wscale;
+
+ /* TCP_INFO MSS is just the current value: ask explicitly */
+ sl = sizeof(*mss);
+ if (getsockopt(s, SOL_TCP, TCP_MAXSEG, mss, &sl)) {
+ rc = -errno;
+ err_perror("Getting MSS for socket %i", s);
+ return rc;
+ }
+ *mss = htonl(*mss);
+
+ debug("Got repair options for socket %i:", s);
+ debug(" window scale send %u, receive %u, MSS: %u",
+ *snd_wscale, *rcv_wscale, ntohl(*mss));
+ }
+
+ return 0;
+}
+
+/**
+ * tcp_flow_repair_wnd() - Dump or set window parameters
+ * @snd_wl1: Next sequence used in window probe (next sequence - 1)
+ * @snd_wnd: Socket-side sending window, network order
+ * @max_window: Window clamp, network order
+ * @rcv_wnd: Socket-side receive window, network order
+ * @rcv_wup: rcv_nxt on last window update sent, network order
+ * @wnd_out: If NULL, set parameters, if given, get and return all
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int tcp_flow_repair_wnd(int s, uint32_t *snd_wl1, uint32_t *snd_wnd,
+ uint32_t *max_window, uint32_t *rcv_wnd,
+ uint32_t *rcv_wup,
+ struct tcp_repair_window *wnd_out)
+{
+ struct tcp_repair_window wnd_copy, *wnd = wnd_out ? wnd_out : &wnd_copy;
+ socklen_t sl = sizeof(*wnd);
+ int rc;
+
+ if (!wnd_out) {
+ wnd->snd_wl1 = ntohl(*snd_wl1);
+ wnd->snd_wnd = ntohl(*snd_wnd);
+ wnd->max_window = ntohl(*max_window);
+ wnd->rcv_wnd = ntohl(*rcv_wnd);
+ wnd->rcv_wup = ntohl(*rcv_wup);
+
+ if (setsockopt(s, IPPROTO_TCP, TCP_REPAIR_WINDOW, wnd, sl)) {
+ rc = -errno;
+ err_perror("Setting window repair data, socket %i", s);
+ return rc;
+ }
+ } else {
+ if (getsockopt(s, IPPROTO_TCP, TCP_REPAIR_WINDOW, wnd, &sl)) {
+ rc = -errno;
+ err_perror("Getting window repair data, socket %i", s);
+ return rc;
+ }
+
+ *snd_wl1 = htonl(wnd->snd_wl1);
+ *snd_wnd = htonl(wnd->snd_wnd);
+ *max_window = htonl(wnd->max_window);
+ *rcv_wnd = htonl(wnd->rcv_wnd);
+ *rcv_wup = htonl(wnd->rcv_wup);
+ }
+
+ return 0;
+}
+
+/**
+ * tcp_flow_migrate_shrink_window() - Dump window data, decrease socket window
+ * @fidx: Flow index
+ * @conn: Pointer to the TCP connection structure
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+int tcp_flow_migrate_shrink_window(int fidx, const struct tcp_tap_conn *conn)
+{
+ struct tcp_tap_transfer_ext *t = &migrate_ext[fidx];
+ struct tcp_repair_window wnd;
+ socklen_t sl = sizeof(wnd);
+ int s = conn->sock;
+
+ if (setsockopt(s, SOL_SOCKET, SO_RCVBUF, &((int){ 0 }), sizeof(int)))
+ debug("TCP: failed to set SO_RCVBUF to minimum value");
+
+ /* Dump window data as it is for the target, before touching stuff */
+ tcp_flow_repair_wnd(s, &t->sock_snd_wl1, &t->sock_snd_wnd,
+ &t->sock_max_window, &t->sock_rcv_wnd,
+ &t->sock_rcv_wup, &wnd);
+
+ wnd.rcv_wnd = 0;
+
+ if (setsockopt(s, IPPROTO_TCP, TCP_REPAIR_WINDOW, &wnd, sl))
+ debug_perror("Setting window repair data, socket %i", s);
+
+ return 0;
+}
+
+/**
+ * tcp_flow_migrate_source() - Send data (flow table part) for a single flow
+ * @c: Execution context
+ * @fd: Descriptor for state migration
+ * @conn: Pointer to the TCP connection structure
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+int tcp_flow_migrate_source(int fd, const struct tcp_tap_conn *conn)
+{
+ struct tcp_tap_transfer t = {
+ .retrans = conn->retrans,
+ .ws_from_tap = conn->ws_from_tap,
+ .ws_to_tap = conn->ws_to_tap,
+ .events = conn->events,
+
+ .tap_mss = htonl(MSS_GET(conn)),
+
+ .sndbuf = htonl(conn->sndbuf),
+
+ .flags = conn->flags,
+ .seq_dup_ack_approx = conn->seq_dup_ack_approx,
+
+ .wnd_from_tap = htons(conn->wnd_from_tap),
+ .wnd_to_tap = htons(conn->wnd_to_tap),
+
+ .seq_to_tap = htonl(conn->seq_to_tap),
+ .seq_ack_from_tap = htonl(conn->seq_ack_from_tap),
+ .seq_from_tap = htonl(conn->seq_from_tap),
+ .seq_ack_to_tap = htonl(conn->seq_ack_to_tap),
+ .seq_init_from_tap = htonl(conn->seq_init_from_tap),
+ };
+ int rc;
+
+ memcpy(&t.pif, conn->f.pif, sizeof(t.pif));
+ memcpy(&t.side, conn->f.side, sizeof(t.side));
+
+ if (write_all_buf(fd, &t, sizeof(t))) {
+ rc = -errno;
+ err_perror("Failed to write migration data for socket %i",
+ conn->sock);
+ return rc;
+ }
+
+ return 0;
+}
+
+/**
+ * tcp_flow_migrate_source_ext() - Dump queues, close sockets, send final data
+ * @fd: Descriptor for state migration
+ * @fidx: Flow index
+ * @conn: Pointer to the TCP connection structure
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+int tcp_flow_migrate_source_ext(int fd, int fidx,
+ const struct tcp_tap_conn *conn)
+{
+ struct tcp_tap_transfer_ext *t = &migrate_ext[fidx];
+ struct tcp_repair_window wnd;
+ int s = conn->sock;
+ int rc;
+
+ /* FIXME: Reenable dump in tcp_flow_migrate_shrink_window() */
+ tcp_flow_repair_wnd(s, &t->sock_snd_wl1, &t->sock_snd_wnd,
+ &t->sock_max_window, &t->sock_rcv_wnd,
+ &t->sock_rcv_wup, &wnd);
+
+ rc = tcp_flow_repair_seq(s, &t->sock_seq_snd, &t->sock_seq_rcv, false);
+ if (rc) {
+ err("Failed to get sequences on source for socket %i: %s",
+ s, strerror_(-rc));
+ return rc;
+ }
+
+ tcp_flow_repair_opt(s, &t->snd_wscale, &t->rcv_wscale, &t->sock_mss,
+ false);
+
+ /* Dump receive queue as late as possible */
+ rc = tcp_flow_repair_queues(s, tcp_migrate_snd_queue,
+ &t->sndlen, &t->notsentlen,
+ tcp_migrate_rcv_queue, &t->rcvlen, false);
+ if (rc) {
+ err("Failed to dump queues on source for socket %i: %s",
+ s, strerror_(-rc));
+ return rc;
+ }
+
+ if (ntohl(t->sndlen) > TCP_MIGRATE_SND_QUEUE_MAX ||
+ ntohl(t->notsentlen) > ntohl(t->sndlen) ||
+ ntohl(t->rcvlen) > TCP_MIGRATE_RCV_QUEUE_MAX) {
+ err("Bad data queues length, socket %i, send: %u, receive: %u",
+ s, ntohl(t->sndlen), ntohl(t->rcvlen));
+ return -EINVAL;
+ }
+
+ /* FIXME: it's either this or flow_migrate_source_early(), why? */
+ close(s);
+
+ if (write_all_buf(fd, t, sizeof(*t))) {
+ rc = -errno;
+ err_perror("Failed to write extended data for socket %i", s);
+ return rc;
+ }
+
+ if (write_all_buf(fd, tcp_migrate_snd_queue, ntohl(t->sndlen))) {
+ rc = -errno;
+ err_perror("Failed to write send queue data for socket %i", s);
+ return rc;
+ }
+
+ if (write_all_buf(fd, tcp_migrate_rcv_queue, ntohl(t->rcvlen))) {
+ rc = -errno;
+ err_perror("Failed to write receive queue data for socket %i",
+ s);
+ return rc;
+ }
+
+ return 0;
+}
+
+/**
+ * tcp_flow_repair_socket() - Open and bind socket, request repair mode
+ * @c: Execution context
+ * @conn: Pointer to the TCP connection structure
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+int tcp_flow_repair_socket(struct ctx *c, struct tcp_tap_conn *conn)
+{
+ sa_family_t af = CONN_V4(conn) ? AF_INET : AF_INET6;
+ const struct flowside *sockside = HOSTFLOW(conn);
+ union sockaddr_inany a;
+ socklen_t sl;
+ int s, rc;
+
+ pif_sockaddr(c, &a, &sl, PIF_HOST, &sockside->oaddr, sockside->oport);
+
+ if ((conn->sock = socket(af, SOCK_STREAM | SOCK_NONBLOCK | SOCK_CLOEXEC,
+ IPPROTO_TCP)) < 0) {
+ rc = -errno;
+ err_perror("Failed to create socket for migrated flow");
+ return rc;
+ }
+ s = conn->sock;
+
+ setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &(int){ 1 }, sizeof(int));
+
+ tcp_sock_set_bufsize(c, s);
+ tcp_sock_set_nodelay(s);
+
+ if (bind(s, &a.sa, sizeof(a)) < 0) {
+ rc = -errno;
+ err_perror("Failed to bind socket %i for migrated flow", s);
+ close(s);
+ conn->sock = -1;
+ return rc;
+ }
+
+ rc = tcp_flow_repair_on(c, conn);
+ if (rc) {
+ close(s);
+ conn->sock = -1;
+ return rc;
+ }
+
+ return 0;
+}
+
+/**
+ * tcp_flow_repair_connect() - Connect socket in repair mode, then turn it off
+ * @c: Execution context
+ * @conn: Pointer to the TCP connection structure
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int tcp_flow_repair_connect(const struct ctx *c,
+ struct tcp_tap_conn *conn)
+{
+ const struct flowside *tgt = HOSTFLOW(conn);
+ int rc;
+
+ rc = flowside_connect(c, conn->sock, PIF_HOST, tgt);
+ if (rc) {
+ rc = -errno;
+ err_perror("Failed to connect migrated socket %i", conn->sock);
+ return rc;
+ }
+
+ conn->in_epoll = 0;
+ conn->timer = -1;
+ if ((rc = tcp_epoll_ctl(c, conn))) {
+ debug("Failed to subscribe to epoll for migrated socket %i: %s",
+ conn->sock, strerror_(-rc));
+ }
+
+ return 0;
+}
+
+/**
+ * tcp_flow_migrate_target() - Receive data (flow table part) for flow, insert
+ * @c: Execution context
+ * @fd: Descriptor for state migration
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+int tcp_flow_migrate_target(struct ctx *c, int fd)
+{
+ struct tcp_tap_transfer t;
+ struct tcp_tap_conn *conn;
+ union flow *flow;
+ int rc;
+
+ if (!(flow = flow_alloc())) {
+ err("Flow table full on migration target");
+ return -ENOMEM;
+ }
+
+ if (read_all_buf(fd, &t, sizeof(t))) {
+ err_perror("Failed to receive migration data");
+ return -errno;
+ }
+
+ flow->f.state = FLOW_STATE_TGT;
+ memcpy(&flow->f.pif, &t.pif, sizeof(flow->f.pif));
+ memcpy(&flow->f.side, &t.side, sizeof(flow->f.side));
+ conn = FLOW_SET_TYPE(flow, FLOW_TCP, tcp);
+
+ conn->retrans = t.retrans;
+ conn->ws_from_tap = t.ws_from_tap;
+ conn->ws_to_tap = t.ws_to_tap;
+ conn->events = t.events;
+
+ conn->sndbuf = htonl(t.sndbuf);
+
+ conn->flags = t.flags;
+ conn->seq_dup_ack_approx = t.seq_dup_ack_approx;
+
+ MSS_SET(conn, ntohl(t.tap_mss));
+
+ conn->wnd_from_tap = ntohs(t.wnd_from_tap);
+ conn->wnd_to_tap = ntohs(t.wnd_to_tap);
+
+ conn->seq_to_tap = ntohl(t.seq_to_tap);
+ conn->seq_ack_from_tap = ntohl(t.seq_ack_from_tap);
+ conn->seq_from_tap = ntohl(t.seq_from_tap);
+ conn->seq_ack_to_tap = ntohl(t.seq_ack_to_tap);
+ conn->seq_init_from_tap = ntohl(t.seq_init_from_tap);
+
+ if ((rc = tcp_flow_repair_socket(c, conn)))
+ return rc;
+
+ flow_hash_insert(c, TAP_SIDX(conn));
+ FLOW_ACTIVATE(conn);
+
+ return 0;
+}
+
+/**
+ * tcp_flow_migrate_target_ext() - Receive extended data for flow, set, connect
+ * @c: Execution context
+ * @flow: Existing flow for this connection data
+ * @fd: Descriptor for state migration
+ *
+ * Return: 0 on success, negative code on failure, but 0 on connection reset
+ */
+int tcp_flow_migrate_target_ext(struct ctx *c, union flow *flow, int fd)
+{
+ struct tcp_tap_conn *conn = &flow->tcp;
+ struct tcp_tap_transfer_ext t;
+ uint32_t peek_offset, len;
+ int s = conn->sock, rc;
+
+ if (read_all_buf(fd, &t, sizeof(t))) {
+ rc = -errno;
+ err_perror("Failed to read extended data for socket %i", s);
+ return rc;
+ }
+
+ if (ntohl(t.sndlen) > TCP_MIGRATE_SND_QUEUE_MAX ||
+ ntohl(t.notsentlen) > ntohl(t.sndlen) ||
+ ntohl(t.rcvlen) > TCP_MIGRATE_RCV_QUEUE_MAX) {
+ err("Bad data queues length, socket %i, send: %u, receive: %u",
+ s, ntohl(t.sndlen), ntohl(t.rcvlen));
+ return -EINVAL;
+ }
+
+ if (read_all_buf(fd, tcp_migrate_snd_queue, ntohl(t.sndlen))) {
+ rc = -errno;
+ err_perror("Failed to read send queue data for socket %i", s);
+ return rc;
+ }
+
+ if (read_all_buf(fd, tcp_migrate_rcv_queue, ntohl(t.rcvlen))) {
+ rc = -errno;
+ err_perror("Failed to read receive queue data for socket %i",
+ s);
+ return rc;
+ }
+
+ if ((rc = tcp_flow_repair_seq(s, &t.sock_seq_snd, &t.sock_seq_rcv,
+ true)))
+ return rc;
+
+ if ((rc = tcp_flow_repair_connect(c, conn)))
+ return rc;
+
+ if ((rc = tcp_flow_repair_opt(s, &t.snd_wscale, &t.rcv_wscale,
+ &t.sock_mss, true)))
+ return rc;
+
+ if ((rc = tcp_flow_repair_queues(s,
+ tcp_migrate_snd_queue, &t.sndlen,
+ &((uint32_t){ 0 }), /* Sent only */
+ tcp_migrate_rcv_queue, &t.rcvlen,
+ true)))
+ debug_perror("Error while repairing queues for socket %i", s);
+
+ if ((rc = tcp_flow_repair_wnd(s, &t.sock_snd_wl1, &t.sock_snd_wnd,
+ &t.sock_max_window, &t.sock_rcv_wnd,
+ &t.sock_rcv_wup, NULL)))
+ debug_perror("Error while repairing window for socket %i", s);
+
+ tcp_flow_repair_off(c, conn);
+ repair_flush(c); /* FIXME: batch this? */
+
+ /* Now the unsent part of send queue */
+ len = htonl(ntohl(t.sndlen) - ntohl(t.notsentlen));
+ if ((rc = tcp_flow_repair_queues(s,
+ tcp_migrate_snd_queue + ntohl(len),
+ &((uint32_t){ 0 }), &len,
+ NULL, &((uint32_t){ 0 }),
+ true)))
+ debug_perror("Sending unsent data for socket %i", s);
+
+ peek_offset = conn->seq_to_tap - conn->seq_ack_from_tap;
+ if (tcp_set_peek_offset(conn->sock, peek_offset))
+ tcp_rst(c, conn);
+
+ tcp_send_flag(c, conn, ACK);
+
+ return 0;
+}
diff --git a/tcp_conn.h b/tcp_conn.h
index d342680..b64e857 100644
--- a/tcp_conn.h
+++ b/tcp_conn.h
@@ -96,6 +96,89 @@ struct tcp_tap_conn {
uint32_t seq_init_from_tap;
};
+/**
+ * struct tcp_tap_transfer - Migrated TCP data, flow table part, network order
+ * @pif: Interfaces for each side of the flow
+ * @side: Addresses and ports for each side of the flow
+ * @retrans: Number of retransmissions that occurred due to ACK_TIMEOUT
+ * @ws_from_tap: Window scaling factor advertised from tap/guest
+ * @ws_to_tap: Window scaling factor advertised to tap/guest
+ * @events: Connection events, implying connection states
+ * @tap_mss: MSS advertised by tap/guest, rounded to 2 ^ TCP_MSS_BITS
+ * @sndbuf: Sending buffer in kernel, rounded to 2 ^ SNDBUF_BITS
+ * @flags: Connection flags representing internal attributes
+ * @seq_dup_ack_approx: Last duplicate ACK number sent to tap
+ * @wnd_from_tap: Last window size from tap, unscaled (as received)
+ * @wnd_to_tap: Sending window advertised to tap, unscaled (as sent)
+ * @seq_to_tap: Next sequence for packets to tap
+ * @seq_ack_from_tap: Last ACK number received from tap
+ * @seq_from_tap: Next sequence for packets from tap (not actually sent)
+ * @seq_ack_to_tap: Last ACK number sent to tap
+ * @seq_init_from_tap: Initial sequence number from tap
+ */
+struct tcp_tap_transfer {
+ uint8_t pif[SIDES];
+ struct flowside side[SIDES];
+
+ uint8_t retrans;
+ uint8_t ws_from_tap;
+ uint8_t ws_to_tap;
+ uint8_t events;
+
+ uint32_t tap_mss;
+
+ uint32_t sndbuf;
+
+ uint8_t flags;
+ uint8_t seq_dup_ack_approx;
+
+ uint16_t wnd_from_tap;
+ uint16_t wnd_to_tap;
+
+ uint32_t seq_to_tap;
+ uint32_t seq_ack_from_tap;
+ uint32_t seq_from_tap;
+ uint32_t seq_ack_to_tap;
+ uint32_t seq_init_from_tap;
+} __attribute__((packed, aligned(__alignof__(uint32_t))));
+
+/**
+ * struct tcp_tap_transfer_ext - Migrated TCP data, outside flow, network order
+ * @sock_seq_snd: Socket-side send sequence
+ * @sock_seq_rcv: Socket-side receive sequence
+ * @sndlen: Length of pending send queue (unacknowledged / not sent)
+ * @notsentlen: Part of pending send queue that wasn't sent out yet
+ * @rcvlen: Length of pending receive queue
+ * @sock_mss: Socket-side MSS
+ * @sock_snd_wl1: Next sequence used in window probe (next sequence - 1)
+ * @sock_snd_wnd: Socket-side sending window
+ * @sock_max_window: Window clamp
+ * @sock_rcv_wnd: Socket-side receive window
+ * @sock_rcv_wup: rcv_nxt on last window update sent
+ * @snd_wscale: Window scaling factor, send
+ * @rcv_wscale: Window scaling factor, receive
+ */
+struct tcp_tap_transfer_ext {
+ uint32_t sock_seq_snd;
+ uint32_t sock_seq_rcv;
+
+ uint32_t sndlen;
+ uint32_t notsentlen;
+ uint32_t rcvlen;
+
+ uint32_t sock_mss;
+
+ /* We can't just use struct tcp_repair_window: we need network order */
+ uint32_t sock_snd_wl1;
+ uint32_t sock_snd_wnd;
+ uint32_t sock_max_window;
+ uint32_t sock_rcv_wnd;
+ uint32_t sock_rcv_wup;
+
+ uint8_t snd_wscale;
+ uint8_t rcv_wscale;
+} __attribute__((packed, aligned(__alignof__(uint32_t))));
+
/**
* struct tcp_splice_conn - Descriptor for a spliced TCP connection
* @f: Generic flow information
@@ -140,6 +223,18 @@ extern int init_sock_pool4 [TCP_SOCK_POOL_SIZE];
extern int init_sock_pool6 [TCP_SOCK_POOL_SIZE];
bool tcp_flow_defer(const struct tcp_tap_conn *conn);
+
+int tcp_flow_repair_on(struct ctx *c, const struct tcp_tap_conn *conn);
+int tcp_flow_repair_off(struct ctx *c, const struct tcp_tap_conn *conn);
+
+int tcp_flow_migrate_shrink_window(int fidx, const struct tcp_tap_conn *conn);
+int tcp_flow_migrate_source(int fd, const struct tcp_tap_conn *conn);
+int tcp_flow_migrate_source_ext(int fd, int fidx,
+ const struct tcp_tap_conn *conn);
+
+int tcp_flow_migrate_target(struct ctx *c, int fd);
+int tcp_flow_migrate_target_ext(struct ctx *c, union flow *flow, int fd);
+
bool tcp_splice_flow_defer(struct tcp_splice_conn *conn);
void tcp_splice_timer(const struct ctx *c, struct tcp_splice_conn *conn);
int tcp_conn_pool_sock(int pool[]);
--
@@ -96,6 +96,89 @@ struct tcp_tap_conn {
uint32_t seq_init_from_tap;
};
+/**
+ * struct tcp_tap_transfer - Migrated TCP data, flow table part, network order
+ * @pif: Interfaces for each side of the flow
+ * @side: Addresses and ports for each side of the flow
+ * @retrans: Number of retransmissions that occurred due to ACK_TIMEOUT
+ * @ws_from_tap: Window scaling factor advertised from tap/guest
+ * @ws_to_tap: Window scaling factor advertised to tap/guest
+ * @events: Connection events, implying connection states
+ * @tap_mss: MSS advertised by tap/guest, rounded to 2 ^ TCP_MSS_BITS
+ * @sndbuf: Sending buffer in kernel, rounded to 2 ^ SNDBUF_BITS
+ * @flags: Connection flags representing internal attributes
+ * @seq_dup_ack_approx: Last duplicate ACK number sent to tap
+ * @wnd_from_tap: Last window size from tap, unscaled (as received)
+ * @wnd_to_tap: Sending window advertised to tap, unscaled (as sent)
+ * @seq_to_tap: Next sequence for packets to tap
+ * @seq_ack_from_tap: Last ACK number received from tap
+ * @seq_from_tap: Next sequence for packets from tap (not actually sent)
+ * @seq_ack_to_tap: Last ACK number sent to tap
+ * @seq_init_from_tap: Initial sequence number from tap
+ */
+struct tcp_tap_transfer {
+ uint8_t pif[SIDES];
+ struct flowside side[SIDES];
+
+ uint8_t retrans;
+ uint8_t ws_from_tap;
+ uint8_t ws_to_tap;
+ uint8_t events;
+
+ uint32_t tap_mss;
+
+ uint32_t sndbuf;
+
+ uint8_t flags;
+ uint8_t seq_dup_ack_approx;
+
+ uint16_t wnd_from_tap;
+ uint16_t wnd_to_tap;
+
+ uint32_t seq_to_tap;
+ uint32_t seq_ack_from_tap;
+ uint32_t seq_from_tap;
+ uint32_t seq_ack_to_tap;
+ uint32_t seq_init_from_tap;
+} __attribute__((packed, aligned(__alignof__(uint32_t))));
+
+/**
+ * struct tcp_tap_transfer_ext - Migrated TCP data, outside flow, network order
+ * @sock_seq_snd: Socket-side send sequence
+ * @sock_seq_rcv: Socket-side receive sequence
+ * @sndlen: Length of pending send queue (unacknowledged / not sent)
+ * @notsentlen: Part of pending send queue that wasn't sent out yet
+ * @rcvlen: Length of pending receive queue
+ * @sock_mss: Socket-side MSS
+ * @sock_snd_wl1: Next sequence used in window probe (next sequence - 1)
+ * @sock_snd_wnd: Socket-side sending window
+ * @sock_max_window: Window clamp
+ * @sock_rcv_wnd: Socket-side receive window
+ * @sock_rcv_wup: rcv_nxt on last window update sent
+ * @snd_wscale: Window scaling factor, send
+ * @rcv_wscale: Window scaling factor, receive
+ */
+struct tcp_tap_transfer_ext {
+ uint32_t sock_seq_snd;
+ uint32_t sock_seq_rcv;
+
+ uint32_t sndlen;
+ uint32_t notsentlen;
+ uint32_t rcvlen;
+
+ uint32_t sock_mss;
+
+ /* We can't just use struct tcp_repair_window: we need network order */
+ uint32_t sock_snd_wl1;
+ uint32_t sock_snd_wnd;
+ uint32_t sock_max_window;
+ uint32_t sock_rcv_wnd;
+ uint32_t sock_rcv_wup;
+
+ uint8_t snd_wscale;
+ uint8_t rcv_wscale;
+} __attribute__((packed, aligned(__alignof__(uint32_t))));
+
/**
* struct tcp_splice_conn - Descriptor for a spliced TCP connection
* @f: Generic flow information
@@ -140,6 +223,18 @@ extern int init_sock_pool4 [TCP_SOCK_POOL_SIZE];
extern int init_sock_pool6 [TCP_SOCK_POOL_SIZE];
bool tcp_flow_defer(const struct tcp_tap_conn *conn);
+
+int tcp_flow_repair_on(struct ctx *c, const struct tcp_tap_conn *conn);
+int tcp_flow_repair_off(struct ctx *c, const struct tcp_tap_conn *conn);
+
+int tcp_flow_migrate_shrink_window(int fidx, const struct tcp_tap_conn *conn);
+int tcp_flow_migrate_source(int fd, const struct tcp_tap_conn *conn);
+int tcp_flow_migrate_source_ext(int fd, int fidx,
+ const struct tcp_tap_conn *conn);
+
+int tcp_flow_migrate_target(struct ctx *c, int fd);
+int tcp_flow_migrate_target_ext(struct ctx *c, union flow *flow, int fd);
+
bool tcp_splice_flow_defer(struct tcp_splice_conn *conn);
void tcp_splice_timer(const struct ctx *c, struct tcp_splice_conn *conn);
int tcp_conn_pool_sock(int pool[]);
--
2.43.0
next prev parent reply other threads:[~2025-02-09 22:20 UTC|newest]
Thread overview: 14+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-02-09 22:19 [PATCH v13 0/6] State migration, kind of draft again Stefano Brivio
2025-02-09 22:20 ` [PATCH v13 1/6] migrate: Skeleton of live migration logic Stefano Brivio
2025-02-10 2:26 ` David Gibson
2025-02-09 22:20 ` [PATCH v13 2/6] migrate: Migrate guest observed addresses Stefano Brivio
2025-02-09 22:20 ` [PATCH v13 3/6] Add interfaces and configuration bits for passt-repair Stefano Brivio
2025-02-10 2:59 ` David Gibson
2025-02-10 15:54 ` Stefano Brivio
2025-02-09 22:20 ` [PATCH v13 4/6] vhost_user: Make source quit after reporting migration state Stefano Brivio
2025-02-10 3:43 ` David Gibson
2025-02-09 22:20 ` Stefano Brivio [this message]
2025-02-10 6:05 ` [PATCH v13 5/6] migrate: Migrate TCP flows David Gibson
2025-02-10 9:51 ` Stefano Brivio
2025-02-10 15:54 ` Stefano Brivio
2025-02-09 22:20 ` [PATCH v13 6/6] test: Add migration tests Stefano Brivio
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20250209222005.1640077-6-sbrivio@redhat.com \
--to=sbrivio@redhat.com \
--cc=david@gibson.dropbear.id.au \
--cc=passt-dev@passt.top \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
Code repositories for project(s) associated with this public inbox
https://passt.top/passt
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for IMAP folder(s).