* [PATCH v10 01/10] debug: Add tcpdump to mbuto.img
2025-02-06 5:49 [PATCH v10 00/10] Draft state migration David Gibson
@ 2025-02-06 5:49 ` David Gibson
2025-02-06 9:14 ` Stefano Brivio
2025-02-06 5:49 ` [PATCH v10 02/10] migrate: Skeleton of live migration logic David Gibson
` (8 subsequent siblings)
9 siblings, 1 reply; 12+ messages in thread
From: David Gibson @ 2025-02-06 5:49 UTC (permalink / raw)
To: Stefano Brivio, passt-dev; +Cc: David Gibson
---
test/passt.mbuto | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/test/passt.mbuto b/test/passt.mbuto
index 138d3653..d4d57cb3 100755
--- a/test/passt.mbuto
+++ b/test/passt.mbuto
@@ -13,7 +13,7 @@
PROGS="${PROGS:-ash,dash,bash ip mount ls insmod mkdir ln cat chmod lsmod
modprobe find grep mknod mv rm umount jq iperf3 dhclient hostname
sed tr chown sipcalc cut socat dd strace ping tail killall sleep sysctl
- nproc tcp_rr tcp_crr udp_rr which tee seq bc sshd ssh-keygen cmp}"
+ nproc tcp_rr tcp_crr udp_rr which tee seq bc sshd ssh-keygen cmp tcpdump}"
# OpenSSH 9.8 introduced split binaries, with sshd being the daemon, and
# sshd-session the per-session program. We need the latter as well, and the path
@@ -65,6 +65,7 @@ EOF
# sshd via vsock
cat > /etc/passwd << EOF
root:x:0:0:root:/root:/bin/sh
+tcpdump:x:72:72:tcpdump:/:/sbin/nologin
sshd:x:100:100:Privilege-separated SSH:/var/empty/sshd:/sbin/nologin
EOF
cat > /etc/shadow << EOF
--
@@ -13,7 +13,7 @@
PROGS="${PROGS:-ash,dash,bash ip mount ls insmod mkdir ln cat chmod lsmod
modprobe find grep mknod mv rm umount jq iperf3 dhclient hostname
sed tr chown sipcalc cut socat dd strace ping tail killall sleep sysctl
- nproc tcp_rr tcp_crr udp_rr which tee seq bc sshd ssh-keygen cmp}"
+ nproc tcp_rr tcp_crr udp_rr which tee seq bc sshd ssh-keygen cmp tcpdump}"
# OpenSSH 9.8 introduced split binaries, with sshd being the daemon, and
# sshd-session the per-session program. We need the latter as well, and the path
@@ -65,6 +65,7 @@ EOF
# sshd via vsock
cat > /etc/passwd << EOF
root:x:0:0:root:/root:/bin/sh
+tcpdump:x:72:72:tcpdump:/:/sbin/nologin
sshd:x:100:100:Privilege-separated SSH:/var/empty/sshd:/sbin/nologin
EOF
cat > /etc/shadow << EOF
--
2.48.1
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [PATCH v10 02/10] migrate: Skeleton of live migration logic
2025-02-06 5:49 [PATCH v10 00/10] Draft state migration David Gibson
2025-02-06 5:49 ` [PATCH v10 01/10] debug: Add tcpdump to mbuto.img David Gibson
@ 2025-02-06 5:49 ` David Gibson
2025-02-06 5:49 ` [PATCH v10 03/10] fixup: Fix errors in modes that don't support migration David Gibson
` (7 subsequent siblings)
9 siblings, 0 replies; 12+ messages in thread
From: David Gibson @ 2025-02-06 5:49 UTC (permalink / raw)
To: Stefano Brivio, passt-dev; +Cc: David Gibson, David Gibson
From: Stefano Brivio <sbrivio@redhat.com>
Introduce facilities for guest migration on top of the current
vhost-user infrastructure, moving vu_migrate() and related functions
to migrate.c.
Versioned migration stages define function pointers to be called on
source or target, or data sections that need to be transferred.
The migration header consists of a magic number, a version number for the
encoding, and a "compat_version" which represents the oldest version which
is compatible with the current one. We don't use it yet, but that allows
for the future possibility of backwards compatible protocol extensions.
Co-authored-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Message-ID: <20250205230919.205302-2-sbrivio@redhat.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
Makefile | 12 +--
epoll_type.h | 2 -
migrate.c | 217 +++++++++++++++++++++++++++++++++++++++++++++++++++
migrate.h | 56 +++++++++++++
passt.c | 7 +-
passt.h | 8 ++
util.h | 29 +++++++
vhost_user.c | 60 +++-----------
virtio.h | 4 -
vu_common.c | 49 +-----------
vu_common.h | 2 +-
11 files changed, 331 insertions(+), 115 deletions(-)
create mode 100644 migrate.c
create mode 100644 migrate.h
diff --git a/Makefile b/Makefile
index d3d4b780..be89b071 100644
--- a/Makefile
+++ b/Makefile
@@ -38,8 +38,8 @@ FLAGS += -DDUAL_STACK_SOCKETS=$(DUAL_STACK_SOCKETS)
PASST_SRCS = arch.c arp.c checksum.c conf.c dhcp.c dhcpv6.c flow.c fwd.c \
icmp.c igmp.c inany.c iov.c ip.c isolation.c lineread.c log.c mld.c \
- ndp.c netlink.c packet.c passt.c pasta.c pcap.c pif.c tap.c tcp.c \
- tcp_buf.c tcp_splice.c tcp_vu.c udp.c udp_flow.c udp_vu.c util.c \
+ ndp.c netlink.c migrate.c packet.c passt.c pasta.c pcap.c pif.c tap.c \
+ tcp.c tcp_buf.c tcp_splice.c tcp_vu.c udp.c udp_flow.c udp_vu.c util.c \
vhost_user.c virtio.c vu_common.c
QRAP_SRCS = qrap.c
PASST_REPAIR_SRCS = passt-repair.c
@@ -49,10 +49,10 @@ MANPAGES = passt.1 pasta.1 qrap.1 passt-repair.1
PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h flow.h fwd.h \
flow_table.h icmp.h icmp_flow.h inany.h iov.h ip.h isolation.h \
- lineread.h log.h ndp.h netlink.h packet.h passt.h pasta.h pcap.h pif.h \
- siphash.h tap.h tcp.h tcp_buf.h tcp_conn.h tcp_internal.h tcp_splice.h \
- tcp_vu.h udp.h udp_flow.h udp_internal.h udp_vu.h util.h vhost_user.h \
- virtio.h vu_common.h
+ lineread.h log.h migrate.h ndp.h netlink.h packet.h passt.h pasta.h \
+ pcap.h pif.h siphash.h tap.h tcp.h tcp_buf.h tcp_conn.h tcp_internal.h \
+ tcp_splice.h tcp_vu.h udp.h udp_flow.h udp_internal.h udp_vu.h util.h \
+ vhost_user.h virtio.h vu_common.h
HEADERS = $(PASST_HEADERS) seccomp.h
C := \#include <sys/random.h>\nint main(){int a=getrandom(0, 0, 0);}
diff --git a/epoll_type.h b/epoll_type.h
index fd9eac39..f3ef4158 100644
--- a/epoll_type.h
+++ b/epoll_type.h
@@ -40,8 +40,6 @@ enum epoll_type {
EPOLL_TYPE_VHOST_CMD,
/* vhost-user kick event socket */
EPOLL_TYPE_VHOST_KICK,
- /* vhost-user migration socket */
- EPOLL_TYPE_VHOST_MIGRATION,
EPOLL_NUM_TYPES,
};
diff --git a/migrate.c b/migrate.c
new file mode 100644
index 00000000..669016e1
--- /dev/null
+++ b/migrate.c
@@ -0,0 +1,217 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+/* PASST - Plug A Simple Socket Transport
+ * for qemu/UNIX domain socket mode
+ *
+ * PASTA - Pack A Subtle Tap Abstraction
+ * for network namespace/tap device mode
+ *
+ * migrate.c - Migration sections, layout, and routines
+ *
+ * Copyright (c) 2025 Red Hat GmbH
+ * Author: Stefano Brivio <sbrivio@redhat.com>
+ */
+
+#include <errno.h>
+#include <sys/uio.h>
+
+#include "util.h"
+#include "ip.h"
+#include "passt.h"
+#include "inany.h"
+#include "flow.h"
+#include "flow_table.h"
+
+#include "migrate.h"
+
+/* Magic identifier for migration data */
+#define MIGRATE_MAGIC 0xB1BB1D1B0BB1D1B0
+
+/* Stages for version 1 */
+static const struct migrate_stage stages_v1[] = {
+ { 0 },
+};
+
+/* Supported encoding versions, from latest (most preferred) to oldest */
+static const struct migrate_version versions[] = {
+ { 1, stages_v1, },
+ { 0 },
+};
+
+/* Current encoding version */
+#define CURRENT_VERSION (&versions[0])
+
+/**
+ * migrate_source() - Migration as source, send state to hypervisor
+ * @c: Execution context
+ * @fd: File descriptor for state transfer
+ *
+ * Return: 0 on success, positive error code on failure
+ */
+static int migrate_source(struct ctx *c, int fd)
+{
+ const struct migrate_version *v = CURRENT_VERSION;
+ const struct migrate_header header = {
+ .magic = htonll_constant(MIGRATE_MAGIC),
+ .version = htonl(v->id),
+ .compat_version = htonl(v->id),
+ };
+ const struct migrate_stage *s;
+ int ret;
+
+ ret = write_all_buf(fd, &header, sizeof(header));
+ if (ret) {
+ err("Can't send migration header: %s, abort", strerror_(ret));
+ return ret;
+ }
+
+ for (s = v->s; s->name; s++) {
+ if (!s->source)
+ continue;
+
+ debug("Source side migration: %s", s->name);
+
+ if ((ret = s->source(c, s, fd))) {
+ err("Source migration stage %s: %s, abort", s->name,
+ strerror_(ret));
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+/**
+ * migrate_target_read_header() - Read header in target
+ * @fd: Descriptor for state transfer
+ *
+ * Return: version structure on success, NULL on failure with errno set
+ */
+static const struct migrate_version *migrate_target_read_header(int fd)
+{
+ const struct migrate_version *v;
+ struct migrate_header h;
+ uint32_t id, compat_id;
+
+ if (read_all_buf(fd, &h, sizeof(h)))
+ return NULL;
+
+ id = ntohl(h.version);
+ compat_id = ntohl(h.compat_version);
+
+ debug("Source magic: 0x%016" PRIx64 ", version: %u, compat: %u",
+ ntohll(h.magic), id, compat_id);
+
+ if (ntohll(h.magic) != MIGRATE_MAGIC || !id || !compat_id) {
+ err("Invalid incoming device state");
+ errno = EINVAL;
+ return NULL;
+ }
+
+ for (v = versions; v->id; v++)
+ if (v->id <= id && v->id >= compat_id)
+ return v;
+
+ errno = ENOTSUP;
+ err("Unsupported device state version: %u", id);
+ return NULL;
+}
+
+/**
+ * migrate_target() - Migration as target, receive state from hypervisor
+ * @c: Execution context
+ * @fd: File descriptor for state transfer
+ *
+ * Return: 0 on success, positive error code on failure
+ */
+static int migrate_target(struct ctx *c, int fd)
+{
+ const struct migrate_version *v;
+ const struct migrate_stage *s;
+ int ret;
+
+ v = migrate_target_read_header(fd);
+ if (!v)
+ return errno;
+
+ for (s = v->s; s->name; s++) {
+ if (!s->target)
+ continue;
+
+ debug("Target side migration: %s", s->name);
+
+ if ((ret = s->target(c, s, fd))) {
+ err("Target migration stage %s: %s, abort", s->name,
+ strerror_(ret));
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+/**
+ * migrate_init() - Set up things necessary for migration
+ * @c: Execution context
+ */
+void migrate_init(struct ctx *c)
+{
+ c->device_state_fd = -1;
+ c->device_state_result = -1;
+}
+
+/**
+ * migrate_close() - Close migration channel
+ * @c: Execution context
+ */
+void migrate_close(struct ctx *c)
+{
+ if (c->device_state_fd != -1) {
+ debug("Closing migration channel, fd: %d", c->device_state_fd);
+ close(c->device_state_fd);
+ c->device_state_fd = -1;
+ c->device_state_result = -1;
+ }
+}
+
+/**
+ * migrate_request() - Request a migration of device state
+ * @c: Execution context
+ * @fd: fd to transfer state
+ * @target: Are we the target of the migration?
+ */
+void migrate_request(struct ctx *c, int fd, bool target)
+{
+ debug("Migration requested, fd: %d (was %d)",
+ fd, c->device_state_fd);
+
+ if (c->device_state_fd != -1)
+ migrate_close(c);
+
+ c->device_state_fd = fd;
+ c->migrate_target = target;
+}
+
+/**
+ * migrate_handler() - Send/receive passt internal state to/from QEMU
+ * @c: Execution context
+ */
+void migrate_handler(struct ctx *c)
+{
+ int rc;
+
+ if (c->device_state_fd < 0)
+ return;
+
+ debug("migrate_handler fd %d target %d",
+ c->device_state_fd, c->migrate_target);
+
+ if (c->migrate_target)
+ rc = migrate_target(c, c->device_state_fd);
+ else
+ rc = migrate_source(c, c->device_state_fd);
+
+ migrate_close(c);
+
+ c->device_state_result = rc;
+}
diff --git a/migrate.h b/migrate.h
new file mode 100644
index 00000000..a5861120
--- /dev/null
+++ b/migrate.h
@@ -0,0 +1,56 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later
+ * Copyright (c) 2025 Red Hat GmbH
+ * Author: Stefano Brivio <sbrivio@redhat.com>
+ */
+
+#ifndef MIGRATE_H
+#define MIGRATE_H
+
+/**
+ * struct migrate_header - Migration header from source
+ * @magic: 0xB1BB1D1B0BB1D1B0, network order
+ * @version: Highest known, target aborts if too old, network order
+ * @compat_version: Lowest version compatible with @version, target aborts
+ * if too new, network order
+ */
+struct migrate_header {
+ uint64_t magic;
+ uint32_t version;
+ uint32_t compat_version;
+} __attribute__((packed));
+
+/**
+ * struct migrate_stage - Callbacks and parameters for one stage of migration
+ * @name: Stage name (for debugging)
+ * @source: Callback to implement this stage on the source
+ * @target: Callback to implement this stage on the target
+ * @iov: Optional data section to transfer
+ */
+struct migrate_stage {
+ const char *name;
+ int (*source)(struct ctx *c,
+ const struct migrate_stage *stage, int fd);
+ int (*target)(struct ctx *c,
+ const struct migrate_stage *stage, int fd);
+
+ /* FIXME: rollback callbacks? */
+
+ struct iovec iov;
+};
+
+/**
+ * struct migrate_version - Stages for a particular protocol version
+ * @id: Version number, host order
+ * @s: Ordered array of stages, NULL-terminated
+ */
+struct migrate_version {
+ uint32_t id;
+ const struct migrate_stage *s;
+};
+
+void migrate_init(struct ctx *c);
+void migrate_close(struct ctx *c);
+void migrate_request(struct ctx *c, int fd, bool target);
+void migrate_handler(struct ctx *c);
+
+#endif /* MIGRATE_H */
diff --git a/passt.c b/passt.c
index 53fdd38a..fc93a765 100644
--- a/passt.c
+++ b/passt.c
@@ -51,6 +51,7 @@
#include "tcp_splice.h"
#include "ndp.h"
#include "vu_common.h"
+#include "migrate.h"
#define EPOLL_EVENTS 8
@@ -75,7 +76,6 @@ char *epoll_type_str[] = {
[EPOLL_TYPE_TAP_LISTEN] = "listening qemu socket",
[EPOLL_TYPE_VHOST_CMD] = "vhost-user command socket",
[EPOLL_TYPE_VHOST_KICK] = "vhost-user kick socket",
- [EPOLL_TYPE_VHOST_MIGRATION] = "vhost-user migration socket",
};
static_assert(ARRAY_SIZE(epoll_type_str) == EPOLL_NUM_TYPES,
"epoll_type_str[] doesn't match enum epoll_type");
@@ -357,9 +357,6 @@ loop:
case EPOLL_TYPE_VHOST_KICK:
vu_kick_cb(c.vdev, ref, &now);
break;
- case EPOLL_TYPE_VHOST_MIGRATION:
- vu_migrate(c.vdev, eventmask);
- break;
default:
/* Can't happen */
ASSERT(0);
@@ -368,5 +365,7 @@ loop:
post_handler(&c, &now);
+ migrate_handler(&c);
+
goto loop;
}
diff --git a/passt.h b/passt.h
index 0dd4efa0..2255f182 100644
--- a/passt.h
+++ b/passt.h
@@ -235,6 +235,9 @@ struct ip6_ctx {
* @low_wmem: Low probed net.core.wmem_max
* @low_rmem: Low probed net.core.rmem_max
* @vdev: vhost-user device
+ * @device_state_fd: Device state migration channel
+ * @device_state_result: Device state migration result
+ * @migrate_target: Is this the target for next migration?
*/
struct ctx {
enum passt_modes mode;
@@ -300,6 +303,11 @@ struct ctx {
int low_rmem;
struct vu_dev *vdev;
+
+ /* Migration */
+ int device_state_fd;
+ int device_state_result;
+ bool migrate_target;
};
void proto_update_l2_buf(const unsigned char *eth_d,
diff --git a/util.h b/util.h
index 23b165c4..255eb262 100644
--- a/util.h
+++ b/util.h
@@ -122,14 +122,43 @@
(((x) & 0x0000ff00) << 8) | (((x) & 0x000000ff) << 24))
#endif
+#ifndef __bswap_constant_32
+#define __bswap_constant_32(x) \
+ ((((x) & 0xff000000) >> 24) | (((x) & 0x00ff0000) >> 8) | \
+ (((x) & 0x0000ff00) << 8) | (((x) & 0x000000ff) << 24))
+#endif
+
+#ifndef __bswap_constant_64
+#define __bswap_constant_64(x) \
+ ((((x) & 0xff00000000000000ULL) >> 56) | \
+ (((x) & 0x00ff000000000000ULL) >> 40) | \
+ (((x) & 0x0000ff0000000000ULL) >> 24) | \
+ (((x) & 0x000000ff00000000ULL) >> 8) | \
+ (((x) & 0x00000000ff000000ULL) << 8) | \
+ (((x) & 0x0000000000ff0000ULL) << 24) | \
+ (((x) & 0x000000000000ff00ULL) << 40) | \
+ (((x) & 0x00000000000000ffULL) << 56))
+#endif
+
#if __BYTE_ORDER == __BIG_ENDIAN
#define htons_constant(x) (x)
#define htonl_constant(x) (x)
+#define htonll_constant(x) (x)
+#define ntohs_constant(x) (x)
+#define ntohl_constant(x) (x)
+#define ntohll_constant(x) (x)
#else
#define htons_constant(x) (__bswap_constant_16(x))
#define htonl_constant(x) (__bswap_constant_32(x))
+#define htonll_constant(x) (__bswap_constant_64(x))
+#define ntohs_constant(x) (__bswap_constant_16(x))
+#define ntohl_constant(x) (__bswap_constant_32(x))
+#define ntohll_constant(x) (__bswap_constant_64(x))
#endif
+#define ntohll(x) (be64toh((x)))
+#define htonll(x) (htobe64((x)))
+
/**
* ntohl_unaligned() - Read 32-bit BE value from a possibly unaligned address
* @p: Pointer to the BE value in memory
diff --git a/vhost_user.c b/vhost_user.c
index 159f0b36..256c8ab6 100644
--- a/vhost_user.c
+++ b/vhost_user.c
@@ -44,6 +44,7 @@
#include "tap.h"
#include "vhost_user.h"
#include "pcap.h"
+#include "migrate.h"
/* vhost-user version we are compatible with */
#define VHOST_USER_VERSION 1
@@ -997,36 +998,6 @@ static bool vu_send_rarp_exec(struct vu_dev *vdev,
return false;
}
-/**
- * vu_set_migration_watch() - Add the migration file descriptor to epoll
- * @vdev: vhost-user device
- * @fd: File descriptor to add
- * @direction: Direction of the migration (save or load backend state)
- */
-static void vu_set_migration_watch(const struct vu_dev *vdev, int fd,
- uint32_t direction)
-{
- union epoll_ref ref = {
- .type = EPOLL_TYPE_VHOST_MIGRATION,
- .fd = fd,
- };
- struct epoll_event ev = { 0 };
-
- ev.data.u64 = ref.u64;
- switch (direction) {
- case VHOST_USER_TRANSFER_STATE_DIRECTION_SAVE:
- ev.events = EPOLLOUT;
- break;
- case VHOST_USER_TRANSFER_STATE_DIRECTION_LOAD:
- ev.events = EPOLLIN;
- break;
- default:
- ASSERT(0);
- }
-
- epoll_ctl(vdev->context->epollfd, EPOLL_CTL_ADD, ref.fd, &ev);
-}
-
/**
* vu_set_device_state_fd_exec() - Set the device state migration channel
* @vdev: vhost-user device
@@ -1051,16 +1022,8 @@ static bool vu_set_device_state_fd_exec(struct vu_dev *vdev,
direction != VHOST_USER_TRANSFER_STATE_DIRECTION_LOAD)
die("Invalide device_state_fd direction: %d", direction);
- if (vdev->device_state_fd != -1) {
- epoll_del(vdev->context, vdev->device_state_fd);
- close(vdev->device_state_fd);
- }
-
- vdev->device_state_fd = msg->fds[0];
- vdev->device_state_result = -1;
- vu_set_migration_watch(vdev, vdev->device_state_fd, direction);
-
- debug("Got device_state_fd: %d", vdev->device_state_fd);
+ migrate_request(vdev->context, msg->fds[0],
+ direction == VHOST_USER_TRANSFER_STATE_DIRECTION_LOAD);
/* We don't provide a new fd for the data transfer */
vmsg_set_reply_u64(msg, VHOST_USER_VRING_NOFD_MASK);
@@ -1075,12 +1038,11 @@ static bool vu_set_device_state_fd_exec(struct vu_dev *vdev,
*
* Return: True as the reply contains the migration result
*/
+/* cppcheck-suppress constParameterCallback */
static bool vu_check_device_state_exec(struct vu_dev *vdev,
struct vhost_user_msg *msg)
{
- (void)vdev;
-
- vmsg_set_reply_u64(msg, vdev->device_state_result);
+ vmsg_set_reply_u64(msg, vdev->context->device_state_result);
return true;
}
@@ -1106,8 +1068,8 @@ void vu_init(struct ctx *c)
}
c->vdev->log_table = NULL;
c->vdev->log_call_fd = -1;
- c->vdev->device_state_fd = -1;
- c->vdev->device_state_result = -1;
+
+ migrate_init(c);
}
@@ -1157,12 +1119,8 @@ void vu_cleanup(struct vu_dev *vdev)
vu_close_log(vdev);
- if (vdev->device_state_fd != -1) {
- epoll_del(vdev->context, vdev->device_state_fd);
- close(vdev->device_state_fd);
- vdev->device_state_fd = -1;
- vdev->device_state_result = -1;
- }
+ /* If we lose the VU dev, we also lose our migration channel */
+ migrate_close(vdev->context);
}
/**
diff --git a/virtio.h b/virtio.h
index 7bef2d27..0a59441b 100644
--- a/virtio.h
+++ b/virtio.h
@@ -106,8 +106,6 @@ struct vu_dev_region {
* @log_call_fd: Eventfd to report logging update
* @log_size: Size of the logging memory region
* @log_table: Base of the logging memory region
- * @device_state_fd: Device state migration channel
- * @device_state_result: Device state migration result
*/
struct vu_dev {
struct ctx *context;
@@ -119,8 +117,6 @@ struct vu_dev {
int log_call_fd;
uint64_t log_size;
uint8_t *log_table;
- int device_state_fd;
- int device_state_result;
};
/**
diff --git a/vu_common.c b/vu_common.c
index ab04d31e..48826b13 100644
--- a/vu_common.c
+++ b/vu_common.c
@@ -5,6 +5,7 @@
* common_vu.c - vhost-user common UDP and TCP functions
*/
+#include <errno.h>
#include <unistd.h>
#include <sys/uio.h>
#include <sys/eventfd.h>
@@ -17,6 +18,7 @@
#include "vhost_user.h"
#include "pcap.h"
#include "vu_common.h"
+#include "migrate.h"
#define VU_MAX_TX_BUFFER_NB 2
@@ -303,50 +305,3 @@ err:
return -1;
}
-
-/**
- * vu_migrate() - Send/receive passt insternal state to/from QEMU
- * @vdev: vhost-user device
- * @events: epoll events
- */
-void vu_migrate(struct vu_dev *vdev, uint32_t events)
-{
- int ret;
-
- /* TODO: collect/set passt internal state
- * and use vdev->device_state_fd to send/receive it
- */
- debug("vu_migrate fd %d events %x", vdev->device_state_fd, events);
- if (events & EPOLLOUT) {
- debug("Saving backend state");
-
- /* send some stuff */
- ret = write(vdev->device_state_fd, "PASST", 6);
- /* value to be returned by VHOST_USER_CHECK_DEVICE_STATE */
- vdev->device_state_result = ret == -1 ? -1 : 0;
- /* Closing the file descriptor signals the end of transfer */
- epoll_del(vdev->context, vdev->device_state_fd);
- close(vdev->device_state_fd);
- vdev->device_state_fd = -1;
- } else if (events & EPOLLIN) {
- char buf[6];
-
- debug("Loading backend state");
- /* read some stuff */
- ret = read(vdev->device_state_fd, buf, sizeof(buf));
- /* value to be returned by VHOST_USER_CHECK_DEVICE_STATE */
- if (ret != sizeof(buf)) {
- vdev->device_state_result = -1;
- } else {
- ret = strncmp(buf, "PASST", sizeof(buf));
- vdev->device_state_result = ret == 0 ? 0 : -1;
- }
- } else if (events & EPOLLHUP) {
- debug("Closing migration channel");
-
- /* The end of file signals the end of the transfer. */
- epoll_del(vdev->context, vdev->device_state_fd);
- close(vdev->device_state_fd);
- vdev->device_state_fd = -1;
- }
-}
diff --git a/vu_common.h b/vu_common.h
index d56c021a..f538f237 100644
--- a/vu_common.h
+++ b/vu_common.h
@@ -57,5 +57,5 @@ void vu_flush(const struct vu_dev *vdev, struct vu_virtq *vq,
void vu_kick_cb(struct vu_dev *vdev, union epoll_ref ref,
const struct timespec *now);
int vu_send_single(const struct ctx *c, const void *buf, size_t size);
-void vu_migrate(struct vu_dev *vdev, uint32_t events);
+
#endif /* VU_COMMON_H */
--
@@ -57,5 +57,5 @@ void vu_flush(const struct vu_dev *vdev, struct vu_virtq *vq,
void vu_kick_cb(struct vu_dev *vdev, union epoll_ref ref,
const struct timespec *now);
int vu_send_single(const struct ctx *c, const void *buf, size_t size);
-void vu_migrate(struct vu_dev *vdev, uint32_t events);
+
#endif /* VU_COMMON_H */
--
2.48.1
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [PATCH v10 03/10] fixup: Fix errors in modes that don't support migration
2025-02-06 5:49 [PATCH v10 00/10] Draft state migration David Gibson
2025-02-06 5:49 ` [PATCH v10 01/10] debug: Add tcpdump to mbuto.img David Gibson
2025-02-06 5:49 ` [PATCH v10 02/10] migrate: Skeleton of live migration logic David Gibson
@ 2025-02-06 5:49 ` David Gibson
2025-02-06 5:49 ` [PATCH v10 04/10] migrate: Migrate guest observed addresses David Gibson
` (6 subsequent siblings)
9 siblings, 0 replies; 12+ messages in thread
From: David Gibson @ 2025-02-06 5:49 UTC (permalink / raw)
To: Stefano Brivio, passt-dev; +Cc: David Gibson
Migration is only supported in vhost-user mode. However, we need to
initialise c->device_state_fd to -1 even in modes without migration, so
that migrate_handler() knows it doesn't have anything to do.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
migrate.c | 1 -
passt.c | 1 +
2 files changed, 1 insertion(+), 1 deletion(-)
diff --git a/migrate.c b/migrate.c
index 669016e1..b5d87547 100644
--- a/migrate.c
+++ b/migrate.c
@@ -156,7 +156,6 @@ static int migrate_target(struct ctx *c, int fd)
*/
void migrate_init(struct ctx *c)
{
- c->device_state_fd = -1;
c->device_state_result = -1;
}
diff --git a/passt.c b/passt.c
index fc93a765..935a69f1 100644
--- a/passt.c
+++ b/passt.c
@@ -202,6 +202,7 @@ int main(int argc, char **argv)
isolate_initial(argc, argv);
c.pasta_netns_fd = c.fd_tap = c.pidfile_fd = -1;
+ c.device_state_fd = -1;
sigemptyset(&sa.sa_mask);
sa.sa_flags = 0;
--
@@ -202,6 +202,7 @@ int main(int argc, char **argv)
isolate_initial(argc, argv);
c.pasta_netns_fd = c.fd_tap = c.pidfile_fd = -1;
+ c.device_state_fd = -1;
sigemptyset(&sa.sa_mask);
sa.sa_flags = 0;
--
2.48.1
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [PATCH v10 04/10] migrate: Migrate guest observed addresses
2025-02-06 5:49 [PATCH v10 00/10] Draft state migration David Gibson
` (2 preceding siblings ...)
2025-02-06 5:49 ` [PATCH v10 03/10] fixup: Fix errors in modes that don't support migration David Gibson
@ 2025-02-06 5:49 ` David Gibson
2025-02-06 5:49 ` [PATCH v10 05/10] Add interfaces and configuration bits for passt-repair David Gibson
` (5 subsequent siblings)
9 siblings, 0 replies; 12+ messages in thread
From: David Gibson @ 2025-02-06 5:49 UTC (permalink / raw)
To: Stefano Brivio, passt-dev; +Cc: David Gibson
Most of the information in struct ctx doesn't need to be migrated.
Either it's strictly back end information which is allowed to differ
between the two ends, or it must already be configured identically on
the two ends.
There are a few exceptions, though. In particular, passt learns several
addresses of the guest by observing what it sends out. If we lose
this information across migration, we might get away with it, but if
there are active flows, we might misdirect some packets before
re-learning the guest address.
Avoid this by migrating the guest's observed addresses.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
migrate.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 50 insertions(+)
diff --git a/migrate.c b/migrate.c
index b5d87547..ba8332df 100644
--- a/migrate.c
+++ b/migrate.c
@@ -27,8 +27,58 @@
/* Magic identifier for migration data */
#define MIGRATE_MAGIC 0xB1BB1D1B0BB1D1B0
+/**
+ * struct migrate_seen_addrs_v1 - Migratable guest addresses for v1 state stream
+ * @addr6: Observed guest IPv6 address
+ * @addr6_ll: Observed guest IPv6 link-local address
+ * @addr4: Observed guest IPv4 address
+ * @mac: Observed guest MAC address
+ */
+struct migrate_seen_addrs_v1 {
+ struct in6_addr addr6;
+ struct in6_addr addr6_ll;
+ struct in_addr addr4;
+ unsigned char mac[ETH_ALEN];
+} __attribute__((packed));
+
+static int seen_addrs_source_v1(struct ctx *c,
+ const struct migrate_stage *stage, int fd)
+{
+ struct migrate_seen_addrs_v1 addrs = {
+ .addr6 = c->ip6.addr_seen,
+ .addr6_ll = c->ip6.addr_ll_seen,
+ .addr4 = c->ip4.addr_seen,
+ };
+
+ (void)stage;
+ memcpy(addrs.mac, c->guest_mac, sizeof(addrs.mac));
+ return write_all_buf(fd, &addrs, sizeof(addrs));
+}
+
+static int seen_addrs_target_v1(struct ctx *c,
+ const struct migrate_stage *stage, int fd)
+{
+ struct migrate_seen_addrs_v1 addrs;
+
+ (void)stage;
+
+ if (read_all_buf(fd, &addrs, sizeof(addrs)))
+ return errno;
+
+ c->ip6.addr_seen = addrs.addr6;
+ c->ip6.addr_ll_seen = addrs.addr6_ll;
+ c->ip4.addr_seen = addrs.addr4;
+ memcpy(c->guest_mac, addrs.mac, sizeof(c->guest_mac));
+ return 0;
+}
+
/* Stages for version 1 */
static const struct migrate_stage stages_v1[] = {
+ {
+ .name = "seen addresses",
+ .source = seen_addrs_source_v1,
+ .target = seen_addrs_target_v1,
+ },
{ 0 },
};
--
@@ -27,8 +27,58 @@
/* Magic identifier for migration data */
#define MIGRATE_MAGIC 0xB1BB1D1B0BB1D1B0
+/**
+ * struct migrate_seen_addrs_v1 - Migratable guest addresses for v1 state stream
+ * @addr6: Observed guest IPv6 address
+ * @addr6_ll: Observed guest IPv6 link-local address
+ * @addr4: Observed guest IPv4 address
+ * @mac: Observed guest MAC address
+ */
+struct migrate_seen_addrs_v1 {
+ struct in6_addr addr6;
+ struct in6_addr addr6_ll;
+ struct in_addr addr4;
+ unsigned char mac[ETH_ALEN];
+} __attribute__((packed));
+
+static int seen_addrs_source_v1(struct ctx *c,
+ const struct migrate_stage *stage, int fd)
+{
+ struct migrate_seen_addrs_v1 addrs = {
+ .addr6 = c->ip6.addr_seen,
+ .addr6_ll = c->ip6.addr_ll_seen,
+ .addr4 = c->ip4.addr_seen,
+ };
+
+ (void)stage;
+ memcpy(addrs.mac, c->guest_mac, sizeof(addrs.mac));
+ return write_all_buf(fd, &addrs, sizeof(addrs));
+}
+
+static int seen_addrs_target_v1(struct ctx *c,
+ const struct migrate_stage *stage, int fd)
+{
+ struct migrate_seen_addrs_v1 addrs;
+
+ (void)stage;
+
+ if (read_all_buf(fd, &addrs, sizeof(addrs)))
+ return errno;
+
+ c->ip6.addr_seen = addrs.addr6;
+ c->ip6.addr_ll_seen = addrs.addr6_ll;
+ c->ip4.addr_seen = addrs.addr4;
+ memcpy(c->guest_mac, addrs.mac, sizeof(c->guest_mac));
+ return 0;
+}
+
/* Stages for version 1 */
static const struct migrate_stage stages_v1[] = {
+ {
+ .name = "seen addresses",
+ .source = seen_addrs_source_v1,
+ .target = seen_addrs_target_v1,
+ },
{ 0 },
};
--
2.48.1
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [PATCH v10 05/10] Add interfaces and configuration bits for passt-repair
2025-02-06 5:49 [PATCH v10 00/10] Draft state migration David Gibson
` (3 preceding siblings ...)
2025-02-06 5:49 ` [PATCH v10 04/10] migrate: Migrate guest observed addresses David Gibson
@ 2025-02-06 5:49 ` David Gibson
2025-02-06 5:49 ` [PATCH v10 06/10] vhost_user: Make source quit after reporting migration state David Gibson
` (4 subsequent siblings)
9 siblings, 0 replies; 12+ messages in thread
From: David Gibson @ 2025-02-06 5:49 UTC (permalink / raw)
To: Stefano Brivio, passt-dev; +Cc: David Gibson
From: Stefano Brivio <sbrivio@redhat.com>
In vhost-user mode, by default, create a second UNIX domain socket
accepting connections from passt-repair, in addition to the usual
listener socket.
When we need to set or clear TCP_REPAIR on sockets, we'll send them
via SCM_RIGHTS to passt-repair, which sets the socket option values we
ask for.
To that end, introduce batched functions to request TCP_REPAIR
settings on sockets, so that we don't have to send a single message
for each socket, on migration. When needed, repair_flush() will
send the message and check for the reply.
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Message-ID: <20250205003904.2797491-5-sbrivio@redhat.com>
Message-ID: <20250205230919.205302-3-sbrivio@redhat.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
Makefile | 12 ++--
conf.c | 44 ++++++++++--
epoll_type.h | 4 ++
migrate.c | 5 +-
passt.1 | 11 +++
passt.c | 9 +++
passt.h | 7 ++
repair.c | 193 +++++++++++++++++++++++++++++++++++++++++++++++++++
repair.h | 16 +++++
tap.c | 65 +----------------
util.c | 62 +++++++++++++++++
util.h | 1 +
12 files changed, 356 insertions(+), 73 deletions(-)
create mode 100644 repair.c
create mode 100644 repair.h
diff --git a/Makefile b/Makefile
index be89b071..d4e10967 100644
--- a/Makefile
+++ b/Makefile
@@ -38,9 +38,9 @@ FLAGS += -DDUAL_STACK_SOCKETS=$(DUAL_STACK_SOCKETS)
PASST_SRCS = arch.c arp.c checksum.c conf.c dhcp.c dhcpv6.c flow.c fwd.c \
icmp.c igmp.c inany.c iov.c ip.c isolation.c lineread.c log.c mld.c \
- ndp.c netlink.c migrate.c packet.c passt.c pasta.c pcap.c pif.c tap.c \
- tcp.c tcp_buf.c tcp_splice.c tcp_vu.c udp.c udp_flow.c udp_vu.c util.c \
- vhost_user.c virtio.c vu_common.c
+ ndp.c netlink.c migrate.c packet.c passt.c pasta.c pcap.c pif.c \
+ repair.c tap.c tcp.c tcp_buf.c tcp_splice.c tcp_vu.c udp.c udp_flow.c \
+ udp_vu.c util.c vhost_user.c virtio.c vu_common.c
QRAP_SRCS = qrap.c
PASST_REPAIR_SRCS = passt-repair.c
SRCS = $(PASST_SRCS) $(QRAP_SRCS) $(PASST_REPAIR_SRCS)
@@ -50,9 +50,9 @@ MANPAGES = passt.1 pasta.1 qrap.1 passt-repair.1
PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h flow.h fwd.h \
flow_table.h icmp.h icmp_flow.h inany.h iov.h ip.h isolation.h \
lineread.h log.h migrate.h ndp.h netlink.h packet.h passt.h pasta.h \
- pcap.h pif.h siphash.h tap.h tcp.h tcp_buf.h tcp_conn.h tcp_internal.h \
- tcp_splice.h tcp_vu.h udp.h udp_flow.h udp_internal.h udp_vu.h util.h \
- vhost_user.h virtio.h vu_common.h
+ pcap.h pif.h repair.h siphash.h tap.h tcp.h tcp_buf.h tcp_conn.h \
+ tcp_internal.h tcp_splice.h tcp_vu.h udp.h udp_flow.h udp_internal.h \
+ udp_vu.h util.h vhost_user.h virtio.h vu_common.h
HEADERS = $(PASST_HEADERS) seccomp.h
C := \#include <sys/random.h>\nint main(){int a=getrandom(0, 0, 0);}
diff --git a/conf.c b/conf.c
index 6817377a..dcfc1d64 100644
--- a/conf.c
+++ b/conf.c
@@ -816,6 +816,9 @@ static void usage(const char *name, FILE *f, int status)
" UNIX domain socket is provided by -s option\n"
" --print-capabilities print back-end capabilities in JSON format,\n"
" only meaningful for vhost-user mode\n");
+ FPRINTF(f,
+ " --repair-path PATH path for passt-repair(1)\n"
+ " default: append '.repair' to UNIX domain path\n");
}
FPRINTF(f,
@@ -1240,8 +1243,25 @@ static void conf_nat(const char *arg, struct in_addr *addr4,
*/
static void conf_open_files(struct ctx *c)
{
- if (c->mode != MODE_PASTA && c->fd_tap == -1)
- c->fd_tap_listen = tap_sock_unix_open(c->sock_path);
+ if (c->mode != MODE_PASTA && c->fd_tap == -1) {
+ c->fd_tap_listen = sock_unix(c->sock_path);
+
+ if (c->mode == MODE_VU && strcmp(c->repair_path, "none")) {
+ if (!*c->repair_path &&
+ snprintf_check(c->repair_path,
+ sizeof(c->repair_path), "%s.repair",
+ c->sock_path)) {
+ warn("passt-repair path %s not usable",
+ c->repair_path);
+ c->fd_repair_listen = -1;
+ } else {
+ c->fd_repair_listen = sock_unix(c->repair_path);
+ }
+ } else {
+ c->fd_repair_listen = -1;
+ }
+ c->fd_repair = -1;
+ }
if (*c->pidfile) {
c->pidfile_fd = output_file_open(c->pidfile, O_WRONLY);
@@ -1354,9 +1374,12 @@ void conf(struct ctx *c, int argc, char **argv)
{"host-lo-to-ns-lo", no_argument, NULL, 23 },
{"dns-host", required_argument, NULL, 24 },
{"vhost-user", no_argument, NULL, 25 },
+
/* vhost-user backend program convention */
{"print-capabilities", no_argument, NULL, 26 },
{"socket-path", required_argument, NULL, 's' },
+
+ {"repair-path", required_argument, NULL, 27 },
{ 0 },
};
const char *logname = (c->mode == MODE_PASTA) ? "pasta" : "passt";
@@ -1748,6 +1771,9 @@ void conf(struct ctx *c, int argc, char **argv)
case 'D':
/* Handle these later, once addresses are configured */
break;
+ case 27:
+ /* Handle this once we checked --vhost-user */
+ break;
case 'h':
usage(argv[0], stdout, EXIT_SUCCESS);
break;
@@ -1824,8 +1850,8 @@ void conf(struct ctx *c, int argc, char **argv)
if (c->ifi4 && IN4_IS_ADDR_UNSPECIFIED(&c->ip4.guest_gw))
c->no_dhcp = 1;
- /* Inbound port options & DNS can be parsed now (after IPv4/IPv6
- * settings)
+ /* Inbound port options, DNS, and --repair-path can be parsed now, after
+ * IPv4/IPv6 settings and --vhost-user.
*/
fwd_probe_ephemeral();
udp_portmap_clear();
@@ -1871,6 +1897,16 @@ void conf(struct ctx *c, int argc, char **argv)
}
die("Cannot use DNS address %s", optarg);
+ } else if (name == 27) {
+ if (c->mode != MODE_VU && strcmp(optarg, "none"))
+ die("--repair-path is for vhost-user mode only");
+
+ if (snprintf_check(c->repair_path,
+ sizeof(c->repair_path), "%s",
+ optarg))
+ die("Invalid passt-repair path: %s", optarg);
+
+ break;
}
} while (name != -1);
diff --git a/epoll_type.h b/epoll_type.h
index f3ef4158..7f2a1217 100644
--- a/epoll_type.h
+++ b/epoll_type.h
@@ -40,6 +40,10 @@ enum epoll_type {
EPOLL_TYPE_VHOST_CMD,
/* vhost-user kick event socket */
EPOLL_TYPE_VHOST_KICK,
+ /* TCP_REPAIR helper listening socket */
+ EPOLL_TYPE_REPAIR_LISTEN,
+ /* TCP_REPAIR helper socket */
+ EPOLL_TYPE_REPAIR,
EPOLL_NUM_TYPES,
};
diff --git a/migrate.c b/migrate.c
index ba8332df..f93cff4c 100644
--- a/migrate.c
+++ b/migrate.c
@@ -23,6 +23,7 @@
#include "flow_table.h"
#include "migrate.h"
+#include "repair.h"
/* Magic identifier for migration data */
#define MIGRATE_MAGIC 0xB1BB1D1B0BB1D1B0
@@ -210,7 +211,7 @@ void migrate_init(struct ctx *c)
}
/**
- * migrate_close() - Close migration channel
+ * migrate_close() - Close migration channel and connection to passt-repair
* @c: Execution context
*/
void migrate_close(struct ctx *c)
@@ -221,6 +222,8 @@ void migrate_close(struct ctx *c)
c->device_state_fd = -1;
c->device_state_result = -1;
}
+
+ repair_close(c);
}
/**
diff --git a/passt.1 b/passt.1
index d9cd33e3..63a3a01e 100644
--- a/passt.1
+++ b/passt.1
@@ -418,6 +418,17 @@ Enable vhost-user. The vhost-user command socket is provided by \fB--socket\fR.
.BR \-\-print-capabilities
Print back-end capabilities in JSON format, only meaningful for vhost-user mode.
+.TP
+.BR \-\-repair-path " " \fIpath
+Path for UNIX domain socket used by the \fBpasst-repair\fR(1) helper to connect
+to \fBpasst\fR in order to set or clear the TCP_REPAIR option on sockets, during
+migration. \fB--repair-path none\fR disables this interface (if you need to
+specify a socket path called "none" you can prefix the path by \fI./\fR).
+
+Default, for \-\-vhost-user mode only, is to append \fI.repair\fR to the path
+chosen for the hypervisor UNIX domain socket. No socket is created if not in
+\-\-vhost-user mode.
+
.TP
.BR \-F ", " \-\-fd " " \fIFD
Pass a pre-opened, connected socket to \fBpasst\fR. Usually the socket is opened
diff --git a/passt.c b/passt.c
index 935a69f1..6f9fb4d9 100644
--- a/passt.c
+++ b/passt.c
@@ -52,6 +52,7 @@
#include "ndp.h"
#include "vu_common.h"
#include "migrate.h"
+#include "repair.h"
#define EPOLL_EVENTS 8
@@ -76,6 +77,8 @@ char *epoll_type_str[] = {
[EPOLL_TYPE_TAP_LISTEN] = "listening qemu socket",
[EPOLL_TYPE_VHOST_CMD] = "vhost-user command socket",
[EPOLL_TYPE_VHOST_KICK] = "vhost-user kick socket",
+ [EPOLL_TYPE_REPAIR_LISTEN] = "TCP_REPAIR helper listening socket",
+ [EPOLL_TYPE_REPAIR] = "TCP_REPAIR helper socket",
};
static_assert(ARRAY_SIZE(epoll_type_str) == EPOLL_NUM_TYPES,
"epoll_type_str[] doesn't match enum epoll_type");
@@ -358,6 +361,12 @@ loop:
case EPOLL_TYPE_VHOST_KICK:
vu_kick_cb(c.vdev, ref, &now);
break;
+ case EPOLL_TYPE_REPAIR_LISTEN:
+ repair_listen_handler(&c, eventmask);
+ break;
+ case EPOLL_TYPE_REPAIR:
+ repair_handler(&c, eventmask);
+ break;
default:
/* Can't happen */
ASSERT(0);
diff --git a/passt.h b/passt.h
index 2255f182..4189a4a5 100644
--- a/passt.h
+++ b/passt.h
@@ -20,6 +20,7 @@ union epoll_ref;
#include "siphash.h"
#include "ip.h"
#include "inany.h"
+#include "migrate.h"
#include "flow.h"
#include "icmp.h"
#include "fwd.h"
@@ -193,6 +194,7 @@ struct ip6_ctx {
* @foreground: Run in foreground, don't log to stderr by default
* @nofile: Maximum number of open files (ulimit -n)
* @sock_path: Path for UNIX domain socket
+ * @repair_path: TCP_REPAIR helper path, can be "none", empty for default
* @pcap: Path for packet capture file
* @pidfile: Path to PID file, empty string if not configured
* @pidfile_fd: File descriptor for PID file, -1 if none
@@ -203,6 +205,8 @@ struct ip6_ctx {
* @epollfd: File descriptor for epoll instance
* @fd_tap_listen: File descriptor for listening AF_UNIX socket, if any
* @fd_tap: AF_UNIX socket, tuntap device, or pre-opened socket
+ * @fd_repair_listen: File descriptor for listening TCP_REPAIR socket, if any
+ * @fd_repair: Connected AF_UNIX socket for TCP_REPAIR helper
* @our_tap_mac: Pasta/passt's MAC on the tap link
* @guest_mac: MAC address of guest or namespace, seen or configured
* @hash_secret: 128-bit secret for siphash functions
@@ -247,6 +251,7 @@ struct ctx {
int foreground;
int nofile;
char sock_path[UNIX_PATH_MAX];
+ char repair_path[UNIX_PATH_MAX];
char pcap[PATH_MAX];
char pidfile[PATH_MAX];
@@ -263,6 +268,8 @@ struct ctx {
int epollfd;
int fd_tap_listen;
int fd_tap;
+ int fd_repair_listen;
+ int fd_repair;
unsigned char our_tap_mac[ETH_ALEN];
unsigned char guest_mac[ETH_ALEN];
uint64_t hash_secret[2];
diff --git a/repair.c b/repair.c
new file mode 100644
index 00000000..61519279
--- /dev/null
+++ b/repair.c
@@ -0,0 +1,193 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+/* PASST - Plug A Simple Socket Transport
+ * for qemu/UNIX domain socket mode
+ *
+ * PASTA - Pack A Subtle Tap Abstraction
+ * for network namespace/tap device mode
+ *
+ * repair.c - Interface (server) for passt-repair, set/clear TCP_REPAIR
+ *
+ * Copyright (c) 2025 Red Hat GmbH
+ * Author: Stefano Brivio <sbrivio@redhat.com>
+ */
+
+#include <errno.h>
+#include <sys/uio.h>
+
+#include "util.h"
+#include "ip.h"
+#include "passt.h"
+#include "inany.h"
+#include "flow.h"
+#include "flow_table.h"
+
+#include "repair.h"
+
+#define SCM_MAX_FD 253 /* From Linux kernel (include/net/scm.h), not in UAPI */
+
+static int repair_fds[SCM_MAX_FD];
+static int repair_cmd;
+static int repair_nfds;
+
+/**
+ * repair_sock_init() - Start listening for connections on helper socket
+ * @c: Execution context
+ */
+void repair_sock_init(const struct ctx *c)
+{
+ union epoll_ref ref = { .type = EPOLL_TYPE_REPAIR_LISTEN };
+ struct epoll_event ev = { 0 };
+
+ listen(c->fd_repair_listen, 0);
+
+ ref.fd = c->fd_repair_listen;
+ ev.events = EPOLLIN | EPOLLHUP | EPOLLET;
+ ev.data.u64 = ref.u64;
+ epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_repair_listen, &ev);
+}
+
+/**
+ * repair_listen_handler() - Handle events on TCP_REPAIR helper listening socket
+ * @c: Execution context
+ * @events: epoll events
+ */
+void repair_listen_handler(struct ctx *c, uint32_t events)
+{
+ union epoll_ref ref = { .type = EPOLL_TYPE_REPAIR };
+ struct epoll_event ev = { 0 };
+ struct ucred ucred;
+ socklen_t len;
+
+ if (events != EPOLLIN) {
+ debug("Spurious event 0x%04x on TCP_REPAIR helper socket",
+ events);
+ return;
+ }
+
+ len = sizeof(ucred);
+
+ /* Another client is already connected: accept and close right away. */
+ if (c->fd_repair != -1) {
+ int discard = accept4(c->fd_repair_listen, NULL, NULL,
+ SOCK_NONBLOCK);
+
+ if (discard == -1)
+ return;
+
+ if (!getsockopt(discard, SOL_SOCKET, SO_PEERCRED, &ucred, &len))
+ info("Discarding TCP_REPAIR helper, PID %i", ucred.pid);
+
+ close(discard);
+ return;
+ }
+
+ c->fd_repair = accept4(c->fd_repair_listen, NULL, NULL, 0);
+
+ if (!getsockopt(c->fd_repair, SOL_SOCKET, SO_PEERCRED, &ucred, &len))
+ info("Accepted TCP_REPAIR helper, PID %i", ucred.pid);
+
+ ref.fd = c->fd_repair;
+ ev.events = EPOLLHUP | EPOLLET;
+ ev.data.u64 = ref.u64;
+ epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_repair, &ev);
+}
+
+/**
+ * repair_close() - Close connection to TCP_REPAIR helper
+ * @c: Execution context
+ */
+void repair_close(struct ctx *c)
+{
+ debug("Closing TCP_REPAIR helper socket");
+
+ epoll_ctl(c->epollfd, EPOLL_CTL_DEL, c->fd_repair, NULL);
+ close(c->fd_repair);
+ c->fd_repair = -1;
+}
+
+/**
+ * repair_handler() - Handle EPOLLHUP and EPOLLERR on TCP_REPAIR helper socket
+ * @c: Execution context
+ * @events: epoll events
+ */
+void repair_handler(struct ctx *c, uint32_t events)
+{
+ (void)events;
+
+ repair_close(c);
+}
+
+/**
+ * repair_flush() - Flush current set of sockets to helper, with current command
+ * @c: Execution context
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+int repair_flush(struct ctx *c)
+{
+ struct iovec iov = { &((int8_t){ repair_cmd }), sizeof(int8_t) };
+ char buf[CMSG_SPACE(sizeof(int) * SCM_MAX_FD)]
+ __attribute__ ((aligned(__alignof__(struct cmsghdr))));
+ struct cmsghdr *cmsg;
+ struct msghdr msg;
+
+ if (!repair_nfds)
+ return 0;
+
+ msg = (struct msghdr){ NULL, 0, &iov, 1,
+ buf, CMSG_SPACE(sizeof(int) * repair_nfds), 0 };
+ cmsg = CMSG_FIRSTHDR(&msg);
+
+ cmsg->cmsg_level = SOL_SOCKET;
+ cmsg->cmsg_type = SCM_RIGHTS;
+ cmsg->cmsg_len = CMSG_LEN(sizeof(int) * repair_nfds);
+ memcpy(CMSG_DATA(cmsg), repair_fds, sizeof(int) * repair_nfds);
+
+ repair_nfds = 0;
+
+ if (sendmsg(c->fd_repair, &msg, 0) < 0) {
+ int ret = -errno;
+ err_perror("Failed to send sockets to TCP_REPAIR helper");
+ repair_close(c);
+ return ret;
+ }
+
+ if (recv(c->fd_repair, &((int8_t){ 0 }), 1, 0) < 0) {
+ int ret = -errno;
+ err_perror("Failed to receive reply from TCP_REPAIR helper");
+ repair_close(c);
+ return ret;
+ }
+
+ return 0;
+}
+
+/**
+ * repair_set() - Add socket to TCP_REPAIR set with given command
+ * @c: Execution context
+ * @s: Socket to add
+ * @cmd: TCP_REPAIR_ON, TCP_REPAIR_OFF, or TCP_REPAIR_OFF_NO_WP
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+/* cppcheck-suppress unusedFunction */
+int repair_set(struct ctx *c, int s, int cmd)
+{
+ int rc;
+
+ if (repair_nfds && repair_cmd != cmd) {
+ if ((rc = repair_flush(c)))
+ return rc;
+ }
+
+ repair_cmd = cmd;
+ repair_fds[repair_nfds++] = s;
+
+ if (repair_nfds >= SCM_MAX_FD) {
+ if ((rc = repair_flush(c)))
+ return rc;
+ }
+
+ return 0;
+}
diff --git a/repair.h b/repair.h
new file mode 100644
index 00000000..de279d60
--- /dev/null
+++ b/repair.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later
+ * Copyright (c) 2025 Red Hat GmbH
+ * Author: Stefano Brivio <sbrivio@redhat.com>
+ */
+
+#ifndef REPAIR_H
+#define REPAIR_H
+
+void repair_sock_init(const struct ctx *c);
+void repair_listen_handler(struct ctx *c, uint32_t events);
+void repair_handler(struct ctx *c, uint32_t events);
+void repair_close(struct ctx *c);
+int repair_flush(struct ctx *c);
+int repair_set(struct ctx *c, int s, int cmd);
+
+#endif /* REPAIR_H */
diff --git a/tap.c b/tap.c
index 8c92d231..d0673e58 100644
--- a/tap.c
+++ b/tap.c
@@ -56,6 +56,7 @@
#include "netlink.h"
#include "pasta.h"
#include "packet.h"
+#include "repair.h"
#include "tap.h"
#include "log.h"
#include "vhost_user.h"
@@ -1151,68 +1152,6 @@ void tap_handler_pasta(struct ctx *c, uint32_t events,
tap_pasta_input(c, now);
}
-/**
- * tap_sock_unix_open() - Create and bind AF_UNIX socket
- * @sock_path: Socket path. If empty, set on return (UNIX_SOCK_PATH as prefix)
- *
- * Return: socket descriptor on success, won't return on failure
- */
-int tap_sock_unix_open(char *sock_path)
-{
- int fd = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0);
- struct sockaddr_un addr = {
- .sun_family = AF_UNIX,
- };
- int i;
-
- if (fd < 0)
- die_perror("Failed to open UNIX domain socket");
-
- for (i = 1; i < UNIX_SOCK_MAX; i++) {
- char *path = addr.sun_path;
- int ex, ret;
-
- if (*sock_path)
- memcpy(path, sock_path, UNIX_PATH_MAX);
- else if (snprintf_check(path, UNIX_PATH_MAX - 1,
- UNIX_SOCK_PATH, i))
- die_perror("Can't build UNIX domain socket path");
-
- ex = socket(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK | SOCK_CLOEXEC,
- 0);
- if (ex < 0)
- die_perror("Failed to check for UNIX domain conflicts");
-
- ret = connect(ex, (const struct sockaddr *)&addr, sizeof(addr));
- if (!ret || (errno != ENOENT && errno != ECONNREFUSED &&
- errno != EACCES)) {
- if (*sock_path)
- die("Socket path %s already in use", path);
-
- close(ex);
- continue;
- }
- close(ex);
-
- unlink(path);
- ret = bind(fd, (const struct sockaddr *)&addr, sizeof(addr));
- if (*sock_path && ret)
- die_perror("Failed to bind UNIX domain socket");
-
- if (!ret)
- break;
- }
-
- if (i == UNIX_SOCK_MAX)
- die_perror("Failed to bind UNIX domain socket");
-
- info("UNIX domain socket bound at %s", addr.sun_path);
- if (!*sock_path)
- memcpy(sock_path, addr.sun_path, UNIX_PATH_MAX);
-
- return fd;
-}
-
/**
* tap_backend_show_hints() - Give help information to start QEMU
* @c: Execution context
@@ -1423,6 +1362,8 @@ void tap_backend_init(struct ctx *c)
tap_sock_tun_init(c);
break;
case MODE_VU:
+ repair_sock_init(c);
+ /* fall through */
case MODE_PASST:
tap_sock_unix_init(c);
diff --git a/util.c b/util.c
index 4d51e040..c3c54806 100644
--- a/util.c
+++ b/util.c
@@ -178,6 +178,68 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type,
return fd;
}
+/**
+ * sock_unix() - Create and bind AF_UNIX socket
+ * @sock_path: Socket path. If empty, set on return (UNIX_SOCK_PATH as prefix)
+ *
+ * Return: socket descriptor on success, won't return on failure
+ */
+int sock_unix(char *sock_path)
+{
+ int fd = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0);
+ struct sockaddr_un addr = {
+ .sun_family = AF_UNIX,
+ };
+ int i;
+
+ if (fd < 0)
+ die_perror("Failed to open UNIX domain socket");
+
+ for (i = 1; i < UNIX_SOCK_MAX; i++) {
+ char *path = addr.sun_path;
+ int ex, ret;
+
+ if (*sock_path)
+ memcpy(path, sock_path, UNIX_PATH_MAX);
+ else if (snprintf_check(path, UNIX_PATH_MAX - 1,
+ UNIX_SOCK_PATH, i))
+ die_perror("Can't build UNIX domain socket path");
+
+ ex = socket(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK | SOCK_CLOEXEC,
+ 0);
+ if (ex < 0)
+ die_perror("Failed to check for UNIX domain conflicts");
+
+ ret = connect(ex, (const struct sockaddr *)&addr, sizeof(addr));
+ if (!ret || (errno != ENOENT && errno != ECONNREFUSED &&
+ errno != EACCES)) {
+ if (*sock_path)
+ die("Socket path %s already in use", path);
+
+ close(ex);
+ continue;
+ }
+ close(ex);
+
+ unlink(path);
+ ret = bind(fd, (const struct sockaddr *)&addr, sizeof(addr));
+ if (*sock_path && ret)
+ die_perror("Failed to bind UNIX domain socket");
+
+ if (!ret)
+ break;
+ }
+
+ if (i == UNIX_SOCK_MAX)
+ die_perror("Failed to bind UNIX domain socket");
+
+ info("UNIX domain socket bound at %s", addr.sun_path);
+ if (!*sock_path)
+ memcpy(sock_path, addr.sun_path, UNIX_PATH_MAX);
+
+ return fd;
+}
+
/**
* sock_probe_mem() - Check if setting high SO_SNDBUF and SO_RCVBUF is allowed
* @c: Execution context
diff --git a/util.h b/util.h
index 255eb262..3dacb4d9 100644
--- a/util.h
+++ b/util.h
@@ -214,6 +214,7 @@ struct ctx;
int sock_l4_sa(const struct ctx *c, enum epoll_type type,
const void *sa, socklen_t sl,
const char *ifname, bool v6only, uint32_t data);
+int sock_unix(char *sock_path);
void sock_probe_mem(struct ctx *c);
long timespec_diff_ms(const struct timespec *a, const struct timespec *b);
int64_t timespec_diff_us(const struct timespec *a, const struct timespec *b);
--
@@ -214,6 +214,7 @@ struct ctx;
int sock_l4_sa(const struct ctx *c, enum epoll_type type,
const void *sa, socklen_t sl,
const char *ifname, bool v6only, uint32_t data);
+int sock_unix(char *sock_path);
void sock_probe_mem(struct ctx *c);
long timespec_diff_ms(const struct timespec *a, const struct timespec *b);
int64_t timespec_diff_us(const struct timespec *a, const struct timespec *b);
--
2.48.1
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [PATCH v10 06/10] vhost_user: Make source quit after reporting migration state
2025-02-06 5:49 [PATCH v10 00/10] Draft state migration David Gibson
` (4 preceding siblings ...)
2025-02-06 5:49 ` [PATCH v10 05/10] Add interfaces and configuration bits for passt-repair David Gibson
@ 2025-02-06 5:49 ` David Gibson
2025-02-06 5:49 ` [PATCH v10 07/10] migrate: Hack for late migration fixups David Gibson
` (3 subsequent siblings)
9 siblings, 0 replies; 12+ messages in thread
From: David Gibson @ 2025-02-06 5:49 UTC (permalink / raw)
To: Stefano Brivio, passt-dev; +Cc: David Gibson
From: Stefano Brivio <sbrivio@redhat.com>
On migration, the source process asks passt-repair to set TCP sockets
in repair mode, dumps the information we need to migrate connections,
and closes them.
At this point, we can't pass them back to passt-repair using
SCM_RIGHTS, because they are closed, from that perspective, and
sendmsg() will give us EBADF. But if we don't clear repair mode, the
port they are bound to will not be available for binding in the
target.
Terminate once we're done with the migration and we reported the
state. This is equivalent to clearing repair mode on the sockets we
just closed.
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Message-ID: <20250205003904.2797491-6-sbrivio@redhat.com>
Message-ID: <20250205230919.205302-4-sbrivio@redhat.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
vhost_user.c | 11 +++++++++++
1 file changed, 11 insertions(+)
diff --git a/vhost_user.c b/vhost_user.c
index 256c8ab6..9870a4f4 100644
--- a/vhost_user.c
+++ b/vhost_user.c
@@ -998,6 +998,8 @@ static bool vu_send_rarp_exec(struct vu_dev *vdev,
return false;
}
+static bool quit_on_device_state = false;
+
/**
* vu_set_device_state_fd_exec() - Set the device state migration channel
* @vdev: vhost-user device
@@ -1025,6 +1027,9 @@ static bool vu_set_device_state_fd_exec(struct vu_dev *vdev,
migrate_request(vdev->context, msg->fds[0],
direction == VHOST_USER_TRANSFER_STATE_DIRECTION_LOAD);
+ if (direction == VHOST_USER_TRANSFER_STATE_DIRECTION_SAVE)
+ quit_on_device_state = true;
+
/* We don't provide a new fd for the data transfer */
vmsg_set_reply_u64(msg, VHOST_USER_VRING_NOFD_MASK);
@@ -1203,4 +1208,10 @@ void vu_control_handler(struct vu_dev *vdev, int fd, uint32_t events)
if (reply_requested)
vu_send_reply(fd, &msg);
+
+ if (quit_on_device_state &&
+ msg.hdr.request == VHOST_USER_CHECK_DEVICE_STATE) {
+ info("Migration complete, exiting");
+ exit(EXIT_SUCCESS);
+ }
}
--
@@ -998,6 +998,8 @@ static bool vu_send_rarp_exec(struct vu_dev *vdev,
return false;
}
+static bool quit_on_device_state = false;
+
/**
* vu_set_device_state_fd_exec() - Set the device state migration channel
* @vdev: vhost-user device
@@ -1025,6 +1027,9 @@ static bool vu_set_device_state_fd_exec(struct vu_dev *vdev,
migrate_request(vdev->context, msg->fds[0],
direction == VHOST_USER_TRANSFER_STATE_DIRECTION_LOAD);
+ if (direction == VHOST_USER_TRANSFER_STATE_DIRECTION_SAVE)
+ quit_on_device_state = true;
+
/* We don't provide a new fd for the data transfer */
vmsg_set_reply_u64(msg, VHOST_USER_VRING_NOFD_MASK);
@@ -1203,4 +1208,10 @@ void vu_control_handler(struct vu_dev *vdev, int fd, uint32_t events)
if (reply_requested)
vu_send_reply(fd, &msg);
+
+ if (quit_on_device_state &&
+ msg.hdr.request == VHOST_USER_CHECK_DEVICE_STATE) {
+ info("Migration complete, exiting");
+ exit(EXIT_SUCCESS);
+ }
}
--
2.48.1
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [PATCH v10 07/10] migrate: Hack for late migration fixups
2025-02-06 5:49 [PATCH v10 00/10] Draft state migration David Gibson
` (5 preceding siblings ...)
2025-02-06 5:49 ` [PATCH v10 06/10] vhost_user: Make source quit after reporting migration state David Gibson
@ 2025-02-06 5:49 ` David Gibson
2025-02-06 5:49 ` [PATCH v10 08/10] migrate: Migrate TCP flows David Gibson
` (2 subsequent siblings)
9 siblings, 0 replies; 12+ messages in thread
From: David Gibson @ 2025-02-06 5:49 UTC (permalink / raw)
To: Stefano Brivio, passt-dev; +Cc: David Gibson
Abuse the VHOST_USER_SEND_RARP command to do operations that need to
happen very late on the target side migration. Nothing is in there now and
with some luck we'll be able to drop this completely before merge.
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Message-ID: <20250205230919.205302-5-sbrivio@redhat.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
migrate.c | 11 +++++++++++
migrate.h | 1 +
vhost_user.c | 3 +++
3 files changed, 15 insertions(+)
diff --git a/migrate.c b/migrate.c
index f93cff4c..9deee7ad 100644
--- a/migrate.c
+++ b/migrate.c
@@ -267,3 +267,14 @@ void migrate_handler(struct ctx *c)
c->device_state_result = rc;
}
+
+/**
+ * migrate_finish() - Hack to connect() migrated sockets from "RARP" trigger
+ * @c: Execution context
+ */
+void migrate_finish(struct ctx *c)
+{
+ (void)c;
+
+ /* HACK RARP: flow_migrate_target_post(c); */
+}
diff --git a/migrate.h b/migrate.h
index a5861120..5311f0e0 100644
--- a/migrate.h
+++ b/migrate.h
@@ -52,5 +52,6 @@ void migrate_init(struct ctx *c);
void migrate_close(struct ctx *c);
void migrate_request(struct ctx *c, int fd, bool target);
void migrate_handler(struct ctx *c);
+void migrate_finish(struct ctx *c);
#endif /* MIGRATE_H */
diff --git a/vhost_user.c b/vhost_user.c
index 9870a4f4..8ce22db7 100644
--- a/vhost_user.c
+++ b/vhost_user.c
@@ -995,6 +995,9 @@ static bool vu_send_rarp_exec(struct vu_dev *vdev,
eth_ntop((unsigned char *)&msg->payload.u64, macstr,
sizeof(macstr)));
+ /* Abuse this as trigger to finally connect() migrated sockets */
+ migrate_finish(vdev->context);
+
return false;
}
--
@@ -995,6 +995,9 @@ static bool vu_send_rarp_exec(struct vu_dev *vdev,
eth_ntop((unsigned char *)&msg->payload.u64, macstr,
sizeof(macstr)));
+ /* Abuse this as trigger to finally connect() migrated sockets */
+ migrate_finish(vdev->context);
+
return false;
}
--
2.48.1
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [PATCH v10 08/10] migrate: Migrate TCP flows
2025-02-06 5:49 [PATCH v10 00/10] Draft state migration David Gibson
` (6 preceding siblings ...)
2025-02-06 5:49 ` [PATCH v10 07/10] migrate: Hack for late migration fixups David Gibson
@ 2025-02-06 5:49 ` David Gibson
2025-02-06 5:49 ` [PATCH v10 09/10] fixup: Reset SO_PEEK_OFF value after incoming migration David Gibson
2025-02-06 5:49 ` [PATCH v10 10/10] test: Add migrate/basic tests David Gibson
9 siblings, 0 replies; 12+ messages in thread
From: David Gibson @ 2025-02-06 5:49 UTC (permalink / raw)
To: Stefano Brivio, passt-dev; +Cc: David Gibson
From: Stefano Brivio <sbrivio@redhat.com>
This implements flow preparation on the source, transfer of data with
a format roughly inspired by struct tcp_tap_conn, and flow insertion
on the target, with all the appropriate window options, window
scaling, MSS, etc.
The target side is rather convoluted because we first need to create
sockets and switch them to repair mode, before we can apply options
that are *not* stored in the flow table. However, we don't want to
request repair mode for sockets one by one. So we need to do this in
several steps.
A hack in order to connect() on the "RARP" message should be easy to
enable, I left a couple of comments in that sense.
This is very much draft quality, but I tested the whole flow, and it
works for me. Window parameters and MSS match, too.
[dwg: Assorted cleanups]
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Message-ID: <20250205230919.205302-6-sbrivio@redhat.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
flow.c | 201 +++++++++++++++++++++++
flow.h | 6 +
migrate.c | 11 ++
repair.c | 1 -
tcp.c | 461 +++++++++++++++++++++++++++++++++++++++++++++++++++++
tcp_conn.h | 60 +++++++
6 files changed, 739 insertions(+), 1 deletion(-)
diff --git a/flow.c b/flow.c
index a6fe6d1f..aa9bf419 100644
--- a/flow.c
+++ b/flow.c
@@ -19,6 +19,7 @@
#include "inany.h"
#include "flow.h"
#include "flow_table.h"
+#include "repair.h"
const char *flow_state_str[] = {
[FLOW_STATE_FREE] = "FREE",
@@ -874,6 +875,206 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now)
*last_next = FLOW_MAX;
}
+/**
+ * flow_migrate_source_pre_do() - Prepare/"unprepare" source flows for migration
+ * @c: Execution context
+ * @stage: Migration stage information (unused)
+ * @fd: Migration fd (unused)
+ * @rollback: If true, undo preparation
+ *
+ * Return: 0 on success, error code on failure
+ */
+static int flow_migrate_source_pre_do(struct ctx *c,
+ const struct migrate_stage *stage, int fd,
+ bool rollback)
+{
+ unsigned i, max_i;
+ int rc = 0; /* Tested on every iteration, even if no TCP flow was hit */
+
+ (void)stage;
+ (void)fd;
+
+ if (rollback) {
+ rc = 0;
+ i = FLOW_MAX;
+ goto rollback;
+ }
+
+ for (i = 0; i < FLOW_MAX; i++) { /* TODO: iterator with skip */
+ union flow *flow = &flowtab[i];
+
+ if (flow->f.state == FLOW_STATE_FREE)
+ i += flow->free.n - 1;
+ else if (flow->f.state == FLOW_STATE_ACTIVE &&
+ flow->f.type == FLOW_TCP)
+ rc = tcp_flow_repair_on(c, &flow->tcp);
+
+ if (rc) {
+ debug("Can't set repair mode for TCP flows, roll back");
+ goto rollback;
+ }
+ }
+
+ if ((rc = repair_flush(c))) { /* TODO: move to TCP logic */
+ debug("Can't set repair mode for TCP flows, roll back");
+ goto rollback;
+ }
+
+ return 0;
+
+rollback:
+ max_i = i;
+
+ for (i = 0; i < max_i; i++) { /* TODO: iterator with skip */
+ union flow *flow = &flowtab[i];
+
+ if (flow->f.state == FLOW_STATE_FREE)
+ i += flow->free.n - 1;
+ else if (flow->f.state == FLOW_STATE_ACTIVE &&
+ flow->f.type == FLOW_TCP)
+ tcp_flow_repair_off(c, &flow->tcp);
+ }
+
+ repair_flush(c);
+
+ return rc;
+}
+
+/**
+ * flow_migrate_source_pre() - Prepare source flows for migration
+ * @c: Execution context
+ * @stage: Migration stage information (unused)
+ * @fd: Migration fd (unused)
+ *
+ * Return: 0 on success, error code on failure (preparation already done is
+ * undone in that case)
+ */
+int flow_migrate_source_pre(struct ctx *c, const struct migrate_stage *stage,
+ int fd)
+{
+ return flow_migrate_source_pre_do(c, stage, fd, false);
+}
+
+/**
+ * flow_migrate_source() - Dump additional information and send data
+ * @c: Execution context
+ * @stage: Migration stage information (unused)
+ * @fd: Migration fd
+ *
+ * Return: 0 on success
+ */
+int flow_migrate_source(struct ctx *c, const struct migrate_stage *stage,
+ int fd)
+{
+ uint32_t count = 0;
+ unsigned i;
+ int rc;
+
+ for (i = 0; i < FLOW_MAX; i++) { /* TODO: iterator with skip */
+ union flow *flow = &flowtab[i];
+
+ if (flow->f.state == FLOW_STATE_FREE)
+ i += flow->free.n - 1;
+ else if (flow->f.state == FLOW_STATE_ACTIVE &&
+ flow->f.type == FLOW_TCP)
+ count++;
+ }
+
+ count = htonl(count);
+ rc = write_all_buf(fd, &count, sizeof(count));
+ if (rc) {
+ rc = errno;
+ err("Can't send flow count (%u): %s, abort",
+ ntohl(count), strerror_(errno));
+ return rc;
+ }
+ debug("Sending %u flows", ntohl(count));
+
+ /* Send information that can be stored in the flow table, first */
+ for (i = 0; i < FLOW_MAX; i++) { /* TODO: iterator with skip */
+ union flow *flow = &flowtab[i];
+
+ if (flow->f.state == FLOW_STATE_FREE) {
+ i += flow->free.n - 1;
+ } else if (flow->f.state == FLOW_STATE_ACTIVE &&
+ flow->f.type == FLOW_TCP) {
+ rc = tcp_flow_migrate_source(fd, &flow->tcp);
+ if (rc)
+ goto rollback;
+ }
+ /* TODO: other protocols */
+ }
+
+ /* And then "extended" data: the target needs to set repair mode on
+ * sockets before it can set this stuff, but it needs sockets (and
+ * flows) for that.
+ */
+ for (i = 0; i < FLOW_MAX; i++) { /* TODO: iterator with skip */
+ union flow *flow = &flowtab[i];
+
+ if (flow->f.state == FLOW_STATE_FREE) {
+ i += flow->free.n - 1;
+ } else if (flow->f.state == FLOW_STATE_ACTIVE &&
+ flow->f.type == FLOW_TCP) {
+ rc = tcp_flow_migrate_source_ext(fd, &flow->tcp);
+ if (rc)
+ goto rollback;
+ }
+ /* TODO: other protocols */
+ }
+
+ return 0;
+
+rollback:
+ flow_migrate_source_pre_do(c, stage, fd, true);
+ return rc;
+}
+
+/**
+ * flow_migrate_target() - Receive flows and insert in flow table
+ * @c: Execution context
+ * @stage: Migration stage information (unused)
+ * @fd: Migration fd
+ *
+ * Return: 0 on success
+ */
+int flow_migrate_target(struct ctx *c, const struct migrate_stage *stage,
+ int fd)
+{
+ uint32_t count;
+ unsigned i;
+ int rc;
+
+ (void)stage;
+
+ /* TODO: error handling */
+
+ if (read_all_buf(fd, &count, sizeof(count)))
+ return errno;
+
+ count = ntohl(count);
+ debug("Receiving %u flows", count);
+
+ /* TODO: flow header with type, instead? */
+ for (i = 0; i < count; i++) {
+ rc = tcp_flow_migrate_target(c, fd);
+ if (rc)
+ return rc;
+ }
+
+ repair_flush(c);
+
+ for (i = 0; i < count; i++) {
+ rc = tcp_flow_migrate_target_ext(c, flowtab + i, fd);
+ if (rc)
+ return rc;
+ }
+
+ repair_flush(c);
+
+ return 0;
+}
+
/**
* flow_init() - Initialise flow related data structures
*/
diff --git a/flow.h b/flow.h
index 24ba3ef0..a485c359 100644
--- a/flow.h
+++ b/flow.h
@@ -249,6 +249,12 @@ union flow;
void flow_init(void);
void flow_defer_handler(const struct ctx *c, const struct timespec *now);
+int flow_migrate_source_pre(struct ctx *c, const struct migrate_stage *stage,
+ int fd);
+int flow_migrate_source(struct ctx *c, const struct migrate_stage *stage,
+ int fd);
+int flow_migrate_target(struct ctx *c, const struct migrate_stage *stage,
+ int fd);
void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...)
__attribute__((format(printf, 3, 4)));
diff --git a/migrate.c b/migrate.c
index 9deee7ad..d091e90d 100644
--- a/migrate.c
+++ b/migrate.c
@@ -80,6 +80,16 @@ static const struct migrate_stage stages_v1[] = {
.source = seen_addrs_source_v1,
.target = seen_addrs_target_v1,
},
+ {
+ .name = "flow pre",
+ .source = flow_migrate_source_pre,
+ .target = NULL,
+ },
+ {
+ .name = "flow",
+ .source = flow_migrate_source,
+ .target = flow_migrate_target,
+ },
{ 0 },
};
@@ -208,6 +218,7 @@ static int migrate_target(struct ctx *c, int fd)
void migrate_init(struct ctx *c)
{
c->device_state_result = -1;
+ repair_sock_init(c);
}
/**
diff --git a/repair.c b/repair.c
index 61519279..b170f400 100644
--- a/repair.c
+++ b/repair.c
@@ -171,7 +171,6 @@ int repair_flush(struct ctx *c)
*
* Return: 0 on success, negative error code on failure
*/
-/* cppcheck-suppress unusedFunction */
int repair_set(struct ctx *c, int s, int cmd)
{
int rc;
diff --git a/tcp.c b/tcp.c
index af6bd95a..885ba3af 100644
--- a/tcp.c
+++ b/tcp.c
@@ -299,6 +299,7 @@
#include "log.h"
#include "inany.h"
#include "flow.h"
+#include "repair.h"
#include "linux_dep.h"
#include "flow_table.h"
@@ -326,6 +327,11 @@
((conn)->events & (SOCK_FIN_RCVD | TAP_FIN_RCVD)))
#define CONN_HAS(conn, set) (((conn)->events & (set)) == (set))
+/* Upper bounds (16 MiB each) for socket queue contents staged for migration */
+#define TCP_MIGRATE_SND_QUEUE_MAX (16 << 20)
+#define TCP_MIGRATE_RCV_QUEUE_MAX (16 << 20)
+/* Staging buffers for the send and receive queue data of a single flow: they
+ * are filled and consumed by tcp_flow_migrate_source_ext() and
+ * tcp_flow_migrate_target_ext(), so their contents are only valid within one
+ * call of those functions.
+ */
+uint8_t tcp_migrate_snd_queue[TCP_MIGRATE_SND_QUEUE_MAX];
+uint8_t tcp_migrate_rcv_queue[TCP_MIGRATE_RCV_QUEUE_MAX];
+
static const char *tcp_event_str[] __attribute((__unused__)) = {
"SOCK_ACCEPTED", "TAP_SYN_RCVD", "ESTABLISHED", "TAP_SYN_ACK_SENT",
@@ -2645,3 +2651,458 @@ void tcp_timer(struct ctx *c, const struct timespec *now)
if (c->mode == MODE_PASTA)
tcp_splice_refill(c);
}
+
+/**
+ * tcp_flow_repair_on() - Request repair mode for the socket of one TCP flow
+ * @c:		Execution context
+ * @conn:	TCP connection whose socket should enter repair mode
+ *
+ * Return: 0 on success, otherwise the error returned by repair_set()
+ */
+int tcp_flow_repair_on(struct ctx *c, const struct tcp_tap_conn *conn)
+{
+	int rc = repair_set(c, conn->sock, TCP_REPAIR_ON);
+
+	if (rc)
+		err("Failed to set TCP_REPAIR");
+
+	return rc;
+}
+
+/**
+ * tcp_flow_repair_off() - Request leaving repair mode for one TCP flow's socket
+ * @c:		Execution context
+ * @conn:	TCP connection whose socket should leave repair mode
+ *
+ * Return: 0 on success, otherwise the error returned by repair_set()
+ */
+int tcp_flow_repair_off(struct ctx *c, const struct tcp_tap_conn *conn)
+{
+	int rc = repair_set(c, conn->sock, TCP_REPAIR_OFF);
+
+	if (rc)
+		err("Failed to clear TCP_REPAIR");
+
+	return rc;
+}
+
+/**
+ * tcp_flow_repair_queue_fill() - Write data into the currently selected queue
+ * @s:		Socket, with a queue already selected via TCP_REPAIR_QUEUE
+ * @buf:	Data to queue
+ * @len:	Number of bytes to queue, host order
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int tcp_flow_repair_queue_fill(int s, const uint8_t *buf, size_t len)
+{
+	while (len > 0) {
+		ssize_t rc = send(s, buf, len, 0);
+
+		if (rc < 0)
+			return -errno;
+
+		buf += rc;
+		len -= rc;
+	}
+
+	return 0;
+}
+
+/**
+ * tcp_flow_repair_queues() - Dump or set sequences, read or write socket queues
+ * @s:		Socket
+ * @snd_seq:	Send sequence, network order: input if @set, output otherwise
+ * @snd_buf:	Send queue buffer, read or written depending on @set
+ * @snd_len:	Length of send queue data, network order: input if @set,
+ *		output otherwise
+ * @rcv_seq:	Receive sequence, network order: input if @set, output otherwise
+ * @rcv_buf:	Receive queue buffer, read or written depending on @set
+ * @rcv_len:	Length of receive queue data, network order: input if @set,
+ *		output otherwise
+ * @set:	Set if true, dump if false
+ *
+ * If @set is true, sequences and lengths are only read: they are converted to
+ * host order in local copies, and the caller's values are left untouched.
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int tcp_flow_repair_queues(int s,
+				  uint32_t *snd_seq, uint8_t *snd_buf,
+				  uint32_t *snd_len,
+				  uint32_t *rcv_seq, uint8_t *rcv_buf,
+				  uint32_t *rcv_len, bool set)
+{
+	socklen_t vlen = sizeof(uint32_t);
+	int v, rc;
+
+	/* Select the send queue: the following calls all refer to it */
+	v = TCP_SEND_QUEUE;
+	if (setsockopt(s, SOL_TCP, TCP_REPAIR_QUEUE, &v, sizeof(v)))
+		return -errno;
+
+	if (set) {
+		/* Wire format is network order, the kernel wants host order */
+		uint32_t seq = ntohl(*snd_seq), len = ntohl(*snd_len);
+
+		if (setsockopt(s, SOL_TCP, TCP_QUEUE_SEQ, &seq, vlen))
+			return -errno;
+		debug("Set sending sequence for socket %i to %u", s, seq);
+
+		debug("Writing socket %i send queue: %u bytes", s, len);
+		if ((rc = tcp_flow_repair_queue_fill(s, snd_buf, len)))
+			return rc;
+	} else {
+		ssize_t n;
+
+		if (getsockopt(s, SOL_TCP, TCP_QUEUE_SEQ, snd_seq, &vlen))
+			return -errno;
+		debug("Dumped sending sequence for socket %i: %u", s, *snd_seq);
+		*snd_seq = htonl(*snd_seq);
+
+		n = recv(s, snd_buf, TCP_MIGRATE_SND_QUEUE_MAX, MSG_PEEK);
+		if (n < 0 && errno != EAGAIN) { /* FIXME: EAGAIN expected? */
+			rc = -errno;	/* Logging might change errno */
+			err_perror("Can't read send queue for socket %i", s);
+			return rc;
+		}
+
+		/* EAGAIN is treated as an empty queue */
+		*snd_len = htonl((n < 0) ? 0 : n);
+		debug("Read socket %i send queue: %zi bytes", s, n);
+	}
+
+	/* Same again, for the receive queue */
+	v = TCP_RECV_QUEUE;
+	if (setsockopt(s, SOL_TCP, TCP_REPAIR_QUEUE, &v, sizeof(v)))
+		return -errno;
+
+	if (set) {
+		uint32_t seq = ntohl(*rcv_seq), len = ntohl(*rcv_len);
+
+		if (setsockopt(s, SOL_TCP, TCP_QUEUE_SEQ, &seq, vlen))
+			return -errno;
+		debug("Set receive sequence for socket %i to %u", s, seq);
+
+		debug("Writing socket %i receive queue: %u bytes", s, len);
+		if ((rc = tcp_flow_repair_queue_fill(s, rcv_buf, len)))
+			return rc;
+	} else {
+		ssize_t n;
+
+		if (getsockopt(s, SOL_TCP, TCP_QUEUE_SEQ, rcv_seq, &vlen))
+			return -errno;
+		debug("Dumped receive sequence for socket %i: %u", s, *rcv_seq);
+		*rcv_seq = htonl(*rcv_seq);
+
+		n = recv(s, rcv_buf, TCP_MIGRATE_RCV_QUEUE_MAX, MSG_PEEK);
+		if (n < 0 && errno != EAGAIN) { /* FIXME: EAGAIN expected? */
+			rc = -errno;	/* Logging might change errno */
+			err_perror("Can't read receive queue for socket %i", s);
+			return rc;
+		}
+
+		*rcv_len = htonl((n < 0) ? 0 : n);
+		debug("Read socket %i receive queue: %zi bytes", s, n);
+	}
+
+	return 0;
+}
+
+/**
+ * tcp_flow_repair_opt() - Dump or set repair "options" (MSS and window scale)
+ * @s:			Socket
+ * @ws_to_sock:		Window scaling factor, tcpi_snd_wscale on the socket
+ * @ws_from_sock:	Window scaling factor, tcpi_rcv_wscale on the socket
+ * @mss:		Maximum Segment Size, socket side, network order
+ * @set:		Set if true, dump if false
+ *
+ * On failure to dump, @ws_to_sock, @ws_from_sock and @mss are left untouched.
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+int tcp_flow_repair_opt(int s, uint8_t *ws_to_sock, uint8_t *ws_from_sock,
+			uint32_t *mss, bool set)
+{
+	if (set) {
+		struct tcp_repair_opt opts[2];
+
+		/* Both scaling factors share one value: ours in the lower 16
+		 * bits, the peer's in the upper 16 bits
+		 */
+		opts[0].opt_code = TCPOPT_WINDOW;
+		opts[0].opt_val = *ws_to_sock + (*ws_from_sock << 16);
+
+		opts[1].opt_code = TCPOPT_MAXSEG;
+		opts[1].opt_val = ntohl(*mss);
+
+		if (setsockopt(s, SOL_TCP, TCP_REPAIR_OPTIONS,
+			       opts, sizeof(opts)))
+			return -errno;
+	} else {
+		struct tcp_info_linux tinfo;
+		socklen_t sl = sizeof(tinfo);
+
+		/* Don't report values from an unfilled tinfo on failure */
+		if (getsockopt(s, SOL_TCP, TCP_INFO, &tinfo, &sl))
+			return -errno;
+
+		*ws_to_sock = tinfo.tcpi_snd_wscale;
+		*ws_from_sock = tinfo.tcpi_rcv_wscale;
+		*mss = htonl(tinfo.tcpi_snd_mss);
+	}
+
+	return 0;
+}
+
+/**
+ * tcp_flow_repair_wnd() - Dump or set window parameters
+ * @s:		Socket
+ * @snd_wl1:	See struct tcp_repair_window, network order
+ * @snd_wnd:	Socket-side sending window, network order
+ * @max_window:	Window clamp, network order
+ * @rcv_wnd:	Socket-side receive window, network order
+ * @rcv_wup:	See struct tcp_repair_window, network order
+ * @set:	Set if true, dump if false
+ *
+ * On failure to dump, none of the output parameters is written.
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+int tcp_flow_repair_wnd(int s, uint32_t *snd_wl1, uint32_t *snd_wnd,
+			uint32_t *max_window, uint32_t *rcv_wnd,
+			uint32_t *rcv_wup, bool set)
+{
+	struct tcp_repair_window wnd;
+	socklen_t sl = sizeof(wnd);
+
+	if (set) {
+		wnd.snd_wl1 = ntohl(*snd_wl1);
+		wnd.snd_wnd = ntohl(*snd_wnd);
+		wnd.max_window = ntohl(*max_window);
+		wnd.rcv_wnd = ntohl(*rcv_wnd);
+		wnd.rcv_wup = ntohl(*rcv_wup);
+
+		if (setsockopt(s, IPPROTO_TCP, TCP_REPAIR_WINDOW, &wnd, sl))
+			return -errno;
+
+		return 0;
+	}
+
+	/* Don't report values from an unfilled wnd on failure */
+	if (getsockopt(s, IPPROTO_TCP, TCP_REPAIR_WINDOW, &wnd, &sl))
+		return -errno;
+
+	*snd_wl1 = htonl(wnd.snd_wl1);
+	*snd_wnd = htonl(wnd.snd_wnd);
+	*max_window = htonl(wnd.max_window);
+	*rcv_wnd = htonl(wnd.rcv_wnd);
+	*rcv_wup = htonl(wnd.rcv_wup);
+
+	return 0;
+}
+
+/**
+ * tcp_flow_migrate_source() - Send data (flow table part) for a single flow
+ * @fd: Descriptor for state migration
+ * @conn: Pointer to the TCP connection structure
+ *
+ * Multi-byte scalar fields are converted to network order. The pif and side
+ * arrays are copied as they are, without any conversion.
+ *
+ * Return: 0 on success, the (positive) value of errno if writing fails
+ */
+int tcp_flow_migrate_source(int fd, const struct tcp_tap_conn *conn)
+{
+ struct tcp_tap_transfer t = {
+ .retrans = conn->retrans,
+ .ws_from_tap = conn->ws_from_tap,
+ .ws_to_tap = conn->ws_to_tap,
+ .events = conn->events,
+
+ .tap_mss = htonl(MSS_GET(conn)),
+
+ .sndbuf = htonl(conn->sndbuf),
+
+ .flags = conn->flags,
+ .seq_dup_ack_approx = conn->seq_dup_ack_approx,
+
+ .wnd_from_tap = htons(conn->wnd_from_tap),
+ .wnd_to_tap = htons(conn->wnd_to_tap),
+
+ .seq_to_tap = htonl(conn->seq_to_tap),
+ .seq_ack_from_tap = htonl(conn->seq_ack_from_tap),
+ .seq_from_tap = htonl(conn->seq_from_tap),
+ .seq_ack_to_tap = htonl(conn->seq_ack_to_tap),
+ .seq_init_from_tap = htonl(conn->seq_init_from_tap),
+ };
+
+ /* Generic flow information, stored in the common part of the flow */
+ memcpy(&t.pif, conn->f.pif, sizeof(t.pif));
+ memcpy(&t.side, conn->f.side, sizeof(t.side));
+
+ /* NOTE(review): this assumes write_all_buf() sets errno on failure, and
+ * the positive return differs from the negative error codes used by the
+ * repair helpers - confirm the convention expected by the caller
+ */
+ if (write_all_buf(fd, &t, sizeof(t)))
+ return errno;
+
+ return 0;
+}
+
+/**
+ * tcp_flow_migrate_source_ext() - Send extended data for a single flow
+ * @fd:		Descriptor for state migration
+ * @conn:	Pointer to the TCP connection structure
+ *
+ * Dump sequences, queues, options and window parameters from the socket, then
+ * write struct tcp_tap_transfer_ext, followed by the send queue and by the
+ * receive queue contents.
+ *
+ * Return: 0 on success, negative error code if dumping socket state fails,
+ *	   the (positive) value of errno if writing fails
+ */
+int tcp_flow_migrate_source_ext(int fd, const struct tcp_tap_conn *conn)
+{
+	/* Zero-initialise: the structure is written out as a whole, including
+	 * its trailing padding, and stack contents shouldn't leak to the peer
+	 */
+	struct tcp_tap_transfer_ext t = { 0 };
+	int s = conn->sock;
+	int rc;
+
+	rc = tcp_flow_repair_queues(s,
+				    &t.sock_seq_snd, tcp_migrate_snd_queue,
+				    &t.sndlen,
+				    &t.sock_seq_rcv, tcp_migrate_rcv_queue,
+				    &t.rcvlen, false);
+	if (rc)
+		return rc;
+
+	rc = tcp_flow_repair_opt(s, &t.ws_to_sock, &t.ws_from_sock,
+				 &t.sock_mss, false);
+	if (rc)
+		return rc;
+
+	rc = tcp_flow_repair_wnd(s, &t.sock_snd_wl1, &t.sock_snd_wnd,
+				 &t.sock_max_window, &t.sock_rcv_wnd,
+				 &t.sock_rcv_wup, false);
+	if (rc)
+		return rc;
+
+	if (write_all_buf(fd, &t, sizeof(t)))
+		return errno;
+
+	/* Queue lengths were stored in network order by the dump above */
+	if (write_all_buf(fd, tcp_migrate_snd_queue, ntohl(t.sndlen)))
+		return errno;
+
+	if (write_all_buf(fd, tcp_migrate_rcv_queue, ntohl(t.rcvlen)))
+		return errno;
+
+	return 0;
+}
+
+/**
+ * tcp_flow_repair_socket() - Open and bind socket, request repair mode
+ * @c:		Execution context
+ * @conn:	Pointer to the TCP connection structure
+ *
+ * The socket is bound to the wildcard address of the connection's address
+ * family, and to the port of the host side of the flow. On failure, no socket
+ * is left open.
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+int tcp_flow_repair_socket(struct ctx *c, struct tcp_tap_conn *conn)
+{
+	sa_family_t af = CONN_V4(conn) ? AF_INET : AF_INET6;
+	const struct flowside *sockside = HOSTFLOW(conn);
+	struct sockaddr_in6 a6 = { .sin6_family = AF_INET6,
+				   .sin6_port = htons(sockside->oport) };
+	struct sockaddr_in a4 = { .sin_family = AF_INET,
+				  .sin_port = htons(sockside->oport) };
+	const struct sockaddr *sa;
+	socklen_t sl;
+	int rc;
+
+	/* bind() needs an address structure matching the socket family: a
+	 * struct sockaddr_in is too short for an AF_INET6 socket
+	 */
+	if (af == AF_INET) {
+		sa = (const struct sockaddr *)&a4;
+		sl = sizeof(a4);
+	} else {
+		sa = (const struct sockaddr *)&a6;
+		sl = sizeof(a6);
+	}
+
+	if ((conn->sock = socket(af, SOCK_STREAM, IPPROTO_TCP)) < 0)
+		return -errno;
+
+	/* On the same host, source socket can be in TIME_WAIT */
+	setsockopt(conn->sock, SOL_SOCKET, SO_REUSEADDR,
+		   &((int){ 1 }), sizeof(int));
+
+	/* TODO: switch to tcp_bind_outbound(c, conn, conn->sock); ...? */
+	if (bind(conn->sock, sa, sl) < 0) {
+		rc = -errno;	/* close() might change errno */
+		close(conn->sock);
+		conn->sock = -1;
+		return rc;
+	}
+
+	rc = tcp_flow_repair_on(c, conn);
+	if (rc) {
+		close(conn->sock);
+		conn->sock = -1;
+		return rc;
+	}
+
+	return 0;
+}
+
+/**
+ * tcp_flow_repair_connect() - Connect socket in repair mode, add it to epoll
+ * @c: Execution context
+ * @conn: Pointer to the TCP connection structure
+ *
+ * Repair mode is not switched off here: the call that would do it is commented
+ * out at the end of the function, and tcp_flow_migrate_target_ext() calls
+ * tcp_flow_repair_off() itself.
+ *
+ * Return: 0 on success, return value of flowside_connect() on failure
+ */
+static int tcp_flow_repair_connect(const struct ctx *c, struct tcp_tap_conn *conn)
+{
+ const struct flowside *tgt = &conn->f.side[TGTSIDE];
+ int rc;
+
+ rc = flowside_connect(c, conn->sock, PIF_HOST, tgt);
+ if (rc) {
+ /* NOTE(review): assumes flowside_connect() leaves errno set */
+ err("Failed to connect repaired socket: %s", strerror_(errno));
+ return rc;
+ }
+
+ /* Reset epoll and timer state before registering the socket */
+ conn->in_epoll = 0;
+ conn->timer = -1;
+ tcp_epoll_ctl(c, conn);
+
+ return 0;
+
+ /* Not reached, kept for reference only: */
+ /* HACK RARP: return tcp_flow_repair_off(c, conn); */
+}
+
+/**
+ * tcp_flow_migrate_target() - Receive data (flow table part) for flow, insert
+ * @c:		Execution context
+ * @fd:		Descriptor for state migration
+ *
+ * Read one struct tcp_tap_transfer from @fd, allocate a flow table entry for
+ * it, convert fields back to host order, open a socket in repair mode and
+ * activate the flow.
+ *
+ * Return: 0 on success, -ENOMEM if the flow table is full, the (positive)
+ *	   value of errno if reading fails
+ */
+int tcp_flow_migrate_target(struct ctx *c, int fd)
+{
+	struct tcp_tap_transfer t;
+	struct tcp_tap_conn *conn;
+	union flow *flow;
+	int rc;
+
+	/* Read first, so that a failed read doesn't leave behind an allocated
+	 * flow table entry that's never completed
+	 */
+	if (read_all_buf(fd, &t, sizeof(t)))
+		return errno;
+
+	if (!(flow = flow_alloc()))
+		return -ENOMEM;
+
+	flow->f.state = FLOW_STATE_TGT;
+	memcpy(&flow->f.pif, &t.pif, sizeof(flow->f.pif));
+	memcpy(&flow->f.side, &t.side, sizeof(flow->f.side));
+	conn = FLOW_SET_TYPE(flow, FLOW_TCP, tcp);
+
+	conn->retrans = t.retrans;
+	conn->ws_from_tap = t.ws_from_tap;
+	conn->ws_to_tap = t.ws_to_tap;
+	conn->events = t.events;
+
+	/* Wire format is network order: convert back to host order */
+	conn->sndbuf = ntohl(t.sndbuf);
+
+	conn->flags = t.flags;
+	conn->seq_dup_ack_approx = t.seq_dup_ack_approx;
+
+	MSS_SET(conn, ntohl(t.tap_mss));
+
+	conn->wnd_from_tap = ntohs(t.wnd_from_tap);
+	conn->wnd_to_tap = ntohs(t.wnd_to_tap);
+
+	conn->seq_to_tap = ntohl(t.seq_to_tap);
+	conn->seq_ack_from_tap = ntohl(t.seq_ack_from_tap);
+	conn->seq_from_tap = ntohl(t.seq_from_tap);
+	conn->seq_ack_to_tap = ntohl(t.seq_ack_to_tap);
+	conn->seq_init_from_tap = ntohl(t.seq_init_from_tap);
+
+	/* NOTE(review): on failure the flow is still inserted, with no usable
+	 * socket, as before: at least report it. Dropping the flow here needs
+	 * a decision on how the extended data for it should then be skipped.
+	 */
+	if ((rc = tcp_flow_repair_socket(c, conn)))
+		err("Failed to set up socket for incoming flow: %s",
+		    strerror_(-rc));
+
+	flow_hash_insert(c, TAP_SIDX(conn));
+	FLOW_ACTIVATE(conn);
+
+	return 0;
+}
+
+/**
+ * tcp_flow_migrate_target_ext() - Receive extended data for flow, set, connect
+ * @c:		Execution context
+ * @flow:	Existing flow for this connection data
+ * @fd:		Descriptor for state migration
+ *
+ * Read struct tcp_tap_transfer_ext and the queue contents following it, then
+ * restore sequences and queues, connect the socket, restore options and window
+ * parameters, and finally ask to switch repair mode off.
+ *
+ * Return: 0 on success, negative error code if the received data is invalid
+ *	   or can't be applied to the socket, the (positive) value of errno if
+ *	   reading fails
+ */
+int tcp_flow_migrate_target_ext(struct ctx *c, union flow *flow, int fd)
+{
+	struct tcp_tap_conn *conn = &flow->tcp;
+	struct tcp_tap_transfer_ext t;
+	int s = conn->sock;
+	int rc;
+
+	if (read_all_buf(fd, &t, sizeof(t)))
+		return errno;
+
+	/* Lengths come from the peer: never read more than the static staging
+	 * buffers can hold
+	 */
+	if (ntohl(t.sndlen) > TCP_MIGRATE_SND_QUEUE_MAX ||
+	    ntohl(t.rcvlen) > TCP_MIGRATE_RCV_QUEUE_MAX) {
+		err("Invalid queue sizes in migration data: send %u, receive %u",
+		    ntohl(t.sndlen), ntohl(t.rcvlen));
+		return -EINVAL;
+	}
+
+	if (read_all_buf(fd, tcp_migrate_snd_queue, ntohl(t.sndlen)))
+		return errno;
+
+	if (read_all_buf(fd, tcp_migrate_rcv_queue, ntohl(t.rcvlen)))
+		return errno;
+
+	rc = tcp_flow_repair_queues(s,
+				    &t.sock_seq_snd, tcp_migrate_snd_queue,
+				    &t.sndlen,
+				    &t.sock_seq_rcv, tcp_migrate_rcv_queue,
+				    &t.rcvlen, true);
+	if (rc) {
+		err("Failed to restore queues for socket %i: %s",
+		    s, strerror_(-rc));
+		return rc;
+	}
+
+	/* Logs failures on its own */
+	if ((rc = tcp_flow_repair_connect(c, conn)))
+		return rc;
+
+	rc = tcp_flow_repair_opt(s, &t.ws_to_sock, &t.ws_from_sock,
+				 &t.sock_mss, true);
+	if (rc) {
+		err("Failed to restore options for socket %i: %s",
+		    s, strerror_(-rc));
+		return rc;
+	}
+
+	rc = tcp_flow_repair_wnd(s, &t.sock_snd_wl1, &t.sock_snd_wnd,
+				 &t.sock_max_window, &t.sock_rcv_wnd,
+				 &t.sock_rcv_wup, true);
+	if (rc) {
+		err("Failed to restore window for socket %i: %s",
+		    s, strerror_(-rc));
+		return rc;
+	}
+
+	tcp_flow_repair_off(c, conn);
+
+	return 0;
+}
diff --git a/tcp_conn.h b/tcp_conn.h
index d3426808..aba8e914 100644
--- a/tcp_conn.h
+++ b/tcp_conn.h
@@ -96,6 +96,60 @@ struct tcp_tap_conn {
uint32_t seq_init_from_tap;
};
+/**
+ * struct tcp_tap_transfer - TCP data to migrate (flow table part only)
+ * @pif: Copy of the pif array from the common part of the flow
+ * @side: Copy of the side array from the common part of the flow
+ * @retrans: Copy of the connection's retrans field
+ * @ws_from_tap: Copy of the connection's ws_from_tap field
+ * @ws_to_tap: Copy of the connection's ws_to_tap field
+ * @events: Copy of the connection's events field
+ * @tap_mss: Value of MSS_GET() for the connection, network order
+ * @sndbuf: Connection's sndbuf field, network order
+ * @flags: Copy of the connection's flags field
+ * @seq_dup_ack_approx: Copy of the connection's seq_dup_ack_approx field
+ * @wnd_from_tap: Connection's wnd_from_tap field, network order
+ * @wnd_to_tap: Connection's wnd_to_tap field, network order
+ * @seq_to_tap: Connection's seq_to_tap field, network order
+ * @seq_ack_from_tap: Connection's seq_ack_from_tap field, network order
+ * @seq_from_tap: Connection's seq_from_tap field, network order
+ * @seq_ack_to_tap: Connection's seq_ack_to_tap field, network order
+ * @seq_init_from_tap: Connection's seq_init_from_tap field, network order
+ *
+ * This is written to and read from the migration stream as it is: changing
+ * the order or size of fields changes the stream format.
+ */
+struct tcp_tap_transfer {
+ uint8_t pif[SIDES];
+ struct flowside side[SIDES];
+
+ uint8_t retrans;
+ uint8_t ws_from_tap;
+ uint8_t ws_to_tap;
+ uint8_t events;
+
+ uint32_t tap_mss;
+
+ uint32_t sndbuf;
+
+ uint8_t flags;
+ uint8_t seq_dup_ack_approx;
+
+ uint16_t wnd_from_tap;
+ uint16_t wnd_to_tap;
+
+ uint32_t seq_to_tap;
+ uint32_t seq_ack_from_tap;
+ uint32_t seq_from_tap;
+ uint32_t seq_ack_to_tap;
+ uint32_t seq_init_from_tap;
+} __attribute__((packed, aligned(__alignof__(uint32_t))));
+
+/**
+ * struct tcp_tap_transfer_ext - TCP data to migrate (not stored in flow table)
+ * @sock_seq_snd: TCP_QUEUE_SEQ value for the send queue, network order
+ * @sock_seq_rcv: TCP_QUEUE_SEQ value for the receive queue, network order
+ * @sndlen: Length of send queue data following in stream, network order
+ * @rcvlen: Length of receive queue data following in stream, network order
+ * @sock_mss: tcpi_snd_mss reported by the socket, network order
+ * @sock_snd_wl1: snd_wl1 from struct tcp_repair_window, network order
+ * @sock_snd_wnd: snd_wnd from struct tcp_repair_window, network order
+ * @sock_max_window: max_window from struct tcp_repair_window, network order
+ * @sock_rcv_wnd: rcv_wnd from struct tcp_repair_window, network order
+ * @sock_rcv_wup: rcv_wup from struct tcp_repair_window, network order
+ * @ws_to_sock: tcpi_snd_wscale reported by the socket
+ * @ws_from_sock: tcpi_rcv_wscale reported by the socket
+ *
+ * In the migration stream, this is followed by @sndlen bytes of send queue
+ * data, then by @rcvlen bytes of receive queue data.
+ */
+struct tcp_tap_transfer_ext {
+ uint32_t sock_seq_snd;
+ uint32_t sock_seq_rcv;
+
+ uint32_t sndlen;
+ uint32_t rcvlen;
+
+ uint32_t sock_mss;
+
+ /* We can't just use struct tcp_repair_window: we need network order */
+ uint32_t sock_snd_wl1;
+ uint32_t sock_snd_wnd;
+ uint32_t sock_max_window;
+ uint32_t sock_rcv_wnd;
+ uint32_t sock_rcv_wup;
+
+ uint8_t ws_to_sock;
+ uint8_t ws_from_sock;
+} __attribute__((packed, aligned(__alignof__(uint32_t))));
+
/**
* struct tcp_splice_conn - Descriptor for a spliced TCP connection
* @f: Generic flow information
@@ -140,6 +194,12 @@ extern int init_sock_pool4 [TCP_SOCK_POOL_SIZE];
extern int init_sock_pool6 [TCP_SOCK_POOL_SIZE];
bool tcp_flow_defer(const struct tcp_tap_conn *conn);
+int tcp_flow_repair_on(struct ctx *c, const struct tcp_tap_conn *conn);
+int tcp_flow_repair_off(struct ctx *c, const struct tcp_tap_conn *conn);
+int tcp_flow_migrate_source(int fd, const struct tcp_tap_conn *conn);
+int tcp_flow_migrate_source_ext(int fd, const struct tcp_tap_conn *conn);
+int tcp_flow_migrate_target(struct ctx *c, int fd);
+int tcp_flow_migrate_target_ext(struct ctx *c, union flow *flow, int fd);
bool tcp_splice_flow_defer(struct tcp_splice_conn *conn);
void tcp_splice_timer(const struct ctx *c, struct tcp_splice_conn *conn);
int tcp_conn_pool_sock(int pool[]);
--
@@ -96,6 +96,60 @@ struct tcp_tap_conn {
uint32_t seq_init_from_tap;
};
+/**
+ * struct tcp_tap_transfer - TCP data to migrate (flow table part only)
+ * TODO
+ */
+struct tcp_tap_transfer {
+ uint8_t pif[SIDES];
+ struct flowside side[SIDES];
+
+ uint8_t retrans;
+ uint8_t ws_from_tap;
+ uint8_t ws_to_tap;
+ uint8_t events;
+
+ uint32_t tap_mss;
+
+ uint32_t sndbuf;
+
+ uint8_t flags;
+ uint8_t seq_dup_ack_approx;
+
+ uint16_t wnd_from_tap;
+ uint16_t wnd_to_tap;
+
+ uint32_t seq_to_tap;
+ uint32_t seq_ack_from_tap;
+ uint32_t seq_from_tap;
+ uint32_t seq_ack_to_tap;
+ uint32_t seq_init_from_tap;
+} __attribute__((packed, aligned(__alignof__(uint32_t))));
+
+/**
+ * struct tcp_tap_transfer_ext - TCP data to migrate (not stored in flow table)
+ * TODO
+ */
+struct tcp_tap_transfer_ext {
+ uint32_t sock_seq_snd;
+ uint32_t sock_seq_rcv;
+
+ uint32_t sndlen;
+ uint32_t rcvlen;
+
+ uint32_t sock_mss;
+
+ /* We can't just use struct tcp_repair_window: we need network order */
+ uint32_t sock_snd_wl1;
+ uint32_t sock_snd_wnd;
+ uint32_t sock_max_window;
+ uint32_t sock_rcv_wnd;
+ uint32_t sock_rcv_wup;
+
+ uint8_t ws_to_sock;
+ uint8_t ws_from_sock;
+} __attribute__((packed, aligned(__alignof__(uint32_t))));
+
/**
* struct tcp_splice_conn - Descriptor for a spliced TCP connection
* @f: Generic flow information
@@ -140,6 +194,12 @@ extern int init_sock_pool4 [TCP_SOCK_POOL_SIZE];
extern int init_sock_pool6 [TCP_SOCK_POOL_SIZE];
bool tcp_flow_defer(const struct tcp_tap_conn *conn);
+int tcp_flow_repair_on(struct ctx *c, const struct tcp_tap_conn *conn);
+int tcp_flow_repair_off(struct ctx *c, const struct tcp_tap_conn *conn);
+int tcp_flow_migrate_source(int fd, const struct tcp_tap_conn *conn);
+int tcp_flow_migrate_source_ext(int fd, const struct tcp_tap_conn *conn);
+int tcp_flow_migrate_target(struct ctx *c, int fd);
+int tcp_flow_migrate_target_ext(struct ctx *c, union flow *flow, int fd);
bool tcp_splice_flow_defer(struct tcp_splice_conn *conn);
void tcp_splice_timer(const struct ctx *c, struct tcp_splice_conn *conn);
int tcp_conn_pool_sock(int pool[]);
--
2.48.1
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [PATCH v10 09/10] fixup: Reset SO_PEEK_OFF value after incoming migration
2025-02-06 5:49 [PATCH v10 00/10] Draft state migration David Gibson
` (7 preceding siblings ...)
2025-02-06 5:49 ` [PATCH v10 08/10] migrate: Migrate TCP flows David Gibson
@ 2025-02-06 5:49 ` David Gibson
2025-02-06 5:49 ` [PATCH v10 10/10] test: Add migrate/basic tests David Gibson
9 siblings, 0 replies; 12+ messages in thread
From: David Gibson @ 2025-02-06 5:49 UTC (permalink / raw)
To: Stefano Brivio, passt-dev; +Cc: David Gibson
TCP code needs to do different things depending on whether SO_PEEK_OFF
is enabled on the socket, and it assumes it is already set if it is
supported. However, we didn't re-enable it on sockets created for an
incoming migration.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
tcp.c | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/tcp.c b/tcp.c
index 885ba3af..5b389ef8 100644
--- a/tcp.c
+++ b/tcp.c
@@ -3076,6 +3076,7 @@ int tcp_flow_migrate_target_ext(struct ctx *c, union flow *flow, int fd)
{
struct tcp_tap_conn *conn = &flow->tcp;
struct tcp_tap_transfer_ext t;
+ uint32_t peek_offset;
int s = conn->sock;
if (read_all_buf(fd, &t, sizeof(t)))
@@ -3104,5 +3105,9 @@ int tcp_flow_migrate_target_ext(struct ctx *c, union flow *flow, int fd)
tcp_flow_repair_off(c, conn);
+ peek_offset = conn->seq_to_tap - conn->seq_ack_from_tap;
+ if (tcp_set_peek_offset(conn->sock, peek_offset))
+ tcp_rst(c, conn);
+
return 0;
}
--
@@ -3076,6 +3076,7 @@ int tcp_flow_migrate_target_ext(struct ctx *c, union flow *flow, int fd)
{
struct tcp_tap_conn *conn = &flow->tcp;
struct tcp_tap_transfer_ext t;
+ uint32_t peek_offset;
int s = conn->sock;
if (read_all_buf(fd, &t, sizeof(t)))
@@ -3104,5 +3105,9 @@ int tcp_flow_migrate_target_ext(struct ctx *c, union flow *flow, int fd)
tcp_flow_repair_off(c, conn);
+ peek_offset = conn->seq_to_tap - conn->seq_ack_from_tap;
+ if (tcp_set_peek_offset(conn->sock, peek_offset))
+ tcp_rst(c, conn);
+
return 0;
}
--
2.48.1
^ permalink raw reply related [flat|nested] 12+ messages in thread
* [PATCH v10 10/10] test: Add migrate/basic tests
2025-02-06 5:49 [PATCH v10 00/10] Draft state migration David Gibson
` (8 preceding siblings ...)
2025-02-06 5:49 ` [PATCH v10 09/10] fixup: Reset SO_PEEK_OFF value after incoming migration David Gibson
@ 2025-02-06 5:49 ` David Gibson
9 siblings, 0 replies; 12+ messages in thread
From: David Gibson @ 2025-02-06 5:49 UTC (permalink / raw)
To: Stefano Brivio, passt-dev; +Cc: David Gibson
From: Stefano Brivio <sbrivio@redhat.com>
PCAP=1 ./run migrate/basic is oddly satisfying.
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Message-ID: <20250205230919.205302-7-sbrivio@redhat.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
test/lib/layout | 55 +++++++++++++++++++-
test/lib/setup | 127 +++++++++++++++++++++++++++++++++++++++++++++
test/lib/test | 3 ++
test/migrate/basic | 54 +++++++++++++++++++
test/run | 4 ++
5 files changed, 242 insertions(+), 1 deletion(-)
create mode 100644 test/migrate/basic
diff --git a/test/lib/layout b/test/lib/layout
index 4d035728..fddcdc4a 100644
--- a/test/lib/layout
+++ b/test/lib/layout
@@ -134,6 +134,54 @@ layout_two_guests() {
get_info_cols
+ pane_watch_contexts ${PANE_GUEST_1} "guest #1 in namespace #1" qemu_1 guest_1
+ pane_watch_contexts ${PANE_GUEST_2} "guest #2 in namespace #1" qemu_2 guest_2
+
+ tmux send-keys -l -t ${PANE_INFO} 'while cat '"$STATEBASE/log_pipe"'; do :; done'
+ tmux send-keys -t ${PANE_INFO} -N 100 C-m
+ tmux select-pane -t ${PANE_INFO} -T "test log"
+
+ pane_watch_contexts ${PANE_HOST} host host
+ pane_watch_contexts ${PANE_PASST_1} "passt #1 in namespace #1" pasta_1 passt_1
+ pane_watch_contexts ${PANE_PASST_2} "passt #2 in namespace #1" pasta_1 passt_2
+
+ info_layout "two guests, two passt instances, in namespaces"
+
+ sleep 1
+}
+
+# layout_migrate() - Two guest panes, two passt panes, two passt-repair panes,
+# plus host and log
+layout_migrate() {
+ sleep 1
+
+ tmux kill-pane -a -t 0
+ cmd_write 0 clear
+
+ tmux split-window -v -t passt_test
+ tmux split-window -h -l '33%'
+ tmux split-window -h -t passt_test:1.1
+
+ tmux split-window -h -l '35%' -t passt_test:1.0
+ tmux split-window -v -t passt_test:1.0
+
+ tmux split-window -v -t passt_test:1.4
+ tmux split-window -v -t passt_test:1.6
+
+ tmux split-window -v -t passt_test:1.3
+
+ PANE_GUEST_1=0
+ PANE_GUEST_2=1
+ PANE_INFO=2
+ PANE_MON=3
+ PANE_HOST=4
+ PANE_PASST_REPAIR_1=5
+ PANE_PASST_1=6
+ PANE_PASST_REPAIR_2=7
+ PANE_PASST_2=8
+
+ get_info_cols
+
pane_watch_contexts ${PANE_GUEST_1} "guest #1 in namespace #1" qemu_1 guest_1
pane_watch_contexts ${PANE_GUEST_2} "guest #2 in namespace #2" qemu_2 guest_2
@@ -141,11 +189,16 @@ layout_two_guests() {
tmux send-keys -t ${PANE_INFO} -N 100 C-m
tmux select-pane -t ${PANE_INFO} -T "test log"
+ pane_watch_contexts ${PANE_MON} "QEMU monitor" mon mon
+
pane_watch_contexts ${PANE_HOST} host host
+ pane_watch_contexts ${PANE_PASST_REPAIR_1} "passt-repair #1 in namespace #1" repair_1 passt_repair_1
pane_watch_contexts ${PANE_PASST_1} "passt #1 in namespace #1" pasta_1 passt_1
+
+ pane_watch_contexts ${PANE_PASST_REPAIR_2} "passt-repair #2 in namespace #2" repair_2 passt_repair_2
pane_watch_contexts ${PANE_PASST_2} "passt #2 in namespace #2" pasta_2 passt_2
- info_layout "two guests, two passt instances, in namespaces"
+ info_layout "two guests, two passt + passt-repair instances, in namespaces"
sleep 1
}
diff --git a/test/lib/setup b/test/lib/setup
index 580825f1..36aa0c41 100755
--- a/test/lib/setup
+++ b/test/lib/setup
@@ -305,6 +305,110 @@ setup_two_guests() {
context_setup_guest guest_2 ${GUEST_2_CID}
}
+# setup_migrate() - Set up a namespace, run two qemu instances, each with passt and passt-repair
+setup_migrate() {
+ context_setup_host host
+ context_setup_host mon
+ context_setup_host pasta_1
+ context_setup_host pasta_2
+
+ layout_migrate
+
+ # Ports:
+ #
+ # guest #1 | guest #2 | ns #1 | host
+ # --------- |-----------|-----------|------------
+ # 10001 as server | | to guest | to ns #1
+ # 10002 | | as server | to ns #1
+ # 10003 | | to init | as server
+ # 10004 | as server | to guest | to ns #1
+
+ __opts=
+ [ ${PCAP} -eq 1 ] && __opts="${__opts} -p ${LOGDIR}/pasta_1.pcap"
+ [ ${DEBUG} -eq 1 ] && __opts="${__opts} -d"
+ [ ${TRACE} -eq 1 ] && __opts="${__opts} --trace"
+ # Option 1: send stuff via spliced path in pasta
+ # context_run_bg pasta_1 "./pasta ${__opts} --trace -l /tmp/pasta1.log -P ${STATESETUP}/pasta_1.pid -t 10001,10002 -T 10003 -u 10001,10002 -U 10003 --config-net ${NSTOOL} hold ${STATESETUP}/ns1.hold"
+ # Option 2: send stuff via tap (--map-guest-addr) instead (useful to see capture of full migration)
+ context_run_bg pasta_1 "./pasta ${__opts} --trace -l /tmp/pasta1.log -P ${STATESETUP}/pasta_1.pid -t 10001,10002,10004 -T 10003 -u 10001,10002,10004 -U 10003 --map-guest-addr 169.254.1.1 --config-net ${NSTOOL} hold ${STATESETUP}/ns1.hold"
+ context_setup_nstool passt_1 ${STATESETUP}/ns1.hold
+ context_setup_nstool passt_repair_1 ${STATESETUP}/ns1.hold
+
+ context_setup_nstool passt_2 ${STATESETUP}/ns1.hold
+ context_setup_nstool passt_repair_2 ${STATESETUP}/ns1.hold
+
+ context_setup_nstool qemu_1 ${STATESETUP}/ns1.hold
+ context_setup_nstool qemu_2 ${STATESETUP}/ns1.hold
+
+ __ifname="$(context_run qemu_1 "ip -j link show | jq -rM '.[] | select(.link_type == \"ether\").ifname'")"
+
+ sleep 1
+
+ __opts="--vhost-user"
+ [ ${PCAP} -eq 1 ] && __opts="${__opts} -p ${LOGDIR}/passt_1.pcap"
+ [ ${DEBUG} -eq 1 ] && __opts="${__opts} -d"
+ [ ${TRACE} -eq 1 ] && __opts="${__opts} --trace"
+
+ context_run_bg passt_1 "./passt -s ${STATESETUP}/passt_1.socket -P ${STATESETUP}/passt_1.pid -f ${__opts} -t 10001 -u 10001"
+ wait_for [ -f "${STATESETUP}/passt_1.pid" ]
+
+ context_run_bg passt_repair_1 "./passt-repair ${STATESETUP}/passt_1.socket.repair"
+
+ __opts="--vhost-user"
+ [ ${PCAP} -eq 1 ] && __opts="${__opts} -p ${LOGDIR}/passt_2.pcap"
+ [ ${DEBUG} -eq 1 ] && __opts="${__opts} -d"
+ [ ${TRACE} -eq 1 ] && __opts="${__opts} --trace"
+
+ context_run_bg passt_2 "./passt -s ${STATESETUP}/passt_2.socket -P ${STATESETUP}/passt_2.pid -f ${__opts} -t 10004 -u 10004"
+ wait_for [ -f "${STATESETUP}/passt_2.pid" ]
+
+ context_run_bg passt_repair_2 "./passt-repair ${STATESETUP}/passt_2.socket.repair"
+
+ __vmem="512M" # Keep migration fast
+ __qemu_netdev1=" \
+ -chardev socket,id=c,path=${STATESETUP}/passt_1.socket \
+ -netdev vhost-user,id=v,chardev=c \
+ -device virtio-net,netdev=v \
+ -object memory-backend-memfd,id=m,share=on,size=${__vmem} \
+ -numa node,memdev=m"
+ __qemu_netdev2=" \
+ -chardev socket,id=c,path=${STATESETUP}/passt_2.socket \
+ -netdev vhost-user,id=v,chardev=c \
+ -device virtio-net,netdev=v \
+ -object memory-backend-memfd,id=m,share=on,size=${__vmem} \
+ -numa node,memdev=m"
+
+ GUEST_1_CID=94557
+ context_run_bg qemu_1 'qemu-system-'"${QEMU_ARCH}" \
+ ' -M accel=kvm:tcg' \
+ ' -m '${__vmem}' -cpu host -smp '${VCPUS} \
+ ' -kernel '"${KERNEL}" \
+ ' -initrd '${INITRAMFS}' -nographic -serial stdio' \
+ ' -nodefaults' \
+ ' -append "console=ttyS0 mitigations=off apparmor=0" ' \
+ " ${__qemu_netdev1}" \
+ " -pidfile ${STATESETUP}/qemu_1.pid" \
+ " -device vhost-vsock-pci,guest-cid=$GUEST_1_CID" \
+ " -monitor unix:${STATESETUP}/qemu_1_mon.sock,server,nowait"
+
+ GUEST_2_CID=94558
+ context_run_bg qemu_2 'qemu-system-'"${QEMU_ARCH}" \
+ ' -M accel=kvm:tcg' \
+ ' -m '${__vmem}' -cpu host -smp '${VCPUS} \
+ ' -kernel '"${KERNEL}" \
+ ' -initrd '${INITRAMFS}' -nographic -serial stdio' \
+ ' -nodefaults' \
+ ' -append "console=ttyS0 mitigations=off apparmor=0" ' \
+ " ${__qemu_netdev2}" \
+ " -pidfile ${STATESETUP}/qemu_2.pid" \
+ " -device vhost-vsock-pci,guest-cid=$GUEST_2_CID" \
+ " -monitor unix:${STATESETUP}/qemu_2_mon.sock,server,nowait" \
+ " -incoming tcp:0:20005"
+
+ context_setup_guest guest_1 ${GUEST_1_CID}
+ # Only available after migration: context_setup_guest guest_2 ${GUEST_2_CID}
+}
+
# teardown_context_watch() - Remove contexts and stop panes watching them
# $1: Pane number watching
# $@: Context names
@@ -384,6 +488,29 @@ teardown_two_guests() {
teardown_context_watch ${PANE_PASST_2} pasta_2 passt_2
}
+# teardown_migrate() - Exit namespace, kill qemu processes, passt and pasta
+teardown_migrate() {
+	${NSTOOL} exec ${STATESETUP}/ns1.hold -- kill $(cat "${STATESETUP}/qemu_1.pid")
+	${NSTOOL} exec ${STATESETUP}/ns1.hold -- kill $(cat "${STATESETUP}/qemu_2.pid")
+	context_wait qemu_1
+	context_wait qemu_2
+
+	# NOTE(review): only passt #2 is killed explicitly, passt #1 is presumably
+	# expected to have terminated on its own by now - confirm
+	${NSTOOL} exec ${STATESETUP}/ns1.hold -- kill $(cat "${STATESETUP}/passt_2.pid")
+	context_wait passt_1
+	context_wait passt_2
+	${NSTOOL} stop "${STATESETUP}/ns1.hold"
+	context_wait pasta_1
+
+	# Patterns need to be outside quotes, or they won't be expanded
+	rm -f "${STATESETUP}"/passt_[12].pid "${STATESETUP}"/pasta_[12].pid
+
+	teardown_context_watch ${PANE_HOST} host
+
+	teardown_context_watch ${PANE_GUEST_1} qemu_1 guest_1
+	teardown_context_watch ${PANE_GUEST_2} qemu_2 guest_2
+	teardown_context_watch ${PANE_PASST_1} pasta_1 passt_1
+	teardown_context_watch ${PANE_PASST_2} pasta_1 passt_2
+}
+
# teardown_demo_passt() - Exit namespace, kill qemu, passt and pasta
teardown_demo_passt() {
tmux send-keys -t ${PANE_GUEST} "C-c"
diff --git a/test/lib/test b/test/lib/test
index 91729af7..6e88562d 100755
--- a/test/lib/test
+++ b/test/lib/test
@@ -177,6 +177,9 @@ test_one_line() {
"guest2w")
pane_or_context_wait guest_2 || TEST_ONE_nok=1
;;
+ "mon")
+ pane_or_context_run mon "${__arg}" || TEST_ONE_nok=1
+ ;;
"ns")
pane_or_context_run ns "${__arg}" || TEST_ONE_nok=1
;;
diff --git a/test/migrate/basic b/test/migrate/basic
new file mode 100644
index 00000000..0aaddd40
--- /dev/null
+++ b/test/migrate/basic
@@ -0,0 +1,54 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+#
+# PASST - Plug A Simple Socket Transport
+# for qemu/UNIX domain socket mode
+#
+# PASTA - Pack A Subtle Tap Abstraction
+# for network namespace/tap device mode
+#
+# test/migrate/basic - Check basic migration functionality
+#
+# Copyright (c) 2025 Red Hat GmbH
+# Author: Stefano Brivio <sbrivio@redhat.com>
+
+g1tools ip jq dhclient socat cat
+htools ip jq
+
+test Interface name
+g1out IFNAME1 ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
+hout HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
+hout HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
+check [ -n "__IFNAME1__" ]
+
+test DHCP: address
+guest1 ip link set dev __IFNAME1__ up
+guest1 /sbin/dhclient -4 __IFNAME1__
+g1out ADDR1 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME1__").addr_info[0].local'
+hout HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local'
+check [ "__ADDR1__" = "__HOST_ADDR__" ]
+
+test DHCPv6: address
+# Link is up now, wait for DAD to complete
+guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
+guest1 /sbin/dhclient -6 __IFNAME1__
+# Wait for DAD to complete on the DHCP address
+guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
+g1out ADDR1_6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME1__").addr_info[] | select(.prefixlen == 128).local] | .[0]'
+hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
+check [ "__ADDR1_6__" = "__HOST_ADDR6__" ]
+
+test TCP/IPv4: guest1/guest2 > host
+g1out GW1 ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway'
+hostb socat -u TCP4-LISTEN:10006 OPEN:msg,create,trunc
+sleep 1
+# Option 1: via spliced path in pasta, namespace to host
+# guest1b { printf "Hello from guest 1"; sleep 10; printf " and from guest 2\n"; } | socat -u STDIN TCP4:__GW1__:10003
+# Option 2: via --map-guest-addr (tap) in pasta, namespace to host
+guest1b { printf "Hello from guest 1"; sleep 3; printf " and from guest 2\n"; } | socat -u STDIN TCP4:169.254.1.1:10006
+sleep 1
+
+mon echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock
+
+hostw
+hout MSG cat msg
+check [ "__MSG__" = "Hello from guest 1 and from guest 2" ]
diff --git a/test/run b/test/run
index fc710475..e5dd2f56 100755
--- a/test/run
+++ b/test/run
@@ -130,6 +130,10 @@ run() {
test two_guests_vu/basic
teardown two_guests
+ setup migrate
+ test migrate/basic
+ teardown migrate
+
VALGRIND=0
VHOST_USER=0
#setup passt_in_ns
--
@@ -130,6 +130,10 @@ run() {
test two_guests_vu/basic
teardown two_guests
+ setup migrate
+ test migrate/basic
+ teardown migrate
+
VALGRIND=0
VHOST_USER=0
#setup passt_in_ns
--
2.48.1
^ permalink raw reply related [flat|nested] 12+ messages in thread