From mboxrd@z Thu Jan 1 00:00:00 1970 Received: by passt.top (Postfix, from userid 1000) id 52EDB5A0625; Tue, 28 Jan 2025 00:15:32 +0100 (CET) From: Stefano Brivio To: passt-dev@passt.top Subject: [PATCH 6/7] Introduce facilities for guest migration on top of vhost-user infrastructure Date: Tue, 28 Jan 2025 00:15:31 +0100 Message-ID: <20250127231532.672363-7-sbrivio@redhat.com> X-Mailer: git-send-email 2.43.0 In-Reply-To: <20250127231532.672363-1-sbrivio@redhat.com> References: <20250127231532.672363-1-sbrivio@redhat.com> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit Message-ID-Hash: EXBPKZVXQ56W53UANZ5U55PE3UEAZ5JB X-Message-ID-Hash: EXBPKZVXQ56W53UANZ5U55PE3UEAZ5JB X-MailFrom: sbrivio@passt.top X-Mailman-Rule-Misses: dmarc-mitigation; no-senders; approved; emergency; loop; banned-address; member-moderation; nonmember-moderation; administrivia; implicit-dest; max-recipients; max-size; news-moderation; no-subject; digests; suspicious-header CC: Laurent Vivier , David Gibson X-Mailman-Version: 3.3.8 Precedence: list List-Id: Development discussion and patches for passt Archived-At: Archived-At: List-Archive: List-Archive: List-Help: List-Owner: List-Post: List-Subscribe: List-Unsubscribe: Add two sets (source or target) of three functions each for passt in vhost-user mode, triggered by activity on the file descriptor passed via VHOST_USER_PROTOCOL_F_DEVICE_STATE: - migrate_source_pre() and migrate_target_pre() are called to prepare for migration, before data is transferred - migrate_source() sends, and migrate_target() receives migration data - migrate_source_post() and migrate_target_post() are responsible for any post-migration task Callbacks are added to these functions with arrays of function pointers in migrate.c. Migration handlers are versioned. Versioned descriptions of data sections will be added to the data_versions array, which points to versioned iovec arrays. Version 1 is currently empty and will be filled in in subsequent patches. The source announces the data version to be used and informs the peer about endianness, and the size of void *, time_t, flow entries and flow hash table entries. The target checks if the version of the source is still supported. If it's not, it aborts the migration. Signed-off-by: Stefano Brivio --- Makefile | 12 +-- migrate.c | 259 ++++++++++++++++++++++++++++++++++++++++++++++++++++ migrate.h | 90 ++++++++++++++++++ passt.c | 2 +- vu_common.c | 122 ++++++++++++++++--------- vu_common.h | 2 +- 6 files changed, 438 insertions(+), 49 deletions(-) create mode 100644 migrate.c create mode 100644 migrate.h diff --git a/Makefile b/Makefile index 464eef1..1383875 100644 --- a/Makefile +++ b/Makefile @@ -38,8 +38,8 @@ FLAGS += -DDUAL_STACK_SOCKETS=$(DUAL_STACK_SOCKETS) PASST_SRCS = arch.c arp.c checksum.c conf.c dhcp.c dhcpv6.c flow.c fwd.c \ icmp.c igmp.c inany.c iov.c ip.c isolation.c lineread.c log.c mld.c \ - ndp.c netlink.c packet.c passt.c pasta.c pcap.c pif.c tap.c tcp.c \ - tcp_buf.c tcp_splice.c tcp_vu.c udp.c udp_flow.c udp_vu.c util.c \ + ndp.c netlink.c migrate.c packet.c passt.c pasta.c pcap.c pif.c tap.c \ + tcp.c tcp_buf.c tcp_splice.c tcp_vu.c udp.c udp_flow.c udp_vu.c util.c \ vhost_user.c virtio.c vu_common.c QRAP_SRCS = qrap.c SRCS = $(PASST_SRCS) $(QRAP_SRCS) @@ -48,10 +48,10 @@ MANPAGES = passt.1 pasta.1 qrap.1 PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h flow.h fwd.h \ flow_table.h icmp.h icmp_flow.h inany.h iov.h ip.h isolation.h \ - lineread.h log.h ndp.h netlink.h packet.h passt.h pasta.h pcap.h pif.h \ - siphash.h tap.h tcp.h tcp_buf.h tcp_conn.h tcp_internal.h tcp_splice.h \ - tcp_vu.h udp.h udp_flow.h udp_internal.h udp_vu.h util.h vhost_user.h \ - virtio.h vu_common.h + lineread.h log.h migrate.h ndp.h netlink.h packet.h passt.h pasta.h \ + pcap.h pif.h siphash.h tap.h tcp.h tcp_buf.h tcp_conn.h tcp_internal.h \ + tcp_splice.h tcp_vu.h udp.h udp_flow.h udp_internal.h udp_vu.h util.h \ + vhost_user.h virtio.h vu_common.h HEADERS = $(PASST_HEADERS) seccomp.h C := \#include \nint main(){int a=getrandom(0, 0, 0);} diff --git a/migrate.c b/migrate.c new file mode 100644 index 0000000..bee9653 --- /dev/null +++ b/migrate.c @@ -0,0 +1,259 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +/* PASST - Plug A Simple Socket Transport + * for qemu/UNIX domain socket mode + * + * PASTA - Pack A Subtle Tap Abstraction + * for network namespace/tap device mode + * + * migrate.c - Migration sections, layout, and routines + * + * Copyright (c) 2025 Red Hat GmbH + * Author: Stefano Brivio + */ + +#include +#include + +#include "util.h" +#include "ip.h" +#include "passt.h" +#include "inany.h" +#include "flow.h" +#include "flow_table.h" + +#include "migrate.h" + +/* Current version of migration data */ +#define MIGRATE_VERSION 1 + +/* Magic as we see it and as seen with reverse endianness */ +#define MIGRATE_MAGIC 0xB1BB1D1B0BB1D1B0 +#define MIGRATE_MAGIC_SWAPPED 0xB0D1B10B1B1DBBB1 + +/* Migration header to send from source */ +static union migrate_header header = { + .magic = MIGRATE_MAGIC, + .version = htonl_constant(MIGRATE_VERSION), + .time_t_size = htonl_constant(sizeof(time_t)), + .flow_size = htonl_constant(sizeof(union flow)), + .flow_sidx_size = htonl_constant(sizeof(struct flow_sidx)), + .voidp_size = htonl_constant(sizeof(void *)), +}; + +/* Data sections for version 1 */ +static struct iovec sections_v1[] = { + { &header, sizeof(header) }, +}; + +/* Set of data versions */ +static struct migrate_data data_versions[] = { + { + 1, sections_v1, + }, + { 0 }, +}; + +/* Handlers to call in source before sending data */ +struct migrate_handler handlers_source_pre[] = { + { 0 }, +}; + +/* Handlers to call in source after sending data */ +struct migrate_handler handlers_source_post[] = { + { 0 }, +}; + +/* Handlers to call in target before receiving data with version 1 */ +struct migrate_handler handlers_target_pre_v1[] = { + { 0 }, +}; + +/* Handlers to call in target after receiving data with version 1 */ +struct migrate_handler handlers_target_post_v1[] = { + { 0 }, +}; + +/* Versioned sets of migration handlers */ +struct migrate_target_handlers target_handlers[] = { + { + 1, + handlers_target_pre_v1, + handlers_target_post_v1, + }, + { 0 }, +}; + +/** + * migrate_source_pre() - Pre-migration tasks as source + * @m: Migration metadata + * + * Return: 0 on success, error code on failure + */ +int migrate_source_pre(struct migrate_meta *m) +{ + struct migrate_handler *h; + + for (h = handlers_source_pre; h->fn; h++) { + int rc; + + if ((rc = h->fn(m, h->data))) + return rc; + } + + return 0; +} + +/** + * migrate_source() - Perform migration as source: send state to hypervisor + * @fd: Descriptor for state transfer + * @m: Migration metadata + * + * Return: 0 on success, error code on failure + */ +int migrate_source(int fd, const struct migrate_meta *m) +{ + static struct migrate_data *d; + unsigned count; + int rc; + + for (d = data_versions; d->v != MIGRATE_VERSION; d++); + + for (count = 0; d->sections[count].iov_len; count++); + + debug("Writing %u migration sections", count - 1 /* minus header */); + rc = write_remainder(fd, d->sections, count, 0); + if (rc < 0) + return errno; + + return 0; +} + +/** + * migrate_source_post() - Post-migration tasks as source + * @m: Migration metadata + * + * Return: 0 on success, error code on failure + */ +void migrate_source_post(struct migrate_meta *m) +{ + struct migrate_handler *h; + + for (h = handlers_source_post; h->fn; h++) + h->fn(m, h->data); +} + +/** + * migrate_target_read_header() - Set metadata in target from source header + * @fd: Descriptor for state transfer + * @m: Migration metadata, filled on return + * + * Return: 0 on success, error code on failure + */ +int migrate_target_read_header(int fd, struct migrate_meta *m) +{ + static struct migrate_data *d; + union migrate_header h; + + if (read_all_buf(fd, &h, sizeof(h))) + return errno; + + debug("Source magic: 0x%016" PRIx64 ", sizeof(void *): %u, version: %u", + h.magic, ntohl(h.voidp_size), ntohl(h.version)); + + for (d = data_versions; d->v != ntohl(h.version); d++); + if (!d->v) + return ENOTSUP; + m->v = d->v; + + if (h.magic == MIGRATE_MAGIC) + m->bswap = false; + else if (h.magic == MIGRATE_MAGIC_SWAPPED) + m->bswap = true; + else + return ENOTSUP; + + if (ntohl(h.voidp_size) == 4) + m->source_64b = false; + else if (ntohl(h.voidp_size) == 8) + m->source_64b = true; + else + return ENOTSUP; + + if (ntohl(h.time_t_size) == 4) + m->time_64b = false; + else if (ntohl(h.time_t_size) == 8) + m->time_64b = true; + else + return ENOTSUP; + + m->flow_size = ntohl(h.flow_size); + m->flow_sidx_size = ntohl(h.flow_sidx_size); + + return 0; +} + +/** + * migrate_target_pre() - Pre-migration tasks as target + * @m: Migration metadata + * + * Return: 0 on success, error code on failure + */ +int migrate_target_pre(struct migrate_meta *m) +{ + struct migrate_target_handlers *th; + struct migrate_handler *h; + + for (th = target_handlers; th->v != m->v && th->v; th++); + + for (h = th->pre; h->fn; h++) { + int rc; + + if ((rc = h->fn(m, h->data))) + return rc; + } + + return 0; +} + +/** + * migrate_target() - Perform migration as target: receive state from hypervisor + * @fd: Descriptor for state transfer + * @m: Migration metadata + * + * Return: 0 on success, error code on failure + * + * #syscalls:vu readv + */ +int migrate_target(int fd, const struct migrate_meta *m) +{ + static struct migrate_data *d; + unsigned cnt; + int rc; + + for (d = data_versions; d->v != m->v && d->v; d++); + + for (cnt = 0; d->sections[cnt + 1 /* skip header */].iov_len; cnt++); + + debug("Reading %u migration sections", cnt); + rc = read_remainder(fd, d->sections + 1, cnt, 0); + if (rc < 0) + return errno; + + return 0; +} + +/** + * migrate_target_post() - Post-migration tasks as target + * @m: Migration metadata + */ +void migrate_target_post(struct migrate_meta *m) +{ + struct migrate_target_handlers *th; + struct migrate_handler *h; + + for (th = target_handlers; th->v != m->v && th->v; th++); + + for (h = th->post; h->fn; h++) + h->fn(m, h->data); +} diff --git a/migrate.h b/migrate.h new file mode 100644 index 0000000..5582f75 --- /dev/null +++ b/migrate.h @@ -0,0 +1,90 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later + * Copyright (c) 2025 Red Hat GmbH + * Author: Stefano Brivio + */ + +#ifndef MIGRATE_H +#define MIGRATE_H + +/** + * struct migrate_meta - Migration metadata + * @v: Chosen migration data version, host order + * @bswap: Source has opposite endianness + * @peer_64b: Source uses 64-bit void * + * @time_64b: Source uses 64-bit time_t + * @flow_size: Size of union flow in source + * @flow_sidx_size: Size of struct flow_sidx in source + */ +struct migrate_meta { + uint32_t v; + bool bswap; + bool source_64b; + bool time_64b; + size_t flow_size; + size_t flow_sidx_size; +}; + +/** + * union migrate_header - Migration header from source + * @magic: 0xB1BB1D1B0BB1D1B0, host order + * @version: Source sends highest known, target aborts if unsupported + * @voidp_size: sizeof(void *), network order + * @time_t_size: sizeof(time_t), network order + * @flow_size: sizeof(union flow), network order + * @flow_sidx_size: sizeof(struct flow_sidx_t), network order + * @unused: Go figure + */ +union migrate_header { + struct { + uint64_t magic; + uint32_t version; + uint32_t voidp_size; + uint32_t time_t_size; + uint32_t flow_size; + uint32_t flow_sidx_size; + }; + uint8_t unused[65536]; +}; + +/** + * struct migrate_data - Data sections for given source version + * @v: Source version this applies to, host order + * @sections: Array of data sections, NULL-terminated + */ +struct migrate_data { + uint32_t v; + struct iovec *sections; +}; + +/** + * struct migrate_handler - Function to handle a specific data section + * @fn: Function pointer taking pointer to data section + * @data: Associated data section + */ +struct migrate_handler { + int (*fn)(struct migrate_meta *m, void *data); + void *data; +}; + +/** + * struct migrate_target_handlers - Versioned sets of migration target handlers + * @v: Source version this applies to, host order + * @pre: Set of functions to execute in target before data copy + * @post: Set of functions to execute in target after data copy + */ +struct migrate_target_handlers { + uint32_t v; + struct migrate_handler *pre; + struct migrate_handler *post; +}; + +int migrate_source_pre(struct migrate_meta *m); +int migrate_source(int fd, const struct migrate_meta *m); +void migrate_source_post(struct migrate_meta *m); + +int migrate_target_read_header(int fd, struct migrate_meta *m); +int migrate_target_pre(struct migrate_meta *m); +int migrate_target(int fd, const struct migrate_meta *m); +void migrate_target_post(struct migrate_meta *m); + +#endif /* MIGRATE_H */ diff --git a/passt.c b/passt.c index b1c8ab6..184d4e5 100644 --- a/passt.c +++ b/passt.c @@ -358,7 +358,7 @@ loop: vu_kick_cb(c.vdev, ref, &now); break; case EPOLL_TYPE_VHOST_MIGRATION: - vu_migrate(c.vdev, eventmask); + vu_migrate(&c, eventmask); break; default: /* Can't happen */ diff --git a/vu_common.c b/vu_common.c index f43d8ac..0c67bd0 100644 --- a/vu_common.c +++ b/vu_common.c @@ -5,6 +5,7 @@ * common_vu.c - vhost-user common UDP and TCP functions */ +#include #include #include #include @@ -17,6 +18,7 @@ #include "vhost_user.h" #include "pcap.h" #include "vu_common.h" +#include "migrate.h" #define VU_MAX_TX_BUFFER_NB 2 @@ -305,50 +307,88 @@ err: } /** - * vu_migrate() - Send/receive passt insternal state to/from QEMU - * @vdev: vhost-user device + * vu_migrate_source() - Migration as source, send state to hypervisor + * @fd: File descriptor for state transfer + * + * Return: 0 on success, positive error code on failure + */ +static int vu_migrate_source(int fd) +{ + struct migrate_meta m; + int rc; + + if ((rc = migrate_source_pre(&m))) { + err("Source pre-migration failed: %s, abort", strerror_(rc)); + return rc; + } + + debug("Saving backend state"); + + rc = migrate_source(fd, &m); + if (rc) + err("Source migration failed: %s", strerror_(rc)); + else + migrate_source_post(&m); + + return rc; +} + +/** + * vu_migrate_target() - Migration as target, receive state from hypervisor + * @fd: File descriptor for state transfer + * + * Return: 0 on success, positive error code on failure + */ +static int vu_migrate_target(int fd) +{ + struct migrate_meta m; + int rc; + + rc = migrate_target_read_header(fd, &m); + if (rc) { + err("Migration header check failed: %s, abort", strerror_(rc)); + return rc; + } + + if ((rc = migrate_target_pre(&m))) { + err("Target pre-migration failed: %s, abort", strerror_(rc)); + return rc; + } + + debug("Loading backend state"); + + rc = migrate_target(fd, &m); + if (rc) + err("Target migration failed: %s", strerror_(rc)); + else + migrate_target_post(&m); + + return rc; +} + +/** + * vu_migrate() - Send/receive passt internal state to/from QEMU + * @c: Execution context * @events: epoll events */ -void vu_migrate(struct vu_dev *vdev, uint32_t events) +void vu_migrate(struct ctx *c, uint32_t events) { - int ret; + struct vu_dev *vdev = c->vdev; + int rc = EIO; - /* TODO: collect/set passt internal state - * and use vdev->device_state_fd to send/receive it - */ debug("vu_migrate fd %d events %x", vdev->device_state_fd, events); - if (events & EPOLLOUT) { - debug("Saving backend state"); - - /* send some stuff */ - ret = write(vdev->device_state_fd, "PASST", 6); - /* value to be returned by VHOST_USER_CHECK_DEVICE_STATE */ - vdev->device_state_result = ret == -1 ? -1 : 0; - /* Closing the file descriptor signals the end of transfer */ - epoll_ctl(vdev->context->epollfd, EPOLL_CTL_DEL, - vdev->device_state_fd, NULL); - close(vdev->device_state_fd); - vdev->device_state_fd = -1; - } else if (events & EPOLLIN) { - char buf[6]; - - debug("Loading backend state"); - /* read some stuff */ - ret = read(vdev->device_state_fd, buf, sizeof(buf)); - /* value to be returned by VHOST_USER_CHECK_DEVICE_STATE */ - if (ret != sizeof(buf)) { - vdev->device_state_result = -1; - } else { - ret = strncmp(buf, "PASST", sizeof(buf)); - vdev->device_state_result = ret == 0 ? 0 : -1; - } - } else if (events & EPOLLHUP) { - debug("Closing migration channel"); - - /* The end of file signals the end of the transfer. */ - epoll_ctl(vdev->context->epollfd, EPOLL_CTL_DEL, - vdev->device_state_fd, NULL); - close(vdev->device_state_fd); - vdev->device_state_fd = -1; - } + + if (events & EPOLLOUT) + rc = vu_migrate_source(vdev->device_state_fd); + else if (events & EPOLLIN) + rc = vu_migrate_target(vdev->device_state_fd); + + /* EPOLLHUP without EPOLLIN/EPOLLOUT, or EPOLLERR? Migration failed */ + + vdev->device_state_result = rc; + + epoll_ctl(c->epollfd, EPOLL_CTL_DEL, vdev->device_state_fd, NULL); + debug("Closing migration channel"); + close(vdev->device_state_fd); + vdev->device_state_fd = -1; } diff --git a/vu_common.h b/vu_common.h index d56c021..69c4006 100644 --- a/vu_common.h +++ b/vu_common.h @@ -57,5 +57,5 @@ void vu_flush(const struct vu_dev *vdev, struct vu_virtq *vq, void vu_kick_cb(struct vu_dev *vdev, union epoll_ref ref, const struct timespec *now); int vu_send_single(const struct ctx *c, const void *buf, size_t size); -void vu_migrate(struct vu_dev *vdev, uint32_t events); +void vu_migrate(struct ctx *c, uint32_t events); #endif /* VU_COMMON_H */ -- 2.43.0