On Tue, Jan 28, 2025 at 12:15:31AM +0100, Stefano Brivio wrote: > Add two sets (source or target) of three functions each for passt in > vhost-user mode, triggered by activity on the file descriptor passed > via VHOST_USER_PROTOCOL_F_DEVICE_STATE: > > - migrate_source_pre() and migrate_target_pre() are called to prepare > for migration, before data is transferred > > - migrate_source() sends, and migrate_target() receives migration data > > - migrate_source_post() and migrate_target_post() are responsible for > any post-migration task > > Callbacks are added to these functions with arrays of function > pointers in migrate.c. Migration handlers are versioned. > > Versioned descriptions of data sections will be added to the > data_versions array, which points to versioned iovec arrays. Version > 1 is currently empty and will be filled in in subsequent patches. > > The source announces the data version to be used and informs the peer > about endianness, and the size of void *, time_t, flow entries and > flow hash table entries. > > The target checks if the version of the source is still supported. If > it's not, it aborts the migration. > > Signed-off-by: Stefano Brivio > --- > Makefile | 12 +-- > migrate.c | 259 ++++++++++++++++++++++++++++++++++++++++++++++++++++ > migrate.h | 90 ++++++++++++++++++ > passt.c | 2 +- > vu_common.c | 122 ++++++++++++++++--------- > vu_common.h | 2 +- > 6 files changed, 438 insertions(+), 49 deletions(-) > create mode 100644 migrate.c > create mode 100644 migrate.h > > diff --git a/Makefile b/Makefile > index 464eef1..1383875 100644 > --- a/Makefile > +++ b/Makefile > @@ -38,8 +38,8 @@ FLAGS += -DDUAL_STACK_SOCKETS=$(DUAL_STACK_SOCKETS) > > PASST_SRCS = arch.c arp.c checksum.c conf.c dhcp.c dhcpv6.c flow.c fwd.c \ > icmp.c igmp.c inany.c iov.c ip.c isolation.c lineread.c log.c mld.c \ > - ndp.c netlink.c packet.c passt.c pasta.c pcap.c pif.c tap.c tcp.c \ > - tcp_buf.c tcp_splice.c tcp_vu.c udp.c udp_flow.c udp_vu.c util.c \ > + ndp.c netlink.c migrate.c packet.c passt.c pasta.c pcap.c pif.c tap.c \ > + tcp.c tcp_buf.c tcp_splice.c tcp_vu.c udp.c udp_flow.c udp_vu.c util.c \ > vhost_user.c virtio.c vu_common.c > QRAP_SRCS = qrap.c > SRCS = $(PASST_SRCS) $(QRAP_SRCS) > @@ -48,10 +48,10 @@ MANPAGES = passt.1 pasta.1 qrap.1 > > PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h flow.h fwd.h \ > flow_table.h icmp.h icmp_flow.h inany.h iov.h ip.h isolation.h \ > - lineread.h log.h ndp.h netlink.h packet.h passt.h pasta.h pcap.h pif.h \ > - siphash.h tap.h tcp.h tcp_buf.h tcp_conn.h tcp_internal.h tcp_splice.h \ > - tcp_vu.h udp.h udp_flow.h udp_internal.h udp_vu.h util.h vhost_user.h \ > - virtio.h vu_common.h > + lineread.h log.h migrate.h ndp.h netlink.h packet.h passt.h pasta.h \ > + pcap.h pif.h siphash.h tap.h tcp.h tcp_buf.h tcp_conn.h tcp_internal.h \ > + tcp_splice.h tcp_vu.h udp.h udp_flow.h udp_internal.h udp_vu.h util.h \ > + vhost_user.h virtio.h vu_common.h > HEADERS = $(PASST_HEADERS) seccomp.h > > C := \#include \nint main(){int a=getrandom(0, 0, 0);} > diff --git a/migrate.c b/migrate.c > new file mode 100644 > index 0000000..bee9653 > --- /dev/null > +++ b/migrate.c > @@ -0,0 +1,259 @@ > +// SPDX-License-Identifier: GPL-2.0-or-later > + > +/* PASST - Plug A Simple Socket Transport > + * for qemu/UNIX domain socket mode > + * > + * PASTA - Pack A Subtle Tap Abstraction > + * for network namespace/tap device mode > + * > + * migrate.c - Migration sections, layout, and routines > + * > + * Copyright (c) 2025 Red Hat GmbH > + * Author: Stefano Brivio > + */ > + > +#include > +#include > + > +#include "util.h" > +#include "ip.h" > +#include "passt.h" > +#include "inany.h" > +#include "flow.h" > +#include "flow_table.h" > + > +#include "migrate.h" > + > +/* Current version of migration data */ > +#define MIGRATE_VERSION 1 > + > +/* Magic as we see it and as seen with reverse endianness */ > +#define MIGRATE_MAGIC 0xB1BB1D1B0BB1D1B0 > +#define MIGRATE_MAGIC_SWAPPED 0xB0D1B10B1B1DBBB1 As noted, I'm hoping we can get rid of "either endian" migration. But if this stays, we should define it using __bswap_constant_32() to avoid embarrassing mistakes. > + > +/* Migration header to send from source */ > +static union migrate_header header = { > + .magic = MIGRATE_MAGIC, > + .version = htonl_constant(MIGRATE_VERSION), > + .time_t_size = htonl_constant(sizeof(time_t)), > + .flow_size = htonl_constant(sizeof(union flow)), > + .flow_sidx_size = htonl_constant(sizeof(struct flow_sidx)), > + .voidp_size = htonl_constant(sizeof(void *)), > +}; > + > +/* Data sections for version 1 */ > +static struct iovec sections_v1[] = { > + { &header, sizeof(header) }, > +}; > + > +/* Set of data versions */ > +static struct migrate_data data_versions[] = { > + { > + 1, sections_v1, > + }, > + { 0 }, > +}; > + > +/* Handlers to call in source before sending data */ > +struct migrate_handler handlers_source_pre[] = { > + { 0 }, > +}; > + > +/* Handlers to call in source after sending data */ > +struct migrate_handler handlers_source_post[] = { > + { 0 }, > +}; > + > +/* Handlers to call in target before receiving data with version 1 */ > +struct migrate_handler handlers_target_pre_v1[] = { > + { 0 }, > +}; > + > +/* Handlers to call in target after receiving data with version 1 */ > +struct migrate_handler handlers_target_post_v1[] = { > + { 0 }, > +}; > + > +/* Versioned sets of migration handlers */ > +struct migrate_target_handlers target_handlers[] = { > + { > + 1, > + handlers_target_pre_v1, > + handlers_target_post_v1, > + }, > + { 0 }, > +}; > + > +/** > + * migrate_source_pre() - Pre-migration tasks as source > + * @m: Migration metadata > + * > + * Return: 0 on success, error code on failure > + */ > +int migrate_source_pre(struct migrate_meta *m) > +{ > + struct migrate_handler *h; > + > + for (h = handlers_source_pre; h->fn; h++) { > + int rc; > + > + if ((rc = h->fn(m, h->data))) > + return rc; > + } > + > + return 0; > +} > + > +/** > + * migrate_source() - Perform migration as source: send state to hypervisor > + * @fd: Descriptor for state transfer > + * @m: Migration metadata > + * > + * Return: 0 on success, error code on failure > + */ > +int migrate_source(int fd, const struct migrate_meta *m) > +{ > + static struct migrate_data *d; > + unsigned count; > + int rc; > + > + for (d = data_versions; d->v != MIGRATE_VERSION; d++); Should ASSERT() if we don't find the version within the array. > + for (count = 0; d->sections[count].iov_len; count++); > + > + debug("Writing %u migration sections", count - 1 /* minus header */); > + rc = write_remainder(fd, d->sections, count, 0); > + if (rc < 0) > + return errno; > + > + return 0; > +} > + > +/** > + * migrate_source_post() - Post-migration tasks as source > + * @m: Migration metadata > + * > + * Return: 0 on success, error code on failure > + */ > +void migrate_source_post(struct migrate_meta *m) > +{ > + struct migrate_handler *h; > + > + for (h = handlers_source_post; h->fn; h++) > + h->fn(m, h->data); Is there actually anything we might need to do on the source after a successful migration, other than exit? > +} > + > +/** > + * migrate_target_read_header() - Set metadata in target from source header > + * @fd: Descriptor for state transfer > + * @m: Migration metadata, filled on return > + * > + * Return: 0 on success, error code on failure We nearly always use negative error codes. Why not here? > + */ > +int migrate_target_read_header(int fd, struct migrate_meta *m) > +{ > + static struct migrate_data *d; > + union migrate_header h; > + > + if (read_all_buf(fd, &h, sizeof(h))) > + return errno; > + > + debug("Source magic: 0x%016" PRIx64 ", sizeof(void *): %u, version: %u", > + h.magic, ntohl(h.voidp_size), ntohl(h.version)); > + > + for (d = data_versions; d->v != ntohl(h.version); d++); > + if (!d->v) > + return ENOTSUP; This is too late. The loop doesn't check it, so you've already overrun the data_versions table if the version wasn't in there. Easier to use an ARRAY_SIZE() limit in the loop, I think. > + m->v = d->v; > + > + if (h.magic == MIGRATE_MAGIC) > + m->bswap = false; > + else if (h.magic == MIGRATE_MAGIC_SWAPPED) > + m->bswap = true; > + else > + return ENOTSUP; > + > + if (ntohl(h.voidp_size) == 4) > + m->source_64b = false; > + else if (ntohl(h.voidp_size) == 8) > + m->source_64b = true; > + else > + return ENOTSUP; > + > + if (ntohl(h.time_t_size) == 4) > + m->time_64b = false; > + else if (ntohl(h.time_t_size) == 8) > + m->time_64b = true; > + else > + return ENOTSUP; > + > + m->flow_size = ntohl(h.flow_size); > + m->flow_sidx_size = ntohl(h.flow_sidx_size); > + > + return 0; > +} > + > +/** > + * migrate_target_pre() - Pre-migration tasks as target > + * @m: Migration metadata > + * > + * Return: 0 on success, error code on failure > + */ > +int migrate_target_pre(struct migrate_meta *m) > +{ > + struct migrate_target_handlers *th; > + struct migrate_handler *h; > + > + for (th = target_handlers; th->v != m->v && th->v; th++); > + > + for (h = th->pre; h->fn; h++) { > + int rc; > + > + if ((rc = h->fn(m, h->data))) > + return rc; > + } > + > + return 0; > +} > + > +/** > + * migrate_target() - Perform migration as target: receive state from hypervisor > + * @fd: Descriptor for state transfer > + * @m: Migration metadata > + * > + * Return: 0 on success, error code on failure > + * > + * #syscalls:vu readv > + */ > +int migrate_target(int fd, const struct migrate_meta *m) > +{ > + static struct migrate_data *d; > + unsigned cnt; > + int rc; > + > + for (d = data_versions; d->v != m->v && d->v; d++); > + > + for (cnt = 0; d->sections[cnt + 1 /* skip header */].iov_len; cnt++); > + > + debug("Reading %u migration sections", cnt); > + rc = read_remainder(fd, d->sections + 1, cnt, 0); > + if (rc < 0) > + return errno; > + > + return 0; > +} > + > +/** > + * migrate_target_post() - Post-migration tasks as target > + * @m: Migration metadata > + */ > +void migrate_target_post(struct migrate_meta *m) > +{ > + struct migrate_target_handlers *th; > + struct migrate_handler *h; > + > + for (th = target_handlers; th->v != m->v && th->v; th++); > + > + for (h = th->post; h->fn; h++) > + h->fn(m, h->data); > +} > diff --git a/migrate.h b/migrate.h > new file mode 100644 > index 0000000..5582f75 > --- /dev/null > +++ b/migrate.h > @@ -0,0 +1,90 @@ > +/* SPDX-License-Identifier: GPL-2.0-or-later > + * Copyright (c) 2025 Red Hat GmbH > + * Author: Stefano Brivio > + */ > + > +#ifndef MIGRATE_H > +#define MIGRATE_H > + > +/** > + * struct migrate_meta - Migration metadata > + * @v: Chosen migration data version, host order > + * @bswap: Source has opposite endianness > + * @peer_64b: Source uses 64-bit void * > + * @time_64b: Source uses 64-bit time_t > + * @flow_size: Size of union flow in source > + * @flow_sidx_size: Size of struct flow_sidx in source > + */ > +struct migrate_meta { > + uint32_t v; > + bool bswap; > + bool source_64b; > + bool time_64b; > + size_t flow_size; > + size_t flow_sidx_size; > +}; > + > +/** > + * union migrate_header - Migration header from source > + * @magic: 0xB1BB1D1B0BB1D1B0, host order > + * @version: Source sends highest known, target aborts if unsupported > + * @voidp_size: sizeof(void *), network order > + * @time_t_size: sizeof(time_t), network order > + * @flow_size: sizeof(union flow), network order > + * @flow_sidx_size: sizeof(struct flow_sidx_t), network order > + * @unused: Go figure > + */ > +union migrate_header { > + struct { > + uint64_t magic; > + uint32_t version; > + uint32_t voidp_size; > + uint32_t time_t_size; > + uint32_t flow_size; > + uint32_t flow_sidx_size; > + }; > + uint8_t unused[65536]; So, having looked at this, I no longer think padding the header to 64kiB is a good idea. The structure means we're basically stuck always having that chunky header. Instead, I think the header should be absolutely minimal: basically magic and version only. v1 (and maybe others) can add a "metadata" or whatever section for additional information like this they need. > +}; > + > +/** > + * struct migrate_data - Data sections for given source version > + * @v: Source version this applies to, host order > + * @sections: Array of data sections, NULL-terminated > + */ > +struct migrate_data { > + uint32_t v; > + struct iovec *sections; > +}; > + > +/** > + * struct migrate_handler - Function to handle a specific data section > + * @fn: Function pointer taking pointer to data section > + * @data: Associated data section > + */ > +struct migrate_handler { > + int (*fn)(struct migrate_meta *m, void *data); > + void *data; > +}; > + > +/** > + * struct migrate_target_handlers - Versioned sets of migration target handlers > + * @v: Source version this applies to, host order > + * @pre: Set of functions to execute in target before data copy > + * @post: Set of functions to execute in target after data copy > + */ > +struct migrate_target_handlers { > + uint32_t v; > + struct migrate_handler *pre; > + struct migrate_handler *post; > +}; > + > +int migrate_source_pre(struct migrate_meta *m); > +int migrate_source(int fd, const struct migrate_meta *m); > +void migrate_source_post(struct migrate_meta *m); > + > +int migrate_target_read_header(int fd, struct migrate_meta *m); > +int migrate_target_pre(struct migrate_meta *m); > +int migrate_target(int fd, const struct migrate_meta *m); > +void migrate_target_post(struct migrate_meta *m); > + > +#endif /* MIGRATE_H */ > diff --git a/passt.c b/passt.c > index b1c8ab6..184d4e5 100644 > --- a/passt.c > +++ b/passt.c > @@ -358,7 +358,7 @@ loop: > vu_kick_cb(c.vdev, ref, &now); > break; > case EPOLL_TYPE_VHOST_MIGRATION: > - vu_migrate(c.vdev, eventmask); > + vu_migrate(&c, eventmask); > break; > default: > /* Can't happen */ > diff --git a/vu_common.c b/vu_common.c > index f43d8ac..0c67bd0 100644 > --- a/vu_common.c > +++ b/vu_common.c > @@ -5,6 +5,7 @@ > * common_vu.c - vhost-user common UDP and TCP functions > */ > > +#include > #include > #include > #include > @@ -17,6 +18,7 @@ > #include "vhost_user.h" > #include "pcap.h" > #include "vu_common.h" > +#include "migrate.h" > > #define VU_MAX_TX_BUFFER_NB 2 > > @@ -305,50 +307,88 @@ err: > } > > /** > - * vu_migrate() - Send/receive passt insternal state to/from QEMU > - * @vdev: vhost-user device > + * vu_migrate_source() - Migration as source, send state to hypervisor > + * @fd: File descriptor for state transfer > + * > + * Return: 0 on success, positive error code on failure > + */ > +static int vu_migrate_source(int fd) > +{ > + struct migrate_meta m; > + int rc; > + > + if ((rc = migrate_source_pre(&m))) { > + err("Source pre-migration failed: %s, abort", strerror_(rc)); > + return rc; > + } > + > + debug("Saving backend state"); > + > + rc = migrate_source(fd, &m); > + if (rc) > + err("Source migration failed: %s", strerror_(rc)); > + else > + migrate_source_post(&m); > + > + return rc; After a successful source migration shouldn't we exit, or at least quiesce ourselves so we don't accidentally mess with anything the target is now doing? > +} > + > +/** > + * vu_migrate_target() - Migration as target, receive state from hypervisor > + * @fd: File descriptor for state transfer > + * > + * Return: 0 on success, positive error code on failure > + */ > +static int vu_migrate_target(int fd) > +{ > + struct migrate_meta m; > + int rc; > + > + rc = migrate_target_read_header(fd, &m); > + if (rc) { > + err("Migration header check failed: %s, abort", strerror_(rc)); > + return rc; > + } > + > + if ((rc = migrate_target_pre(&m))) { > + err("Target pre-migration failed: %s, abort", strerror_(rc)); > + return rc; > + } > + > + debug("Loading backend state"); > + > + rc = migrate_target(fd, &m); > + if (rc) > + err("Target migration failed: %s", strerror_(rc)); > + else > + migrate_target_post(&m); > + > + return rc; > +} > + > +/** > + * vu_migrate() - Send/receive passt internal state to/from QEMU > + * @c: Execution context > * @events: epoll events > */ > -void vu_migrate(struct vu_dev *vdev, uint32_t events) > +void vu_migrate(struct ctx *c, uint32_t events) > { > - int ret; > + struct vu_dev *vdev = c->vdev; > + int rc = EIO; > > - /* TODO: collect/set passt internal state > - * and use vdev->device_state_fd to send/receive it > - */ > debug("vu_migrate fd %d events %x", vdev->device_state_fd, events); > - if (events & EPOLLOUT) { > - debug("Saving backend state"); > - > - /* send some stuff */ > - ret = write(vdev->device_state_fd, "PASST", 6); > - /* value to be returned by VHOST_USER_CHECK_DEVICE_STATE */ > - vdev->device_state_result = ret == -1 ? -1 : 0; > - /* Closing the file descriptor signals the end of transfer */ > - epoll_ctl(vdev->context->epollfd, EPOLL_CTL_DEL, > - vdev->device_state_fd, NULL); > - close(vdev->device_state_fd); > - vdev->device_state_fd = -1; > - } else if (events & EPOLLIN) { > - char buf[6]; > - > - debug("Loading backend state"); > - /* read some stuff */ > - ret = read(vdev->device_state_fd, buf, sizeof(buf)); > - /* value to be returned by VHOST_USER_CHECK_DEVICE_STATE */ > - if (ret != sizeof(buf)) { > - vdev->device_state_result = -1; > - } else { > - ret = strncmp(buf, "PASST", sizeof(buf)); > - vdev->device_state_result = ret == 0 ? 0 : -1; > - } > - } else if (events & EPOLLHUP) { > - debug("Closing migration channel"); > - > - /* The end of file signals the end of the transfer. */ > - epoll_ctl(vdev->context->epollfd, EPOLL_CTL_DEL, > - vdev->device_state_fd, NULL); > - close(vdev->device_state_fd); > - vdev->device_state_fd = -1; > - } > + > + if (events & EPOLLOUT) > + rc = vu_migrate_source(vdev->device_state_fd); > + else if (events & EPOLLIN) > + rc = vu_migrate_target(vdev->device_state_fd); > + > + /* EPOLLHUP without EPOLLIN/EPOLLOUT, or EPOLLERR? Migration failed */ > + > + vdev->device_state_result = rc; > + > + epoll_ctl(c->epollfd, EPOLL_CTL_DEL, vdev->device_state_fd, NULL); > + debug("Closing migration channel"); > + close(vdev->device_state_fd); > + vdev->device_state_fd = -1; > } > diff --git a/vu_common.h b/vu_common.h > index d56c021..69c4006 100644 > --- a/vu_common.h > +++ b/vu_common.h > @@ -57,5 +57,5 @@ void vu_flush(const struct vu_dev *vdev, struct vu_virtq *vq, > void vu_kick_cb(struct vu_dev *vdev, union epoll_ref ref, > const struct timespec *now); > int vu_send_single(const struct ctx *c, const void *buf, size_t size); > -void vu_migrate(struct vu_dev *vdev, uint32_t events); > +void vu_migrate(struct ctx *c, uint32_t events); > #endif /* VU_COMMON_H */ -- David Gibson (he or they) | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you, not the other way | around. http://www.ozlabs.org/~dgibson