* [PATCH v3 01/20] tcp: Always pass NULL event with EPOLL_CTL_DEL
2025-01-31 19:39 [PATCH v3 00/20] Draft, incomplete series introducing state migration Stefano Brivio
@ 2025-01-31 19:39 ` Stefano Brivio
2025-01-31 19:39 ` [PATCH v3 02/20] util: Rename and make global vu_remove_watch() Stefano Brivio
` (18 subsequent siblings)
19 siblings, 0 replies; 21+ messages in thread
From: Stefano Brivio @ 2025-01-31 19:39 UTC (permalink / raw)
To: passt-dev; +Cc: Laurent Vivier, David Gibson
From: David Gibson <david@gibson.dropbear.id.au>
In tcp_epoll_ctl() we pass an event pointer with EPOLL_CTL_DEL, even though
it will be ignored. It's possible this was a workaround for pre-2.6.9
kernels which required a non-NULL pointer here, but we rely on the kernel
accepting NULL events for EPOLL_CTL_DEL in lots of other places. Use
NULL instead for simplicity and consistency.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
tcp.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/tcp.c b/tcp.c
index c89f323..4eed82b 100644
--- a/tcp.c
+++ b/tcp.c
@@ -468,9 +468,9 @@ static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
if (conn->events == CLOSED) {
if (conn->in_epoll)
- epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->sock, &ev);
+ epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->sock, NULL);
if (conn->timer != -1)
- epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->timer, &ev);
+ epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->timer, NULL);
return 0;
}
--
@@ -468,9 +468,9 @@ static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
if (conn->events == CLOSED) {
if (conn->in_epoll)
- epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->sock, &ev);
+ epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->sock, NULL);
if (conn->timer != -1)
- epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->timer, &ev);
+ epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->timer, NULL);
return 0;
}
--
2.43.0
^ permalink raw reply related [flat|nested] 21+ messages in thread
* [PATCH v3 02/20] util: Rename and make global vu_remove_watch()
2025-01-31 19:39 [PATCH v3 00/20] Draft, incomplete series introducing state migration Stefano Brivio
2025-01-31 19:39 ` [PATCH v3 01/20] tcp: Always pass NULL event with EPOLL_CTL_DEL Stefano Brivio
@ 2025-01-31 19:39 ` Stefano Brivio
2025-01-31 19:39 ` [PATCH v3 03/20] icmp, udp: Pad time_t timestamp to 64-bit to ease state migration Stefano Brivio
` (17 subsequent siblings)
19 siblings, 0 replies; 21+ messages in thread
From: Stefano Brivio @ 2025-01-31 19:39 UTC (permalink / raw)
To: passt-dev; +Cc: Laurent Vivier, David Gibson
From: David Gibson <david@gibson.dropbear.id.au>
vu_remove_watch() is used in vhost_user.c to remove an fd from the global
epoll set. There's nothing really vhost user specific about it though,
so rename, move to util.c and use it in a bunch of places outside
vhost_user.c where it makes things marginally more readable.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
icmp.c | 2 +-
tap.c | 2 +-
tcp.c | 4 ++--
tcp_splice.c | 4 ++--
udp_flow.c | 2 +-
util.c | 10 ++++++++++
util.h | 1 +
vhost_user.c | 21 +++++----------------
vu_common.c | 6 ++----
9 files changed, 25 insertions(+), 27 deletions(-)
diff --git a/icmp.c b/icmp.c
index 143e93b..bcf498d 100644
--- a/icmp.c
+++ b/icmp.c
@@ -150,7 +150,7 @@ unexpected:
static void icmp_ping_close(const struct ctx *c,
const struct icmp_ping_flow *pingf)
{
- epoll_ctl(c->epollfd, EPOLL_CTL_DEL, pingf->sock, NULL);
+ epoll_del(c, pingf->sock);
close(pingf->sock);
flow_hash_remove(c, FLOW_SIDX(pingf, INISIDE));
}
diff --git a/tap.c b/tap.c
index cd32a90..772648f 100644
--- a/tap.c
+++ b/tap.c
@@ -1005,7 +1005,7 @@ void tap_sock_reset(struct ctx *c)
exit(EXIT_SUCCESS);
/* Close the connected socket, wait for a new connection */
- epoll_ctl(c->epollfd, EPOLL_CTL_DEL, c->fd_tap, NULL);
+ epoll_del(c, c->fd_tap);
close(c->fd_tap);
c->fd_tap = -1;
if (c->mode == MODE_VU)
diff --git a/tcp.c b/tcp.c
index 4eed82b..7787381 100644
--- a/tcp.c
+++ b/tcp.c
@@ -468,9 +468,9 @@ static int tcp_epoll_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
if (conn->events == CLOSED) {
if (conn->in_epoll)
- epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->sock, NULL);
+ epoll_del(c, conn->sock);
if (conn->timer != -1)
- epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->timer, NULL);
+ epoll_del(c, conn->timer);
return 0;
}
diff --git a/tcp_splice.c b/tcp_splice.c
index 3a000ff..5db1d62 100644
--- a/tcp_splice.c
+++ b/tcp_splice.c
@@ -200,8 +200,8 @@ static void conn_flag_do(const struct ctx *c, struct tcp_splice_conn *conn,
}
if (flag == CLOSING) {
- epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->s[0], NULL);
- epoll_ctl(c->epollfd, EPOLL_CTL_DEL, conn->s[1], NULL);
+ epoll_del(c, conn->s[0]);
+ epoll_del(c, conn->s[1]);
}
}
diff --git a/udp_flow.c b/udp_flow.c
index 9fd7d06..7fae81d 100644
--- a/udp_flow.c
+++ b/udp_flow.c
@@ -52,7 +52,7 @@ void udp_flow_close(const struct ctx *c, struct udp_flow *uflow)
if (uflow->s[TGTSIDE] >= 0) {
/* But the flow specific one needs to be removed */
- epoll_ctl(c->epollfd, EPOLL_CTL_DEL, uflow->s[TGTSIDE], NULL);
+ epoll_del(c, uflow->s[TGTSIDE]);
close(uflow->s[TGTSIDE]);
uflow->s[TGTSIDE] = -1;
}
diff --git a/util.c b/util.c
index 11973c4..c7b09f0 100644
--- a/util.c
+++ b/util.c
@@ -837,3 +837,13 @@ void raw_random(void *buf, size_t buflen)
if (random_read < buflen)
die("Unexpected EOF on random data source");
}
+
+/**
+ * epoll_del() - Remove a file descriptor from our passt epoll
+ * @c: Execution context
+ * @fd: File descriptor to remove
+ */
+void epoll_del(const struct ctx *c, int fd)
+{
+ epoll_ctl(c->epollfd, EPOLL_CTL_DEL, fd, NULL);
+}
diff --git a/util.h b/util.h
index d02333d..800a28b 100644
--- a/util.h
+++ b/util.h
@@ -276,6 +276,7 @@ static inline bool mod_between(unsigned x, unsigned i, unsigned j, unsigned m)
#define FPRINTF(f, ...) (void)fprintf(f, __VA_ARGS__)
void raw_random(void *buf, size_t buflen);
+void epoll_del(const struct ctx *c, int fd);
/*
* Starting from glibc 2.40.9000 and commit 25a5eb4010df ("string: strerror,
diff --git a/vhost_user.c b/vhost_user.c
index 6bf0dda..bbbf504 100644
--- a/vhost_user.c
+++ b/vhost_user.c
@@ -162,17 +162,6 @@ static void vmsg_close_fds(const struct vhost_user_msg *vmsg)
close(vmsg->fds[i]);
}
-/**
- * vu_remove_watch() - Remove a file descriptor from our passt epoll
- * file descriptor
- * @vdev: vhost-user device
- * @fd: file descriptor to remove
- */
-static void vu_remove_watch(const struct vu_dev *vdev, int fd)
-{
- epoll_ctl(vdev->context->epollfd, EPOLL_CTL_DEL, fd, NULL);
-}
-
/**
* vmsg_set_reply_u64() - Set reply payload.u64 and clear request flags
* and fd_num
@@ -748,7 +737,7 @@ static bool vu_get_vring_base_exec(struct vu_dev *vdev,
vdev->vq[idx].call_fd = -1;
}
if (vdev->vq[idx].kick_fd != -1) {
- vu_remove_watch(vdev, vdev->vq[idx].kick_fd);
+ epoll_del(vdev->context, vdev->vq[idx].kick_fd);
close(vdev->vq[idx].kick_fd);
vdev->vq[idx].kick_fd = -1;
}
@@ -816,7 +805,7 @@ static bool vu_set_vring_kick_exec(struct vu_dev *vdev,
vu_check_queue_msg_file(msg);
if (vdev->vq[idx].kick_fd != -1) {
- vu_remove_watch(vdev, vdev->vq[idx].kick_fd);
+ epoll_del(vdev->context, vdev->vq[idx].kick_fd);
close(vdev->vq[idx].kick_fd);
vdev->vq[idx].kick_fd = -1;
}
@@ -1063,7 +1052,7 @@ static bool vu_set_device_state_fd_exec(struct vu_dev *vdev,
die("Invalide device_state_fd direction: %d", direction);
if (vdev->device_state_fd != -1) {
- vu_remove_watch(vdev, vdev->device_state_fd);
+ epoll_del(vdev->context, vdev->device_state_fd);
close(vdev->device_state_fd);
}
@@ -1145,7 +1134,7 @@ void vu_cleanup(struct vu_dev *vdev)
vq->err_fd = -1;
}
if (vq->kick_fd != -1) {
- vu_remove_watch(vdev, vq->kick_fd);
+ epoll_del(vdev->context, vq->kick_fd);
close(vq->kick_fd);
vq->kick_fd = -1;
}
@@ -1169,7 +1158,7 @@ void vu_cleanup(struct vu_dev *vdev)
vu_close_log(vdev);
if (vdev->device_state_fd != -1) {
- vu_remove_watch(vdev, vdev->device_state_fd);
+ epoll_del(vdev->context, vdev->device_state_fd);
close(vdev->device_state_fd);
vdev->device_state_fd = -1;
vdev->device_state_result = -1;
diff --git a/vu_common.c b/vu_common.c
index f43d8ac..2c12dca 100644
--- a/vu_common.c
+++ b/vu_common.c
@@ -325,8 +325,7 @@ void vu_migrate(struct vu_dev *vdev, uint32_t events)
/* value to be returned by VHOST_USER_CHECK_DEVICE_STATE */
vdev->device_state_result = ret == -1 ? -1 : 0;
/* Closing the file descriptor signals the end of transfer */
- epoll_ctl(vdev->context->epollfd, EPOLL_CTL_DEL,
- vdev->device_state_fd, NULL);
+ epoll_del(vdev->context, vdev->device_state_fd);
close(vdev->device_state_fd);
vdev->device_state_fd = -1;
} else if (events & EPOLLIN) {
@@ -346,8 +345,7 @@ void vu_migrate(struct vu_dev *vdev, uint32_t events)
debug("Closing migration channel");
/* The end of file signals the end of the transfer. */
- epoll_ctl(vdev->context->epollfd, EPOLL_CTL_DEL,
- vdev->device_state_fd, NULL);
+ epoll_del(vdev->context, vdev->device_state_fd);
close(vdev->device_state_fd);
vdev->device_state_fd = -1;
}
--
@@ -325,8 +325,7 @@ void vu_migrate(struct vu_dev *vdev, uint32_t events)
/* value to be returned by VHOST_USER_CHECK_DEVICE_STATE */
vdev->device_state_result = ret == -1 ? -1 : 0;
/* Closing the file descriptor signals the end of transfer */
- epoll_ctl(vdev->context->epollfd, EPOLL_CTL_DEL,
- vdev->device_state_fd, NULL);
+ epoll_del(vdev->context, vdev->device_state_fd);
close(vdev->device_state_fd);
vdev->device_state_fd = -1;
} else if (events & EPOLLIN) {
@@ -346,8 +345,7 @@ void vu_migrate(struct vu_dev *vdev, uint32_t events)
debug("Closing migration channel");
/* The end of file signals the end of the transfer. */
- epoll_ctl(vdev->context->epollfd, EPOLL_CTL_DEL,
- vdev->device_state_fd, NULL);
+ epoll_del(vdev->context, vdev->device_state_fd);
close(vdev->device_state_fd);
vdev->device_state_fd = -1;
}
--
2.43.0
^ permalink raw reply related [flat|nested] 21+ messages in thread
* [PATCH v3 03/20] icmp, udp: Pad time_t timestamp to 64-bit to ease state migration
2025-01-31 19:39 [PATCH v3 00/20] Draft, incomplete series introducing state migration Stefano Brivio
2025-01-31 19:39 ` [PATCH v3 01/20] tcp: Always pass NULL event with EPOLL_CTL_DEL Stefano Brivio
2025-01-31 19:39 ` [PATCH v3 02/20] util: Rename and make global vu_remove_watch() Stefano Brivio
@ 2025-01-31 19:39 ` Stefano Brivio
2025-01-31 19:39 ` [PATCH v3 04/20] flow, flow_table: Pad flow table entries to 128 bytes, hash entries to 32 bits Stefano Brivio
` (16 subsequent siblings)
19 siblings, 0 replies; 21+ messages in thread
From: Stefano Brivio @ 2025-01-31 19:39 UTC (permalink / raw)
To: passt-dev; +Cc: Laurent Vivier, David Gibson
That's the only field in flows with different storage sizes depending
on the architecture: it's usually 4-byte wide on 32-bit architectures,
except for arc and x32 where it's 8 bytes, and 8-byte wide on 64-bit
machines.
By keeping flow entries the same size across architectures, we avoid
having to expand or shrink table entries upon migration.
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
icmp_flow.h | 6 +++++-
udp_flow.h | 6 +++++-
2 files changed, 10 insertions(+), 2 deletions(-)
diff --git a/icmp_flow.h b/icmp_flow.h
index fb93801..da7e255 100644
--- a/icmp_flow.h
+++ b/icmp_flow.h
@@ -13,6 +13,7 @@
* @seq: Last sequence number sent to tap, host order, -1: not sent yet
* @sock: "ping" socket
* @ts: Last associated activity from tap, seconds
+ * @ts_storage: Pad @ts to 64-bit storage to keep state migration sane
*/
struct icmp_ping_flow {
/* Must be first element */
@@ -20,7 +21,10 @@ struct icmp_ping_flow {
int seq;
int sock;
- time_t ts;
+ union {
+ time_t ts;
+ uint64_t ts_storage;
+ };
};
bool icmp_ping_timer(const struct ctx *c, const struct icmp_ping_flow *pingf,
diff --git a/udp_flow.h b/udp_flow.h
index 9a1b059..9cb79a0 100644
--- a/udp_flow.h
+++ b/udp_flow.h
@@ -12,6 +12,7 @@
* @f: Generic flow information
* @closed: Flow is already closed
* @ts: Activity timestamp
+ * @ts_storage: Pad @ts to 64-bit storage to keep state migration sane
* @s: Socket fd (or -1) for each side of the flow
*/
struct udp_flow {
@@ -19,7 +20,10 @@ struct udp_flow {
struct flow_common f;
bool closed :1;
- time_t ts;
+ union {
+ time_t ts;
+ uint64_t ts_storage;
+ };
int s[SIDES];
};
--
@@ -12,6 +12,7 @@
* @f: Generic flow information
* @closed: Flow is already closed
* @ts: Activity timestamp
+ * @ts_storage: Pad @ts to 64-bit storage to keep state migration sane
* @s: Socket fd (or -1) for each side of the flow
*/
struct udp_flow {
@@ -19,7 +20,10 @@ struct udp_flow {
struct flow_common f;
bool closed :1;
- time_t ts;
+ union {
+ time_t ts;
+ uint64_t ts_storage;
+ };
int s[SIDES];
};
--
2.43.0
^ permalink raw reply related [flat|nested] 21+ messages in thread
* [PATCH v3 04/20] flow, flow_table: Pad flow table entries to 128 bytes, hash entries to 32 bits
2025-01-31 19:39 [PATCH v3 00/20] Draft, incomplete series introducing state migration Stefano Brivio
` (2 preceding siblings ...)
2025-01-31 19:39 ` [PATCH v3 03/20] icmp, udp: Pad time_t timestamp to 64-bit to ease state migration Stefano Brivio
@ 2025-01-31 19:39 ` Stefano Brivio
2025-01-31 19:39 ` [PATCH v3 05/20] flow_table: Use size in extern declaration for flowtab Stefano Brivio
` (15 subsequent siblings)
19 siblings, 0 replies; 21+ messages in thread
From: Stefano Brivio @ 2025-01-31 19:39 UTC (permalink / raw)
To: passt-dev; +Cc: Laurent Vivier, David Gibson
...to keep migration sane. Right now, the biggest struct in union flow
is struct tcp_splice_conn with 120 bytes on x86_64, which should also
have the biggest storage and alignment requirements of any
architecture we might run on.
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
flow.h | 18 ++++++++++++------
flow_table.h | 13 ++++++++++---
2 files changed, 22 insertions(+), 9 deletions(-)
diff --git a/flow.h b/flow.h
index 24ba3ef..8eb5964 100644
--- a/flow.h
+++ b/flow.h
@@ -202,15 +202,21 @@ struct flow_common {
/**
* struct flow_sidx - ID for one side of a specific flow
- * @sidei: Index of side referenced (0 or 1)
- * @flowi: Index of flow referenced
+ * @sidei: Index of side referenced (0 or 1)
+ * @flowi: Index of flow referenced
+ * @flow_sidx_storage: Pad to 32 bits
*/
typedef struct flow_sidx {
- unsigned sidei :1;
- unsigned flowi :FLOW_INDEX_BITS;
+ union {
+ struct {
+ unsigned sidei :1;
+ unsigned flowi :FLOW_INDEX_BITS;
+ };
+ uint32_t flow_sidx_storage;
+ };
} flow_sidx_t;
-static_assert(sizeof(flow_sidx_t) <= sizeof(uint32_t),
- "flow_sidx_t must fit within 32 bits");
+static_assert(sizeof(flow_sidx_t) == sizeof(uint32_t),
+ "flow_sidx_t must be 32-bit wide");
#define FLOW_SIDX_NONE ((flow_sidx_t){ .flowi = FLOW_MAX })
diff --git a/flow_table.h b/flow_table.h
index f15db53..007f4dd 100644
--- a/flow_table.h
+++ b/flow_table.h
@@ -26,9 +26,13 @@ struct flow_free_cluster {
/**
* union flow - Descriptor for a logical packet flow (e.g. connection)
- * @f: Fields common between all variants
- * @tcp: Fields for non-spliced TCP connections
- * @tcp_splice: Fields for spliced TCP connections
+ * @f: Fields common between all variants
+ * @free: Entry in a cluster of free entries
+ * @tcp: Fields for non-spliced TCP connections
+ * @tcp_splice: Fields for spliced TCP connections
+ * @ping: Tracking for ping flows
+ * @udp: Tracking for UDP flows
+ * @flow_storage: Pad flow entries to 128 bytes to ease state migration
*/
union flow {
struct flow_common f;
@@ -37,8 +41,11 @@ union flow {
struct tcp_splice_conn tcp_splice;
struct icmp_ping_flow ping;
struct udp_flow udp;
+ char flow_storage[128];
};
+static_assert(sizeof(union flow) == 128, "union flow should be 128-byte wide");
+
/* Global Flow Table */
extern unsigned flow_first_free;
extern union flow flowtab[];
--
@@ -26,9 +26,13 @@ struct flow_free_cluster {
/**
* union flow - Descriptor for a logical packet flow (e.g. connection)
- * @f: Fields common between all variants
- * @tcp: Fields for non-spliced TCP connections
- * @tcp_splice: Fields for spliced TCP connections
+ * @f: Fields common between all variants
+ * @free: Entry in a cluster of free entries
+ * @tcp: Fields for non-spliced TCP connections
+ * @tcp_splice: Fields for spliced TCP connections
+ * @ping: Tracking for ping flows
+ * @udp: Tracking for UDP flows
+ * @flow_storage: Pad flow entries to 128 bytes to ease state migration
*/
union flow {
struct flow_common f;
@@ -37,8 +41,11 @@ union flow {
struct tcp_splice_conn tcp_splice;
struct icmp_ping_flow ping;
struct udp_flow udp;
+ char flow_storage[128];
};
+static_assert(sizeof(union flow) == 128, "union flow should be 128-byte wide");
+
/* Global Flow Table */
extern unsigned flow_first_free;
extern union flow flowtab[];
--
2.43.0
^ permalink raw reply related [flat|nested] 21+ messages in thread
* [PATCH v3 05/20] flow_table: Use size in extern declaration for flowtab
2025-01-31 19:39 [PATCH v3 00/20] Draft, incomplete series introducing state migration Stefano Brivio
` (3 preceding siblings ...)
2025-01-31 19:39 ` [PATCH v3 04/20] flow, flow_table: Pad flow table entries to 128 bytes, hash entries to 32 bits Stefano Brivio
@ 2025-01-31 19:39 ` Stefano Brivio
2025-01-31 19:39 ` [PATCH v3 06/20] util: Add read_remainder() and read_all_buf() Stefano Brivio
` (14 subsequent siblings)
19 siblings, 0 replies; 21+ messages in thread
From: Stefano Brivio @ 2025-01-31 19:39 UTC (permalink / raw)
To: passt-dev; +Cc: Laurent Vivier, David Gibson
...so that we can use sizeof() on it.
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
flow_table.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/flow_table.h b/flow_table.h
index 007f4dd..a85cab5 100644
--- a/flow_table.h
+++ b/flow_table.h
@@ -48,7 +48,7 @@ static_assert(sizeof(union flow) == 128, "union flow should be 128-byte wide");
/* Global Flow Table */
extern unsigned flow_first_free;
-extern union flow flowtab[];
+extern union flow flowtab[FLOW_MAX];
/**
* flow_foreach_sidei() - 'for' type macro to step through each side of flow
--
@@ -48,7 +48,7 @@ static_assert(sizeof(union flow) == 128, "union flow should be 128-byte wide");
/* Global Flow Table */
extern unsigned flow_first_free;
-extern union flow flowtab[];
+extern union flow flowtab[FLOW_MAX];
/**
* flow_foreach_sidei() - 'for' type macro to step through each side of flow
--
2.43.0
^ permalink raw reply related [flat|nested] 21+ messages in thread
* [PATCH v3 06/20] util: Add read_remainder() and read_all_buf()
2025-01-31 19:39 [PATCH v3 00/20] Draft, incomplete series introducing state migration Stefano Brivio
` (4 preceding siblings ...)
2025-01-31 19:39 ` [PATCH v3 05/20] flow_table: Use size in extern declaration for flowtab Stefano Brivio
@ 2025-01-31 19:39 ` Stefano Brivio
2025-01-31 19:39 ` [PATCH v3 07/20] Introduce facilities for guest migration on top of vhost-user infrastructure Stefano Brivio
` (13 subsequent siblings)
19 siblings, 0 replies; 21+ messages in thread
From: Stefano Brivio @ 2025-01-31 19:39 UTC (permalink / raw)
To: passt-dev; +Cc: Laurent Vivier, David Gibson
These are symmetric to write_remainder() and write_all_buf() and
almost a copy and paste of them, with the most notable differences
being reversed reads/writes and a couple of better-safe-than-sorry
asserts to keep Coverity happy.
I'll use them in the next patch. At least for the moment, they're
going to be used for vhost-user mode only, so I'm not unconditionally
enabling readv() in the seccomp profile: the caller has to ensure it's
there.
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
util.c | 83 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
util.h | 2 ++
2 files changed, 85 insertions(+)
diff --git a/util.c b/util.c
index c7b09f0..f2eef93 100644
--- a/util.c
+++ b/util.c
@@ -606,6 +606,89 @@ int write_remainder(int fd, const struct iovec *iov, size_t iovcnt, size_t skip)
return 0;
}
+/**
+ * read_all_buf() - Fill a whole buffer from a file descriptor
+ * @fd: File descriptor
+ * @buf: Pointer to base of buffer
+ * @len: Length of buffer
+ *
+ * Return: 0 on success, -1 on error (with errno set)
+ *
+ * #syscalls read
+ */
+int read_all_buf(int fd, void *buf, size_t len)
+{
+ size_t left = len;
+ char *p = buf;
+
+ while (left) {
+ ssize_t rc;
+
+ ASSERT(left <= len);
+
+ do
+ rc = read(fd, p, left);
+ while ((rc < 0) && errno == EINTR);
+
+ if (rc < 0)
+ return -1;
+
+ if (rc == 0) {
+ errno = ENODATA;
+ return -1;
+ }
+
+ p += rc;
+ left -= rc;
+ }
+ return 0;
+}
+
+/**
+ * read_remainder() - Read the tail of an IO vector from a file descriptor
+ * @fd: File descriptor
+ * @iov: IO vector
+ * @cnt: Number of entries in @iov
+ * @skip: Number of bytes of the vector to skip reading
+ *
+ * Return: 0 on success, -1 on error (with errno set)
+ *
+ * Note: mode-specific seccomp profiles need to enable readv() to use this.
+ */
+int read_remainder(int fd, struct iovec *iov, size_t cnt, size_t skip)
+{
+ size_t i = 0, offset;
+
+ while ((i += iov_skip_bytes(iov + i, cnt - i, skip, &offset)) < cnt) {
+ ssize_t rc;
+
+ if (offset) {
+ ASSERT(offset < iov[i].iov_len);
+ /* Read the remainder of the partially read buffer */
+ if (read_all_buf(fd, (char *)iov[i].iov_base + offset,
+ iov[i].iov_len - offset) < 0)
+ return -1;
+ i++;
+ }
+
+ if (cnt == i)
+ break;
+
+ /* Fill as many of the remaining buffers as we can */
+ rc = readv(fd, &iov[i], cnt - i);
+ if (rc < 0)
+ return -1;
+
+ if (rc == 0) {
+ errno = ENODATA;
+ return -1;
+ }
+
+ skip = rc;
+ }
+ return 0;
+}
+
/** sockaddr_ntop() - Convert a socket address to text format
* @sa: Socket address
* @dst: output buffer, minimum SOCKADDR_STRLEN bytes
diff --git a/util.h b/util.h
index 800a28b..6ae8588 100644
--- a/util.h
+++ b/util.h
@@ -203,6 +203,8 @@ int fls(unsigned long x);
int write_file(const char *path, const char *buf);
int write_all_buf(int fd, const void *buf, size_t len);
int write_remainder(int fd, const struct iovec *iov, size_t iovcnt, size_t skip);
+int read_all_buf(int fd, void *buf, size_t len);
+int read_remainder(int fd, struct iovec *iov, size_t cnt, size_t skip);
void close_open_files(int argc, char **argv);
bool snprintf_check(char *str, size_t size, const char *format, ...);
--
@@ -203,6 +203,8 @@ int fls(unsigned long x);
int write_file(const char *path, const char *buf);
int write_all_buf(int fd, const void *buf, size_t len);
int write_remainder(int fd, const struct iovec *iov, size_t iovcnt, size_t skip);
+int read_all_buf(int fd, void *buf, size_t len);
+int read_remainder(int fd, struct iovec *iov, size_t cnt, size_t skip);
void close_open_files(int argc, char **argv);
bool snprintf_check(char *str, size_t size, const char *format, ...);
--
2.43.0
^ permalink raw reply related [flat|nested] 21+ messages in thread
* [PATCH v3 07/20] Introduce facilities for guest migration on top of vhost-user infrastructure
2025-01-31 19:39 [PATCH v3 00/20] Draft, incomplete series introducing state migration Stefano Brivio
` (5 preceding siblings ...)
2025-01-31 19:39 ` [PATCH v3 06/20] util: Add read_remainder() and read_all_buf() Stefano Brivio
@ 2025-01-31 19:39 ` Stefano Brivio
2025-01-31 19:39 ` [PATCH v3 08/20] Introduce passt-repair Stefano Brivio
` (12 subsequent siblings)
19 siblings, 0 replies; 21+ messages in thread
From: Stefano Brivio @ 2025-01-31 19:39 UTC (permalink / raw)
To: passt-dev; +Cc: Laurent Vivier, David Gibson
Add two sets (source or target) of three functions each for passt in
vhost-user mode, triggered by activity on the file descriptor passed
via VHOST_USER_PROTOCOL_F_DEVICE_STATE:
- migrate_source_pre() and migrate_target_pre() are called to prepare
for migration, before data is transferred
- migrate_source() sends, and migrate_target() receives migration data
- migrate_source_post() and migrate_target_post() are responsible for
any post-migration task
Callbacks are added to these functions with arrays of function
pointers in migrate.c. Migration handlers are versioned.
Versioned descriptions of data sections will be added to the
data_versions array, which points to versioned iovec arrays. Version
1 is currently empty and will be filled in in subsequent patches.
The source announces the data version to be used and informs the peer
about endianness, and the size of void *, time_t, flow entries and
flow hash table entries.
The target checks if the version of the source is still supported. If
it's not, it aborts the migration.
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
Makefile | 12 +--
migrate.c | 270 ++++++++++++++++++++++++++++++++++++++++++++++++++++
migrate.h | 88 +++++++++++++++++
passt.c | 2 +-
vu_common.c | 120 +++++++++++++++--------
vu_common.h | 2 +-
6 files changed, 448 insertions(+), 46 deletions(-)
create mode 100644 migrate.c
create mode 100644 migrate.h
diff --git a/Makefile b/Makefile
index 464eef1..1383875 100644
--- a/Makefile
+++ b/Makefile
@@ -38,8 +38,8 @@ FLAGS += -DDUAL_STACK_SOCKETS=$(DUAL_STACK_SOCKETS)
PASST_SRCS = arch.c arp.c checksum.c conf.c dhcp.c dhcpv6.c flow.c fwd.c \
icmp.c igmp.c inany.c iov.c ip.c isolation.c lineread.c log.c mld.c \
- ndp.c netlink.c packet.c passt.c pasta.c pcap.c pif.c tap.c tcp.c \
- tcp_buf.c tcp_splice.c tcp_vu.c udp.c udp_flow.c udp_vu.c util.c \
+ ndp.c netlink.c migrate.c packet.c passt.c pasta.c pcap.c pif.c tap.c \
+ tcp.c tcp_buf.c tcp_splice.c tcp_vu.c udp.c udp_flow.c udp_vu.c util.c \
vhost_user.c virtio.c vu_common.c
QRAP_SRCS = qrap.c
SRCS = $(PASST_SRCS) $(QRAP_SRCS)
@@ -48,10 +48,10 @@ MANPAGES = passt.1 pasta.1 qrap.1
PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h flow.h fwd.h \
flow_table.h icmp.h icmp_flow.h inany.h iov.h ip.h isolation.h \
- lineread.h log.h ndp.h netlink.h packet.h passt.h pasta.h pcap.h pif.h \
- siphash.h tap.h tcp.h tcp_buf.h tcp_conn.h tcp_internal.h tcp_splice.h \
- tcp_vu.h udp.h udp_flow.h udp_internal.h udp_vu.h util.h vhost_user.h \
- virtio.h vu_common.h
+ lineread.h log.h migrate.h ndp.h netlink.h packet.h passt.h pasta.h \
+ pcap.h pif.h siphash.h tap.h tcp.h tcp_buf.h tcp_conn.h tcp_internal.h \
+ tcp_splice.h tcp_vu.h udp.h udp_flow.h udp_internal.h udp_vu.h util.h \
+ vhost_user.h virtio.h vu_common.h
HEADERS = $(PASST_HEADERS) seccomp.h
C := \#include <sys/random.h>\nint main(){int a=getrandom(0, 0, 0);}
diff --git a/migrate.c b/migrate.c
new file mode 100644
index 0000000..9ddac8f
--- /dev/null
+++ b/migrate.c
@@ -0,0 +1,270 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+/* PASST - Plug A Simple Socket Transport
+ * for qemu/UNIX domain socket mode
+ *
+ * PASTA - Pack A Subtle Tap Abstraction
+ * for network namespace/tap device mode
+ *
+ * migrate.c - Migration sections, layout, and routines
+ *
+ * Copyright (c) 2025 Red Hat GmbH
+ * Author: Stefano Brivio <sbrivio@redhat.com>
+ */
+
+#include <errno.h>
+#include <sys/uio.h>
+
+#include "util.h"
+#include "ip.h"
+#include "passt.h"
+#include "inany.h"
+#include "flow.h"
+#include "flow_table.h"
+
+#include "migrate.h"
+
+/* Current version of migration data */
+#define MIGRATE_VERSION 1
+
+/* Magic as we see it and as seen with reverse endianness */
+#define MIGRATE_MAGIC 0xB1BB1D1B0BB1D1B0
+#define MIGRATE_MAGIC_SWAPPED 0xB0D1B1B01B1DBBB1
+
+/* Migration header to send from source */
+static union migrate_header header = {
+ /* Immutable part of header structure: keep these two sections at the
+ * beginning, because they are enough to identify a version regardless
+ * of metadata.
+ */
+ .magic = MIGRATE_MAGIC,
+ .version = htonl_constant(MIGRATE_VERSION),
+ /* End of immutable part of header structure */
+
+ .time_t_size = htonl_constant(sizeof(time_t)),
+ .flow_size = htonl_constant(sizeof(union flow)),
+ .flow_sidx_size = htonl_constant(sizeof(struct flow_sidx)),
+ .voidp_size = htonl_constant(sizeof(void *)),
+};
+
+/* Data sections for version 1 */
+static struct iovec sections_v1[] = {
+ { &header, sizeof(header) },
+};
+
+/* Set of data versions */
+static struct migrate_data data_versions[] = {
+ {
+ 1, sections_v1,
+ },
+ { 0 },
+};
+
+/* Handlers to call in source before sending data */
+struct migrate_handler handlers_source_pre[] = {
+ { 0 },
+};
+
+/* Handlers to call in source after sending data */
+struct migrate_handler handlers_source_post[] = {
+ { 0 },
+};
+
+/* Handlers to call in target before receiving data with version 1 */
+struct migrate_handler handlers_target_pre_v1[] = {
+ { 0 },
+};
+
+/* Handlers to call in target after receiving data with version 1 */
+struct migrate_handler handlers_target_post_v1[] = {
+ { 0 },
+};
+
+/* Versioned sets of migration handlers */
+struct migrate_target_handlers target_handlers[] = {
+ {
+ 1,
+ handlers_target_pre_v1,
+ handlers_target_post_v1,
+ },
+ { 0 },
+};
+
+/**
+ * migrate_source_pre() - Pre-migration tasks as source
+ * @c: Execution context
+ * @m: Migration metadata
+ *
+ * Return: 0 on success, error code on failure
+ */
+int migrate_source_pre(struct ctx *c, struct migrate_meta *m)
+{
+ struct migrate_handler *h;
+
+ for (h = handlers_source_pre; h->fn; h++) {
+ int rc;
+
+ if ((rc = h->fn(c, m)))
+ return rc;
+ }
+
+ return 0;
+}
+
+/**
+ * migrate_source() - Perform migration as source: send state to hypervisor
+ * @fd: Descriptor for state transfer
+ * @m: Migration metadata
+ *
+ * Return: 0 on success, error code on failure
+ */
+int migrate_source(int fd, const struct migrate_meta *m)
+{
+ static struct migrate_data *d;
+ int count, rc;
+
+ (void)m;
+
+ for (d = data_versions; d->v != MIGRATE_VERSION; d++);
+
+ for (count = 0; d->sections[count].iov_len; count++);
+
+ debug("Writing %u migration sections", count - 1 /* minus header */);
+ rc = write_remainder(fd, d->sections, count, 0);
+ if (rc < 0)
+ return errno;
+
+ return 0;
+}
+
+/**
+ * migrate_source_post() - Post-migration tasks as source
+ * @c: Execution context
+ * @m: Migration metadata
+ *
+ * Return: 0 on success, error code on failure
+ */
+void migrate_source_post(struct ctx *c, struct migrate_meta *m)
+{
+ struct migrate_handler *h;
+
+ for (h = handlers_source_post; h->fn; h++)
+ h->fn(c, m);
+}
+
+/**
+ * migrate_target_read_header() - Set metadata in target from source header
+ * @fd: Descriptor for state transfer
+ * @m: Migration metadata, filled on return
+ *
+ * Return: 0 on success, error code on failure
+ */
+int migrate_target_read_header(int fd, struct migrate_meta *m)
+{
+ static struct migrate_data *d;
+ union migrate_header h;
+
+ if (read_all_buf(fd, &h, sizeof(h)))
+ return errno;
+
+ debug("Source magic: 0x%016" PRIx64 ", sizeof(void *): %u, version: %u",
+ h.magic, ntohl(h.voidp_size), ntohl(h.version));
+
+ for (d = data_versions; d->v != ntohl(h.version) && d->v; d++);
+ if (!d->v)
+ return ENOTSUP;
+ m->v = d->v;
+
+ if (h.magic == MIGRATE_MAGIC)
+ m->bswap = false;
+ else if (h.magic == MIGRATE_MAGIC_SWAPPED)
+ m->bswap = true;
+ else
+ return ENOTSUP;
+
+ if (ntohl(h.voidp_size) == 4)
+ m->source_64b = false;
+ else if (ntohl(h.voidp_size) == 8)
+ m->source_64b = true;
+ else
+ return ENOTSUP;
+
+ if (ntohl(h.time_t_size) == 4)
+ m->time_64b = false;
+ else if (ntohl(h.time_t_size) == 8)
+ m->time_64b = true;
+ else
+ return ENOTSUP;
+
+ m->flow_size = ntohl(h.flow_size);
+ m->flow_sidx_size = ntohl(h.flow_sidx_size);
+
+ return 0;
+}
+
+/**
+ * migrate_target_pre() - Pre-migration tasks as target
+ * @c: Execution context
+ * @m: Migration metadata
+ *
+ * Return: 0 on success, error code on failure
+ */
+int migrate_target_pre(struct ctx *c, struct migrate_meta *m)
+{
+ struct migrate_target_handlers *th;
+ struct migrate_handler *h;
+
+ for (th = target_handlers; th->v != m->v && th->v; th++);
+
+ for (h = th->pre; h->fn; h++) {
+ int rc;
+
+ if ((rc = h->fn(c, m)))
+ return rc;
+ }
+
+ return 0;
+}
+
+/**
+ * migrate_target() - Perform migration as target: receive state from hypervisor
+ * @fd: Descriptor for state transfer
+ * @m: Migration metadata
+ *
+ * Return: 0 on success, error code on failure
+ *
+ * #syscalls:vu readv
+ */
+int migrate_target(int fd, const struct migrate_meta *m)
+{
+ static struct migrate_data *d;
+ unsigned cnt;
+ int rc;
+
+ for (d = data_versions; d->v != m->v && d->v; d++);
+
+ for (cnt = 0; d->sections[cnt + 1 /* skip header */].iov_len; cnt++);
+
+ debug("Reading %u migration sections", cnt);
+ rc = read_remainder(fd, d->sections + 1, cnt, 0);
+ if (rc < 0)
+ return errno;
+
+ return 0;
+}
+
+/**
+ * migrate_target_post() - Post-migration tasks as target
+ * @c: Execution context
+ * @m: Migration metadata
+ */
+void migrate_target_post(struct ctx *c, struct migrate_meta *m)
+{
+ struct migrate_target_handlers *th;
+ struct migrate_handler *h;
+
+ for (th = target_handlers; th->v != m->v && th->v; th++);
+
+ for (h = th->post; h->fn; h++)
+ h->fn(c, m);
+}
diff --git a/migrate.h b/migrate.h
new file mode 100644
index 0000000..9a68f17
--- /dev/null
+++ b/migrate.h
@@ -0,0 +1,88 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later
+ * Copyright (c) 2025 Red Hat GmbH
+ * Author: Stefano Brivio <sbrivio@redhat.com>
+ */
+
+#ifndef MIGRATE_H
+#define MIGRATE_H
+
+/**
+ * struct migrate_meta - Migration metadata
+ * @v: Chosen migration data version, host order
+ * @bswap: Source has opposite endianness
+ * @peer_64b: Source uses 64-bit void *
+ * @time_64b: Source uses 64-bit time_t
+ * @flow_size: Size of union flow in source
+ * @flow_sidx_size: Size of struct flow_sidx in source
+ */
+struct migrate_meta {
+ uint32_t v;
+ bool bswap;
+ bool source_64b;
+ bool time_64b;
+ size_t flow_size;
+ size_t flow_sidx_size;
+};
+
+/**
+ * union migrate_header - Migration header from source
+ * @magic: 0xB1BB1D1B0BB1D1B0, host order
+ * @version: Source sends highest known, target aborts if unsupported
+ * @voidp_size: sizeof(void *), network order
+ * @time_t_size: sizeof(time_t), network order
+ * @flow_size: sizeof(union flow), network order
+ * @flow_sidx_size: sizeof(struct flow_sidx_t), network order
+ * @unused: Go figure
+ */
+union migrate_header {
+ struct {
+ uint64_t magic;
+ uint32_t version;
+ uint32_t voidp_size;
+ uint32_t time_t_size;
+ uint32_t flow_size;
+ uint32_t flow_sidx_size;
+ };
+ uint8_t unused[4096];
+} __attribute__((packed));
+
+/**
+ * struct migrate_data - Data sections for given source version
+ * @v: Source version this applies to, host order
+ * @sections: Array of data sections, NULL-terminated
+ */
+struct migrate_data {
+ uint32_t v;
+ struct iovec *sections;
+};
+
+/**
+ * struct migrate_handler - Function to handle a specific data section
+ * @fn: Function pointer taking pointer to context and metadata
+ */
+struct migrate_handler {
+ int (*fn)(struct ctx *c, struct migrate_meta *m);
+};
+
+/**
+ * struct migrate_target_handlers - Versioned sets of migration target handlers
+ * @v: Source version this applies to, host order
+ * @pre: Set of functions to execute in target before data copy
+ * @post: Set of functions to execute in target after data copy
+ */
+struct migrate_target_handlers {
+ uint32_t v;
+ struct migrate_handler *pre;
+ struct migrate_handler *post;
+};
+
+int migrate_source_pre(struct ctx *c, struct migrate_meta *m);
+int migrate_source(int fd, const struct migrate_meta *m);
+void migrate_source_post(struct ctx *c, struct migrate_meta *m);
+
+int migrate_target_read_header(int fd, struct migrate_meta *m);
+int migrate_target_pre(struct ctx *c, struct migrate_meta *m);
+int migrate_target(int fd, const struct migrate_meta *m);
+void migrate_target_post(struct ctx *c, struct migrate_meta *m);
+
+#endif /* MIGRATE_H */
diff --git a/passt.c b/passt.c
index b1c8ab6..184d4e5 100644
--- a/passt.c
+++ b/passt.c
@@ -358,7 +358,7 @@ loop:
vu_kick_cb(c.vdev, ref, &now);
break;
case EPOLL_TYPE_VHOST_MIGRATION:
- vu_migrate(c.vdev, eventmask);
+ vu_migrate(&c, eventmask);
break;
default:
/* Can't happen */
diff --git a/vu_common.c b/vu_common.c
index 2c12dca..6c346c8 100644
--- a/vu_common.c
+++ b/vu_common.c
@@ -5,6 +5,7 @@
* common_vu.c - vhost-user common UDP and TCP functions
*/
+#include <errno.h>
#include <unistd.h>
#include <sys/uio.h>
#include <sys/eventfd.h>
@@ -17,6 +18,7 @@
#include "vhost_user.h"
#include "pcap.h"
#include "vu_common.h"
+#include "migrate.h"
#define VU_MAX_TX_BUFFER_NB 2
@@ -305,48 +307,90 @@ err:
}
/**
- * vu_migrate() - Send/receive passt insternal state to/from QEMU
- * @vdev: vhost-user device
+ * vu_migrate_source() - Migration as source, send state to hypervisor
+ * @c: Execution context
+ * @fd: File descriptor for state transfer
+ *
+ * Return: 0 on success, positive error code on failure
+ */
+static int vu_migrate_source(struct ctx *c, int fd)
+{
+ struct migrate_meta m;
+ int rc;
+
+ if ((rc = migrate_source_pre(c, &m))) {
+ err("Source pre-migration failed: %s, abort", strerror_(rc));
+ return rc;
+ }
+
+ debug("Saving backend state");
+
+ rc = migrate_source(fd, &m);
+ if (rc)
+ err("Source migration failed: %s", strerror_(rc));
+ else
+ migrate_source_post(c, &m);
+
+ return rc;
+}
+
+/**
+ * vu_migrate_target() - Migration as target, receive state from hypervisor
+ * @c: Execution context
+ * @fd: File descriptor for state transfer
+ *
+ * Return: 0 on success, positive error code on failure
+ */
+static int vu_migrate_target(struct ctx *c, int fd)
+{
+ struct migrate_meta m;
+ int rc;
+
+ rc = migrate_target_read_header(fd, &m);
+ if (rc) {
+ err("Migration header check failed: %s, abort", strerror_(rc));
+ return rc;
+ }
+
+ if ((rc = migrate_target_pre(c, &m))) {
+ err("Target pre-migration failed: %s, abort", strerror_(rc));
+ return rc;
+ }
+
+ debug("Loading backend state");
+
+ rc = migrate_target(fd, &m);
+ if (rc)
+ err("Target migration failed: %s", strerror_(rc));
+ else
+ migrate_target_post(c, &m);
+
+ return rc;
+}
+
+/**
+ * vu_migrate() - Send/receive passt internal state to/from QEMU
+ * @c: Execution context
* @events: epoll events
*/
-void vu_migrate(struct vu_dev *vdev, uint32_t events)
+void vu_migrate(struct ctx *c, uint32_t events)
{
- int ret;
+ struct vu_dev *vdev = c->vdev;
+ int rc = EIO;
- /* TODO: collect/set passt internal state
- * and use vdev->device_state_fd to send/receive it
- */
debug("vu_migrate fd %d events %x", vdev->device_state_fd, events);
- if (events & EPOLLOUT) {
- debug("Saving backend state");
-
- /* send some stuff */
- ret = write(vdev->device_state_fd, "PASST", 6);
- /* value to be returned by VHOST_USER_CHECK_DEVICE_STATE */
- vdev->device_state_result = ret == -1 ? -1 : 0;
- /* Closing the file descriptor signals the end of transfer */
- epoll_del(vdev->context, vdev->device_state_fd);
- close(vdev->device_state_fd);
- vdev->device_state_fd = -1;
- } else if (events & EPOLLIN) {
- char buf[6];
-
- debug("Loading backend state");
- /* read some stuff */
- ret = read(vdev->device_state_fd, buf, sizeof(buf));
- /* value to be returned by VHOST_USER_CHECK_DEVICE_STATE */
- if (ret != sizeof(buf)) {
- vdev->device_state_result = -1;
- } else {
- ret = strncmp(buf, "PASST", sizeof(buf));
- vdev->device_state_result = ret == 0 ? 0 : -1;
- }
- } else if (events & EPOLLHUP) {
- debug("Closing migration channel");
- /* The end of file signals the end of the transfer. */
- epoll_del(vdev->context, vdev->device_state_fd);
- close(vdev->device_state_fd);
- vdev->device_state_fd = -1;
- }
+ if (events & EPOLLOUT)
+ rc = vu_migrate_source(c, vdev->device_state_fd);
+ else if (events & EPOLLIN)
+ rc = vu_migrate_target(c, vdev->device_state_fd);
+
+ /* EPOLLHUP without EPOLLIN/EPOLLOUT, or EPOLLERR? Migration failed */
+
+ vdev->device_state_result = rc;
+
+ epoll_ctl(c->epollfd, EPOLL_CTL_DEL, vdev->device_state_fd, NULL);
+ debug("Closing migration channel");
+ close(vdev->device_state_fd);
+ vdev->device_state_fd = -1;
}
diff --git a/vu_common.h b/vu_common.h
index d56c021..69c4006 100644
--- a/vu_common.h
+++ b/vu_common.h
@@ -57,5 +57,5 @@ void vu_flush(const struct vu_dev *vdev, struct vu_virtq *vq,
void vu_kick_cb(struct vu_dev *vdev, union epoll_ref ref,
const struct timespec *now);
int vu_send_single(const struct ctx *c, const void *buf, size_t size);
-void vu_migrate(struct vu_dev *vdev, uint32_t events);
+void vu_migrate(struct ctx *c, uint32_t events);
#endif /* VU_COMMON_H */
--
@@ -57,5 +57,5 @@ void vu_flush(const struct vu_dev *vdev, struct vu_virtq *vq,
void vu_kick_cb(struct vu_dev *vdev, union epoll_ref ref,
const struct timespec *now);
int vu_send_single(const struct ctx *c, const void *buf, size_t size);
-void vu_migrate(struct vu_dev *vdev, uint32_t events);
+void vu_migrate(struct ctx *c, uint32_t events);
#endif /* VU_COMMON_H */
--
2.43.0
^ permalink raw reply related [flat|nested] 21+ messages in thread
* [PATCH v3 08/20] Introduce passt-repair
2025-01-31 19:39 [PATCH v3 00/20] Draft, incomplete series introducing state migration Stefano Brivio
` (6 preceding siblings ...)
2025-01-31 19:39 ` [PATCH v3 07/20] Introduce facilities for guest migration on top of vhost-user infrastructure Stefano Brivio
@ 2025-01-31 19:39 ` Stefano Brivio
2025-01-31 19:39 ` [PATCH v3 09/20] Add interfaces and configuration bits for passt-repair Stefano Brivio
` (11 subsequent siblings)
19 siblings, 0 replies; 21+ messages in thread
From: Stefano Brivio @ 2025-01-31 19:39 UTC (permalink / raw)
To: passt-dev; +Cc: Laurent Vivier, David Gibson
A privileged helper to set/clear TCP_REPAIR on sockets on behalf of
passt. Not used yet.
From David's patch: add it to .gitignore, like our other executable
targets.
Co-authored-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
.gitignore | 1 +
Makefile | 10 +++--
passt-repair.c | 117 +++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 125 insertions(+), 3 deletions(-)
create mode 100644 passt-repair.c
diff --git a/.gitignore b/.gitignore
index d1c8be9..5824a71 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,6 +3,7 @@
/passt.avx2
/pasta
/pasta.avx2
+/passt-repair
/qrap
/pasta.1
/seccomp.h
diff --git a/Makefile b/Makefile
index 1383875..1b71cb0 100644
--- a/Makefile
+++ b/Makefile
@@ -42,7 +42,8 @@ PASST_SRCS = arch.c arp.c checksum.c conf.c dhcp.c dhcpv6.c flow.c fwd.c \
tcp.c tcp_buf.c tcp_splice.c tcp_vu.c udp.c udp_flow.c udp_vu.c util.c \
vhost_user.c virtio.c vu_common.c
QRAP_SRCS = qrap.c
-SRCS = $(PASST_SRCS) $(QRAP_SRCS)
+PASST_REPAIR_SRCS = passt-repair.c
+SRCS = $(PASST_SRCS) $(QRAP_SRCS) $(PASST_REPAIR_SRCS)
MANPAGES = passt.1 pasta.1 qrap.1
@@ -72,9 +73,9 @@ mandir ?= $(datarootdir)/man
man1dir ?= $(mandir)/man1
ifeq ($(TARGET_ARCH),x86_64)
-BIN := passt passt.avx2 pasta pasta.avx2 qrap
+BIN := passt passt.avx2 pasta pasta.avx2 qrap passt-repair
else
-BIN := passt pasta qrap
+BIN := passt pasta qrap passt-repair
endif
all: $(BIN) $(MANPAGES) docs
@@ -101,6 +102,9 @@ pasta.avx2 pasta.1 pasta: pasta%: passt%
qrap: $(QRAP_SRCS) passt.h
$(CC) $(FLAGS) $(CFLAGS) $(CPPFLAGS) -DARCH=\"$(TARGET_ARCH)\" $(QRAP_SRCS) -o qrap $(LDFLAGS)
+passt-repair: $(PASST_REPAIR_SRCS)
+ $(CC) $(FLAGS) $(CFLAGS) $(CPPFLAGS) $(PASST_REPAIR_SRCS) -o passt-repair $(LDFLAGS)
+
valgrind: EXTRA_SYSCALLS += rt_sigprocmask rt_sigtimedwait rt_sigaction \
rt_sigreturn getpid gettid kill clock_gettime mmap \
mmap2 munmap open unlink gettimeofday futex statx \
diff --git a/passt-repair.c b/passt-repair.c
new file mode 100644
index 0000000..988a52c
--- /dev/null
+++ b/passt-repair.c
@@ -0,0 +1,117 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+/* PASST - Plug A Simple Socket Transport
+ * for qemu/UNIX domain socket mode
+ *
+ * PASTA - Pack A Subtle Tap Abstraction
+ * for network namespace/tap device mode
+ *
+ * passt-repair.c - Privileged helper to set/clear TCP_REPAIR on sockets
+ *
+ * Copyright (c) 2025 Red Hat GmbH
+ * Author: Stefano Brivio <sbrivio@redhat.com>
+ *
+ * Connect to passt via UNIX domain socket, receive sockets via SCM_RIGHTS along
+ * with byte commands mapping to TCP_REPAIR values, and switch repair mode on or
+ * off. Reply by echoing the command. Exit on EOF.
+ */
+
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+#include <unistd.h>
+#include <netdb.h>
+
+#include <netinet/tcp.h>
+
+#define SCM_MAX_FD 253 /* From Linux kernel (include/net/scm.h), not in UAPI */
+
+int main(int argc, char **argv)
+{
+ char buf[CMSG_SPACE(sizeof(int) * SCM_MAX_FD)]
+ __attribute__ ((aligned(__alignof__(struct cmsghdr))));
+ struct sockaddr_un a = { AF_UNIX, "" };
+ int fds[SCM_MAX_FD], s, ret, i, n;
+ int8_t cmd = INT8_MAX;
+ struct cmsghdr *cmsg;
+ struct msghdr msg;
+ struct iovec iov;
+
+ iov = (struct iovec){ &cmd, sizeof(cmd) };
+ msg = (struct msghdr){ NULL, 0, &iov, 1, buf, sizeof(buf), 0 };
+ cmsg = CMSG_FIRSTHDR(&msg);
+
+ if (argc != 2) {
+ fprintf(stderr, "Usage: %s PATH\n", argv[0]);
+ return -1;
+ }
+
+ ret = snprintf(a.sun_path, sizeof(a.sun_path), "%s", argv[1]);
+ if (ret <= 0 || ret >= (int)sizeof(a.sun_path)) {
+ fprintf(stderr, "Invalid socket path: %s\n", argv[1]);
+ return -1;
+ }
+
+ if ((s = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) {
+ perror("Failed to create AF_UNIX socket");
+ return -1;
+ }
+
+ if (connect(s, (struct sockaddr *)&a, sizeof(a))) {
+ fprintf(stderr, "Failed to connect to %s: %s\n", argv[1],
+ strerror(errno));
+ return -1;
+ }
+
+loop:
+ ret = recvmsg(s, &msg, 0);
+ if (ret < 0) {
+ perror("Failed to receive message");
+ return -1;
+ }
+
+ if (!ret) /* Done */
+ return 0;
+
+ if (!cmsg ||
+ cmsg->cmsg_len < CMSG_LEN(sizeof(int)) ||
+ cmsg->cmsg_len > CMSG_LEN(sizeof(int) * SCM_MAX_FD) ||
+ cmsg->cmsg_type != SCM_RIGHTS)
+ return -1;
+
+ n = cmsg->cmsg_len / CMSG_LEN(sizeof(int));
+ memcpy(fds, CMSG_DATA(cmsg), sizeof(int) * n);
+
+ if (cmd != TCP_REPAIR_ON && cmd != TCP_REPAIR_OFF &&
+ cmd != TCP_REPAIR_OFF_NO_WP) {
+ fprintf(stderr, "Unsupported command 0x%04x\n", cmd);
+ return -1;
+ }
+
+ for (i = 0; i < n; i++) {
+ int o = cmd;
+
+ if (setsockopt(fds[i], SOL_TCP, TCP_REPAIR, &o, sizeof(o))) {
+ fprintf(stderr,
+ "Setting TCP_REPAIR to %i on socket %i: %s", o,
+ fds[i], strerror(errno));
+ return -1;
+ }
+
+ /* Confirm setting by echoing the command back */
+ if (send(s, &cmd, sizeof(cmd), 0) < 0) {
+ fprintf(stderr, "Reply to command %i: %s\n",
+ o, strerror(errno));
+ return -1;
+ }
+ }
+
+ goto loop;
+
+ return 0;
+}
--
@@ -0,0 +1,117 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+/* PASST - Plug A Simple Socket Transport
+ * for qemu/UNIX domain socket mode
+ *
+ * PASTA - Pack A Subtle Tap Abstraction
+ * for network namespace/tap device mode
+ *
+ * passt-repair.c - Privileged helper to set/clear TCP_REPAIR on sockets
+ *
+ * Copyright (c) 2025 Red Hat GmbH
+ * Author: Stefano Brivio <sbrivio@redhat.com>
+ *
+ * Connect to passt via UNIX domain socket, receive sockets via SCM_RIGHTS along
+ * with byte commands mapping to TCP_REPAIR values, and switch repair mode on or
+ * off. Reply by echoing the command. Exit on EOF.
+ */
+
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+#include <unistd.h>
+#include <netdb.h>
+
+#include <netinet/tcp.h>
+
+#define SCM_MAX_FD 253 /* From Linux kernel (include/net/scm.h), not in UAPI */
+
+int main(int argc, char **argv)
+{
+ char buf[CMSG_SPACE(sizeof(int) * SCM_MAX_FD)]
+ __attribute__ ((aligned(__alignof__(struct cmsghdr))));
+ struct sockaddr_un a = { AF_UNIX, "" };
+ int fds[SCM_MAX_FD], s, ret, i, n;
+ int8_t cmd = INT8_MAX;
+ struct cmsghdr *cmsg;
+ struct msghdr msg;
+ struct iovec iov;
+
+ iov = (struct iovec){ &cmd, sizeof(cmd) };
+ msg = (struct msghdr){ NULL, 0, &iov, 1, buf, sizeof(buf), 0 };
+ cmsg = CMSG_FIRSTHDR(&msg);
+
+ if (argc != 2) {
+ fprintf(stderr, "Usage: %s PATH\n", argv[0]);
+ return -1;
+ }
+
+ ret = snprintf(a.sun_path, sizeof(a.sun_path), "%s", argv[1]);
+ if (ret <= 0 || ret >= (int)sizeof(a.sun_path)) {
+ fprintf(stderr, "Invalid socket path: %s\n", argv[1]);
+ return -1;
+ }
+
+ if ((s = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) {
+ perror("Failed to create AF_UNIX socket");
+ return -1;
+ }
+
+ if (connect(s, (struct sockaddr *)&a, sizeof(a))) {
+ fprintf(stderr, "Failed to connect to %s: %s\n", argv[1],
+ strerror(errno));
+ return -1;
+ }
+
+loop:
+ ret = recvmsg(s, &msg, 0);
+ if (ret < 0) {
+ perror("Failed to receive message");
+ return -1;
+ }
+
+ if (!ret) /* Done */
+ return 0;
+
+ if (!cmsg ||
+ cmsg->cmsg_len < CMSG_LEN(sizeof(int)) ||
+ cmsg->cmsg_len > CMSG_LEN(sizeof(int) * SCM_MAX_FD) ||
+ cmsg->cmsg_type != SCM_RIGHTS)
+ return -1;
+
+ n = cmsg->cmsg_len / CMSG_LEN(sizeof(int));
+ memcpy(fds, CMSG_DATA(cmsg), sizeof(int) * n);
+
+ if (cmd != TCP_REPAIR_ON && cmd != TCP_REPAIR_OFF &&
+ cmd != TCP_REPAIR_OFF_NO_WP) {
+ fprintf(stderr, "Unsupported command 0x%04x\n", cmd);
+ return -1;
+ }
+
+ for (i = 0; i < n; i++) {
+ int o = cmd;
+
+ if (setsockopt(fds[i], SOL_TCP, TCP_REPAIR, &o, sizeof(o))) {
+ fprintf(stderr,
+ "Setting TCP_REPAIR to %i on socket %i: %s", o,
+ fds[i], strerror(errno));
+ return -1;
+ }
+
+ /* Confirm setting by echoing the command back */
+ if (send(s, &cmd, sizeof(cmd), 0) < 0) {
+ fprintf(stderr, "Reply to command %i: %s\n",
+ o, strerror(errno));
+ return -1;
+ }
+ }
+
+ goto loop;
+
+ return 0;
+}
--
2.43.0
^ permalink raw reply related [flat|nested] 21+ messages in thread
* [PATCH v3 09/20] Add interfaces and configuration bits for passt-repair
2025-01-31 19:39 [PATCH v3 00/20] Draft, incomplete series introducing state migration Stefano Brivio
` (7 preceding siblings ...)
2025-01-31 19:39 ` [PATCH v3 08/20] Introduce passt-repair Stefano Brivio
@ 2025-01-31 19:39 ` Stefano Brivio
2025-01-31 19:39 ` [PATCH v3 10/20] flow, tcp: Basic pre-migration source handler to dump sequence numbers Stefano Brivio
` (10 subsequent siblings)
19 siblings, 0 replies; 21+ messages in thread
From: Stefano Brivio @ 2025-01-31 19:39 UTC (permalink / raw)
To: passt-dev; +Cc: Laurent Vivier, David Gibson
In vhost-user mode, by default, create a second UNIX domain socket
accepting connections from passt-repair, with the usual listener
socket.
When we need to set or clear TCP_REPAIR on sockets, we'll send them
via SCM_RIGHTS to passt-repair, who sets the socket option values we
ask for.
To that end, introduce batched functions to request TCP_REPAIR
settings on sockets, so that we don't have to send a single message
for each socket, on migration. When needed, repair_flush() will
send the message and check for the reply.
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
Makefile | 12 ++--
conf.c | 44 ++++++++++--
epoll_type.h | 4 ++
passt.1 | 11 +++
passt.c | 9 +++
passt.h | 7 ++
repair.c | 193 +++++++++++++++++++++++++++++++++++++++++++++++++++
repair.h | 16 +++++
tap.c | 65 +----------------
util.c | 62 +++++++++++++++++
util.h | 1 +
11 files changed, 352 insertions(+), 72 deletions(-)
create mode 100644 repair.c
create mode 100644 repair.h
diff --git a/Makefile b/Makefile
index 1b71cb0..f67a20b 100644
--- a/Makefile
+++ b/Makefile
@@ -38,9 +38,9 @@ FLAGS += -DDUAL_STACK_SOCKETS=$(DUAL_STACK_SOCKETS)
PASST_SRCS = arch.c arp.c checksum.c conf.c dhcp.c dhcpv6.c flow.c fwd.c \
icmp.c igmp.c inany.c iov.c ip.c isolation.c lineread.c log.c mld.c \
- ndp.c netlink.c migrate.c packet.c passt.c pasta.c pcap.c pif.c tap.c \
- tcp.c tcp_buf.c tcp_splice.c tcp_vu.c udp.c udp_flow.c udp_vu.c util.c \
- vhost_user.c virtio.c vu_common.c
+ ndp.c netlink.c migrate.c packet.c passt.c pasta.c pcap.c pif.c \
+ repair.c tap.c tcp.c tcp_buf.c tcp_splice.c tcp_vu.c udp.c udp_flow.c \
+ udp_vu.c util.c vhost_user.c virtio.c vu_common.c
QRAP_SRCS = qrap.c
PASST_REPAIR_SRCS = passt-repair.c
SRCS = $(PASST_SRCS) $(QRAP_SRCS) $(PASST_REPAIR_SRCS)
@@ -50,9 +50,9 @@ MANPAGES = passt.1 pasta.1 qrap.1
PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h flow.h fwd.h \
flow_table.h icmp.h icmp_flow.h inany.h iov.h ip.h isolation.h \
lineread.h log.h migrate.h ndp.h netlink.h packet.h passt.h pasta.h \
- pcap.h pif.h siphash.h tap.h tcp.h tcp_buf.h tcp_conn.h tcp_internal.h \
- tcp_splice.h tcp_vu.h udp.h udp_flow.h udp_internal.h udp_vu.h util.h \
- vhost_user.h virtio.h vu_common.h
+ pcap.h pif.h repair.h siphash.h tap.h tcp.h tcp_buf.h tcp_conn.h \
+ tcp_internal.h tcp_splice.h tcp_vu.h udp.h udp_flow.h udp_internal.h \
+ udp_vu.h util.h vhost_user.h virtio.h vu_common.h
HEADERS = $(PASST_HEADERS) seccomp.h
C := \#include <sys/random.h>\nint main(){int a=getrandom(0, 0, 0);}
diff --git a/conf.c b/conf.c
index df2b016..9ef79a4 100644
--- a/conf.c
+++ b/conf.c
@@ -816,6 +816,9 @@ static void usage(const char *name, FILE *f, int status)
" UNIX domain socket is provided by -s option\n"
" --print-capabilities print back-end capabilities in JSON format,\n"
" only meaningful for vhost-user mode\n");
+ FPRINTF(f,
+ " --repair-path PATH path for passt-repair(1)\n"
+ " default: append '.repair' to UNIX domain path\n");
}
FPRINTF(f,
@@ -1240,8 +1243,25 @@ static void conf_nat(const char *arg, struct in_addr *addr4,
*/
static void conf_open_files(struct ctx *c)
{
- if (c->mode != MODE_PASTA && c->fd_tap == -1)
- c->fd_tap_listen = tap_sock_unix_open(c->sock_path);
+ if (c->mode != MODE_PASTA && c->fd_tap == -1) {
+ c->fd_tap_listen = sock_unix(c->sock_path);
+
+ if (c->mode == MODE_VU && strcmp(c->repair_path, "none")) {
+ if (!*c->repair_path &&
+ snprintf_check(c->repair_path,
+ sizeof(c->repair_path), "%s.repair",
+ c->sock_path)) {
+ warn("passt-repair path %s not usable",
+ c->repair_path);
+ c->fd_repair_listen = -1;
+ } else {
+ c->fd_repair_listen = sock_unix(c->repair_path);
+ }
+ } else {
+ c->fd_repair_listen = -1;
+ }
+ c->fd_repair = -1;
+ }
if (*c->pidfile) {
c->pidfile_fd = output_file_open(c->pidfile, O_WRONLY);
@@ -1354,9 +1374,12 @@ void conf(struct ctx *c, int argc, char **argv)
{"host-lo-to-ns-lo", no_argument, NULL, 23 },
{"dns-host", required_argument, NULL, 24 },
{"vhost-user", no_argument, NULL, 25 },
+
/* vhost-user backend program convention */
{"print-capabilities", no_argument, NULL, 26 },
{"socket-path", required_argument, NULL, 's' },
+
+ {"repair-path", required_argument, NULL, 27 },
{ 0 },
};
const char *logname = (c->mode == MODE_PASTA) ? "pasta" : "passt";
@@ -1748,6 +1771,9 @@ void conf(struct ctx *c, int argc, char **argv)
case 'D':
/* Handle these later, once addresses are configured */
break;
+ case 27:
+ /* Handle this once we checked --vhost-user */
+ break;
case 'h':
usage(argv[0], stdout, EXIT_SUCCESS);
break;
@@ -1824,8 +1850,8 @@ void conf(struct ctx *c, int argc, char **argv)
if (c->ifi4 && IN4_IS_ADDR_UNSPECIFIED(&c->ip4.guest_gw))
c->no_dhcp = 1;
- /* Inbound port options & DNS can be parsed now (after IPv4/IPv6
- * settings)
+ /* Inbound port options, DNS, and --repair-path can be parsed now, after
+ * IPv4/IPv6 settings and --vhost-user.
*/
fwd_probe_ephemeral();
udp_portmap_clear();
@@ -1871,6 +1897,16 @@ void conf(struct ctx *c, int argc, char **argv)
}
die("Cannot use DNS address %s", optarg);
+ } else if (name == 27) {
+ if (c->mode != MODE_VU && strcmp(optarg, "none"))
+ die("--repair-path is for vhost-user mode only");
+
+ if (snprintf_check(c->repair_path,
+ sizeof(c->repair_path), "%s",
+ optarg))
+ die("Invalid passt-repair path: %s", optarg);
+
+ break;
}
} while (name != -1);
diff --git a/epoll_type.h b/epoll_type.h
index fd9eac3..706238a 100644
--- a/epoll_type.h
+++ b/epoll_type.h
@@ -42,6 +42,10 @@ enum epoll_type {
EPOLL_TYPE_VHOST_KICK,
/* vhost-user migration socket */
EPOLL_TYPE_VHOST_MIGRATION,
+ /* TCP_REPAIR helper listening socket */
+ EPOLL_TYPE_REPAIR_LISTEN,
+ /* TCP_REPAIR helper socket */
+ EPOLL_TYPE_REPAIR,
EPOLL_NUM_TYPES,
};
diff --git a/passt.1 b/passt.1
index d9cd33e..63a3a01 100644
--- a/passt.1
+++ b/passt.1
@@ -418,6 +418,17 @@ Enable vhost-user. The vhost-user command socket is provided by \fB--socket\fR.
.BR \-\-print-capabilities
Print back-end capabilities in JSON format, only meaningful for vhost-user mode.
+.TP
+.BR \-\-repair-path " " \fIpath
+Path for UNIX domain socket used by the \fBpasst-repair\fR(1) helper to connect
+to \fBpasst\fR in order to set or clear the TCP_REPAIR option on sockets, during
+migration. \fB--repair-path none\fR disables this interface (if you need to
+specify a socket path called "none" you can prefix the path by \fI./\fR).
+
+Default, for \-\-vhost-user mode only, is to append \fI.repair\fR to the path
+chosen for the hypervisor UNIX domain socket. No socket is created if not in
+\-\-vhost-user mode.
+
.TP
.BR \-F ", " \-\-fd " " \fIFD
Pass a pre-opened, connected socket to \fBpasst\fR. Usually the socket is opened
diff --git a/passt.c b/passt.c
index 184d4e5..1fa2ddd 100644
--- a/passt.c
+++ b/passt.c
@@ -51,6 +51,7 @@
#include "tcp_splice.h"
#include "ndp.h"
#include "vu_common.h"
+#include "repair.h"
#define EPOLL_EVENTS 8
@@ -76,6 +77,8 @@ char *epoll_type_str[] = {
[EPOLL_TYPE_VHOST_CMD] = "vhost-user command socket",
[EPOLL_TYPE_VHOST_KICK] = "vhost-user kick socket",
[EPOLL_TYPE_VHOST_MIGRATION] = "vhost-user migration socket",
+ [EPOLL_TYPE_REPAIR_LISTEN] = "TCP_REPAIR helper listening socket",
+ [EPOLL_TYPE_REPAIR] = "TCP_REPAIR helper socket",
};
static_assert(ARRAY_SIZE(epoll_type_str) == EPOLL_NUM_TYPES,
"epoll_type_str[] doesn't match enum epoll_type");
@@ -360,6 +363,12 @@ loop:
case EPOLL_TYPE_VHOST_MIGRATION:
vu_migrate(&c, eventmask);
break;
+ case EPOLL_TYPE_REPAIR_LISTEN:
+ repair_listen_handler(&c, eventmask);
+ break;
+ case EPOLL_TYPE_REPAIR:
+ repair_handler(&c, eventmask);
+ break;
default:
/* Can't happen */
ASSERT(0);
diff --git a/passt.h b/passt.h
index 0dd4efa..85b0a10 100644
--- a/passt.h
+++ b/passt.h
@@ -20,6 +20,7 @@ union epoll_ref;
#include "siphash.h"
#include "ip.h"
#include "inany.h"
+#include "migrate.h"
#include "flow.h"
#include "icmp.h"
#include "fwd.h"
@@ -193,6 +194,7 @@ struct ip6_ctx {
* @foreground: Run in foreground, don't log to stderr by default
* @nofile: Maximum number of open files (ulimit -n)
* @sock_path: Path for UNIX domain socket
+ * @repair_path: TCP_REPAIR helper path, can be "none", empty for default
* @pcap: Path for packet capture file
* @pidfile: Path to PID file, empty string if not configured
* @pidfile_fd: File descriptor for PID file, -1 if none
@@ -203,6 +205,8 @@ struct ip6_ctx {
* @epollfd: File descriptor for epoll instance
* @fd_tap_listen: File descriptor for listening AF_UNIX socket, if any
* @fd_tap: AF_UNIX socket, tuntap device, or pre-opened socket
+ * @fd_repair_listen: File descriptor for listening TCP_REPAIR socket, if any
+ * @fd_repair: Connected AF_UNIX socket for TCP_REPAIR helper
* @our_tap_mac: Pasta/passt's MAC on the tap link
* @guest_mac: MAC address of guest or namespace, seen or configured
* @hash_secret: 128-bit secret for siphash functions
@@ -244,6 +248,7 @@ struct ctx {
int foreground;
int nofile;
char sock_path[UNIX_PATH_MAX];
+ char repair_path[UNIX_PATH_MAX];
char pcap[PATH_MAX];
char pidfile[PATH_MAX];
@@ -260,6 +265,8 @@ struct ctx {
int epollfd;
int fd_tap_listen;
int fd_tap;
+ int fd_repair_listen;
+ int fd_repair;
unsigned char our_tap_mac[ETH_ALEN];
unsigned char guest_mac[ETH_ALEN];
uint64_t hash_secret[2];
diff --git a/repair.c b/repair.c
new file mode 100644
index 0000000..6151927
--- /dev/null
+++ b/repair.c
@@ -0,0 +1,193 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+/* PASST - Plug A Simple Socket Transport
+ * for qemu/UNIX domain socket mode
+ *
+ * PASTA - Pack A Subtle Tap Abstraction
+ * for network namespace/tap device mode
+ *
+ * repair.c - Interface (server) for passt-repair, set/clear TCP_REPAIR
+ *
+ * Copyright (c) 2025 Red Hat GmbH
+ * Author: Stefano Brivio <sbrivio@redhat.com>
+ */
+
+#include <errno.h>
+#include <sys/uio.h>
+
+#include "util.h"
+#include "ip.h"
+#include "passt.h"
+#include "inany.h"
+#include "flow.h"
+#include "flow_table.h"
+
+#include "repair.h"
+
+#define SCM_MAX_FD 253 /* From Linux kernel (include/net/scm.h), not in UAPI */
+
+static int repair_fds[SCM_MAX_FD];
+static int repair_cmd;
+static int repair_nfds;
+
+/**
+ * repair_sock_init() - Start listening for connections on helper socket
+ * @c: Execution context
+ */
+void repair_sock_init(const struct ctx *c)
+{
+ union epoll_ref ref = { .type = EPOLL_TYPE_REPAIR_LISTEN };
+ struct epoll_event ev = { 0 };
+
+ listen(c->fd_repair_listen, 0);
+
+ ref.fd = c->fd_repair_listen;
+ ev.events = EPOLLIN | EPOLLHUP | EPOLLET;
+ ev.data.u64 = ref.u64;
+ epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_repair_listen, &ev);
+}
+
+/**
+ * repair_listen_handler() - Handle events on TCP_REPAIR helper listening socket
+ * @c: Execution context
+ * @events: epoll events
+ */
+void repair_listen_handler(struct ctx *c, uint32_t events)
+{
+ union epoll_ref ref = { .type = EPOLL_TYPE_REPAIR };
+ struct epoll_event ev = { 0 };
+ struct ucred ucred;
+ socklen_t len;
+
+ if (events != EPOLLIN) {
+ debug("Spurious event 0x%04x on TCP_REPAIR helper socket",
+ events);
+ return;
+ }
+
+ len = sizeof(ucred);
+
+ /* Another client is already connected: accept and close right away. */
+ if (c->fd_repair != -1) {
+ int discard = accept4(c->fd_repair_listen, NULL, NULL,
+ SOCK_NONBLOCK);
+
+ if (discard == -1)
+ return;
+
+ if (!getsockopt(discard, SOL_SOCKET, SO_PEERCRED, &ucred, &len))
+ info("Discarding TCP_REPAIR helper, PID %i", ucred.pid);
+
+ close(discard);
+ return;
+ }
+
+ c->fd_repair = accept4(c->fd_repair_listen, NULL, NULL, 0);
+
+ if (!getsockopt(c->fd_repair, SOL_SOCKET, SO_PEERCRED, &ucred, &len))
+ info("Accepted TCP_REPAIR helper, PID %i", ucred.pid);
+
+ ref.fd = c->fd_repair;
+ ev.events = EPOLLHUP | EPOLLET;
+ ev.data.u64 = ref.u64;
+ epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_repair, &ev);
+}
+
+/**
+ * repair_close() - Close connection to TCP_REPAIR helper
+ * @c: Execution context
+ */
+void repair_close(struct ctx *c)
+{
+ debug("Closing TCP_REPAIR helper socket");
+
+ epoll_ctl(c->epollfd, EPOLL_CTL_DEL, c->fd_repair, NULL);
+ close(c->fd_repair);
+ c->fd_repair = -1;
+}
+
+/**
+ * repair_handler() - Handle EPOLLHUP and EPOLLERR on TCP_REPAIR helper socket
+ * @c: Execution context
+ * @events: epoll events
+ */
+void repair_handler(struct ctx *c, uint32_t events)
+{
+ (void)events;
+
+ repair_close(c);
+}
+
+/**
+ * repair_flush() - Flush current set of sockets to helper, with current command
+ * @c: Execution context
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+int repair_flush(struct ctx *c)
+{
+ struct iovec iov = { &((int8_t){ repair_cmd }), sizeof(int8_t) };
+ char buf[CMSG_SPACE(sizeof(int) * SCM_MAX_FD)]
+ __attribute__ ((aligned(__alignof__(struct cmsghdr))));
+ struct cmsghdr *cmsg;
+ struct msghdr msg;
+
+ if (!repair_nfds)
+ return 0;
+
+ msg = (struct msghdr){ NULL, 0, &iov, 1,
+ buf, CMSG_SPACE(sizeof(int) * repair_nfds), 0 };
+ cmsg = CMSG_FIRSTHDR(&msg);
+
+ cmsg->cmsg_level = SOL_SOCKET;
+ cmsg->cmsg_type = SCM_RIGHTS;
+ cmsg->cmsg_len = CMSG_LEN(sizeof(int) * repair_nfds);
+ memcpy(CMSG_DATA(cmsg), repair_fds, sizeof(int) * repair_nfds);
+
+ repair_nfds = 0;
+
+ if (sendmsg(c->fd_repair, &msg, 0) < 0) {
+ int ret = -errno;
+ err_perror("Failed to send sockets to TCP_REPAIR helper");
+ repair_close(c);
+ return ret;
+ }
+
+ if (recv(c->fd_repair, &((int8_t){ 0 }), 1, 0) < 0) {
+ int ret = -errno;
+ err_perror("Failed to receive reply from TCP_REPAIR helper");
+ repair_close(c);
+ return ret;
+ }
+
+ return 0;
+}
+
+/**
+ * repair_flush() - Add socket to TCP_REPAIR set with given command
+ * @c: Execution context
+ * @s: Socket to add
+ * @cmd: TCP_REPAIR_ON, TCP_REPAIR_OFF, or TCP_REPAIR_OFF_NO_WP
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+/* cppcheck-suppress unusedFunction */
+int repair_set(struct ctx *c, int s, int cmd)
+{
+ int rc;
+
+ if (repair_nfds && repair_cmd != cmd) {
+ if ((rc = repair_flush(c)))
+ return rc;
+ }
+
+ repair_cmd = cmd;
+ repair_fds[repair_nfds++] = s;
+
+ if (repair_nfds >= SCM_MAX_FD) {
+ if ((rc = repair_flush(c)))
+ return rc;
+ }
+
+ return 0;
+}
diff --git a/repair.h b/repair.h
new file mode 100644
index 0000000..693c515
--- /dev/null
+++ b/repair.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later
+ * Copyright (c) 2025 Red Hat GmbH
+ * Author: Stefano Brivio <sbrivio@redhat.com>
+ */
+
+#ifndef REPAIR_H
+#define REPAIR_H
+
+void repair_sock_init(const struct ctx *c);
+void repair_listen_handler(struct ctx *c, uint32_t events);
+void repair_handler(struct ctx *c, uint32_t events);
+void repair_close(struct ctx *c);
+int repair_flush(struct ctx *c);
+int repair_set(struct ctx *c, int s, int cmd);
+
+#endif /* REPAIR_H */
diff --git a/tap.c b/tap.c
index 772648f..3659aab 100644
--- a/tap.c
+++ b/tap.c
@@ -56,6 +56,7 @@
#include "netlink.h"
#include "pasta.h"
#include "packet.h"
+#include "repair.h"
#include "tap.h"
#include "log.h"
#include "vhost_user.h"
@@ -1151,68 +1152,6 @@ void tap_handler_pasta(struct ctx *c, uint32_t events,
tap_pasta_input(c, now);
}
-/**
- * tap_sock_unix_open() - Create and bind AF_UNIX socket
- * @sock_path: Socket path. If empty, set on return (UNIX_SOCK_PATH as prefix)
- *
- * Return: socket descriptor on success, won't return on failure
- */
-int tap_sock_unix_open(char *sock_path)
-{
- int fd = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0);
- struct sockaddr_un addr = {
- .sun_family = AF_UNIX,
- };
- int i;
-
- if (fd < 0)
- die_perror("Failed to open UNIX domain socket");
-
- for (i = 1; i < UNIX_SOCK_MAX; i++) {
- char *path = addr.sun_path;
- int ex, ret;
-
- if (*sock_path)
- memcpy(path, sock_path, UNIX_PATH_MAX);
- else if (snprintf_check(path, UNIX_PATH_MAX - 1,
- UNIX_SOCK_PATH, i))
- die_perror("Can't build UNIX domain socket path");
-
- ex = socket(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK | SOCK_CLOEXEC,
- 0);
- if (ex < 0)
- die_perror("Failed to check for UNIX domain conflicts");
-
- ret = connect(ex, (const struct sockaddr *)&addr, sizeof(addr));
- if (!ret || (errno != ENOENT && errno != ECONNREFUSED &&
- errno != EACCES)) {
- if (*sock_path)
- die("Socket path %s already in use", path);
-
- close(ex);
- continue;
- }
- close(ex);
-
- unlink(path);
- ret = bind(fd, (const struct sockaddr *)&addr, sizeof(addr));
- if (*sock_path && ret)
- die_perror("Failed to bind UNIX domain socket");
-
- if (!ret)
- break;
- }
-
- if (i == UNIX_SOCK_MAX)
- die_perror("Failed to bind UNIX domain socket");
-
- info("UNIX domain socket bound at %s", addr.sun_path);
- if (!*sock_path)
- memcpy(sock_path, addr.sun_path, UNIX_PATH_MAX);
-
- return fd;
-}
-
/**
* tap_backend_show_hints() - Give help information to start QEMU
* @c: Execution context
@@ -1423,6 +1362,8 @@ void tap_backend_init(struct ctx *c)
tap_sock_tun_init(c);
break;
case MODE_VU:
+ repair_sock_init(c);
+ /* fall through */
case MODE_PASST:
tap_sock_unix_init(c);
diff --git a/util.c b/util.c
index f2eef93..0e0f8a4 100644
--- a/util.c
+++ b/util.c
@@ -178,6 +178,68 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type,
return fd;
}
+/**
+ * sock_unix() - Create and bind AF_UNIX socket
+ * @sock_path: Socket path. If empty, set on return (UNIX_SOCK_PATH as prefix)
+ *
+ * Return: socket descriptor on success, won't return on failure
+ */
+int sock_unix(char *sock_path)
+{
+ int fd = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0);
+ struct sockaddr_un addr = {
+ .sun_family = AF_UNIX,
+ };
+ int i;
+
+ if (fd < 0)
+ die_perror("Failed to open UNIX domain socket");
+
+ for (i = 1; i < UNIX_SOCK_MAX; i++) {
+ char *path = addr.sun_path;
+ int ex, ret;
+
+ if (*sock_path)
+ memcpy(path, sock_path, UNIX_PATH_MAX);
+ else if (snprintf_check(path, UNIX_PATH_MAX - 1,
+ UNIX_SOCK_PATH, i))
+ die_perror("Can't build UNIX domain socket path");
+
+ ex = socket(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK | SOCK_CLOEXEC,
+ 0);
+ if (ex < 0)
+ die_perror("Failed to check for UNIX domain conflicts");
+
+ ret = connect(ex, (const struct sockaddr *)&addr, sizeof(addr));
+ if (!ret || (errno != ENOENT && errno != ECONNREFUSED &&
+ errno != EACCES)) {
+ if (*sock_path)
+ die("Socket path %s already in use", path);
+
+ close(ex);
+ continue;
+ }
+ close(ex);
+
+ unlink(path);
+ ret = bind(fd, (const struct sockaddr *)&addr, sizeof(addr));
+ if (*sock_path && ret)
+ die_perror("Failed to bind UNIX domain socket");
+
+ if (!ret)
+ break;
+ }
+
+ if (i == UNIX_SOCK_MAX)
+ die_perror("Failed to bind UNIX domain socket");
+
+ info("UNIX domain socket bound at %s", addr.sun_path);
+ if (!*sock_path)
+ memcpy(sock_path, addr.sun_path, UNIX_PATH_MAX);
+
+ return fd;
+}
+
/**
* sock_probe_mem() - Check if setting high SO_SNDBUF and SO_RCVBUF is allowed
* @c: Execution context
diff --git a/util.h b/util.h
index 6ae8588..6924d08 100644
--- a/util.h
+++ b/util.h
@@ -185,6 +185,7 @@ struct ctx;
int sock_l4_sa(const struct ctx *c, enum epoll_type type,
const void *sa, socklen_t sl,
const char *ifname, bool v6only, uint32_t data);
+int sock_unix(char *sock_path);
void sock_probe_mem(struct ctx *c);
long timespec_diff_ms(const struct timespec *a, const struct timespec *b);
int64_t timespec_diff_us(const struct timespec *a, const struct timespec *b);
--
@@ -185,6 +185,7 @@ struct ctx;
int sock_l4_sa(const struct ctx *c, enum epoll_type type,
const void *sa, socklen_t sl,
const char *ifname, bool v6only, uint32_t data);
+int sock_unix(char *sock_path);
void sock_probe_mem(struct ctx *c);
long timespec_diff_ms(const struct timespec *a, const struct timespec *b);
int64_t timespec_diff_us(const struct timespec *a, const struct timespec *b);
--
2.43.0
^ permalink raw reply related [flat|nested] 21+ messages in thread
* [PATCH v3 10/20] flow, tcp: Basic pre-migration source handler to dump sequence numbers
2025-01-31 19:39 [PATCH v3 00/20] Draft, incomplete series introducing state migration Stefano Brivio
` (8 preceding siblings ...)
2025-01-31 19:39 ` [PATCH v3 09/20] Add interfaces and configuration bits for passt-repair Stefano Brivio
@ 2025-01-31 19:39 ` Stefano Brivio
2025-01-31 19:39 ` [PATCH v3 11/20] migrate: vu_migrate_{source,target}() aren't actually vu speciic Stefano Brivio
` (9 subsequent siblings)
19 siblings, 0 replies; 21+ messages in thread
From: Stefano Brivio @ 2025-01-31 19:39 UTC (permalink / raw)
To: passt-dev; +Cc: Laurent Vivier, David Gibson
Very much draft quality, but it works. Ask passt-repair to switch
TCP sockets to repair mode and dump their current sequence numbers to
the flow table, which will be transferred and used by the target in
the next step.
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
flow.c | 43 +++++++++++++++++++++++++++++++++++++++++
flow.h | 1 +
migrate.c | 1 +
tcp.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
tcp_conn.h | 5 +++++
5 files changed, 106 insertions(+)
diff --git a/flow.c b/flow.c
index ee1221b..e7148b2 100644
--- a/flow.c
+++ b/flow.c
@@ -19,6 +19,7 @@
#include "inany.h"
#include "flow.h"
#include "flow_table.h"
+#include "repair.h"
const char *flow_state_str[] = {
[FLOW_STATE_FREE] = "FREE",
@@ -874,6 +875,48 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now)
*last_next = FLOW_MAX;
}
+/**
+ * flow_migrate_source_pre() - Prepare all source flows for migration
+ * @c: Execution context
+ * @m: Migration metadata
+ *
+ * Return: 0 on success
+ */
+int flow_migrate_source_pre(struct ctx *c, struct migrate_meta *m)
+{
+ unsigned i;
+ int rc;
+
+ (void)m;
+
+ for (i = 0; i < FLOW_MAX; i++) { /* TODO: iterator with skip */
+ union flow *flow = &flowtab[i];
+
+ if (flow->f.state == FLOW_STATE_FREE)
+ i += flow->free.n - 1;
+ else if (flow->f.state == FLOW_STATE_ACTIVE &&
+ flow->f.type == FLOW_TCP)
+ rc = tcp_flow_repair_on(c, &flow->tcp);
+
+ if (rc)
+ return rc; /* TODO: rollback */
+ }
+
+ repair_flush(c); /* TODO: move to TCP logic */
+
+ for (i = 0; i < FLOW_MAX; i++) { /* TODO: iterator with skip */
+ union flow *flow = &flowtab[i];
+
+ if (flow->f.state == FLOW_STATE_FREE)
+ i += flow->free.n - 1;
+ else if (flow->f.state == FLOW_STATE_ACTIVE &&
+ flow->f.type == FLOW_TCP)
+ tcp_flow_dump_seq(c, &flow->tcp);
+ }
+
+ return 0;
+}
+
/**
* flow_init() - Initialise flow related data structures
*/
diff --git a/flow.h b/flow.h
index 8eb5964..ff390a6 100644
--- a/flow.h
+++ b/flow.h
@@ -255,6 +255,7 @@ union flow;
void flow_init(void);
void flow_defer_handler(const struct ctx *c, const struct timespec *now);
+int flow_migrate_source_pre(struct ctx *c, struct migrate_meta *m);
void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...)
__attribute__((format(printf, 3, 4)));
diff --git a/migrate.c b/migrate.c
index 9ddac8f..10cb242 100644
--- a/migrate.c
+++ b/migrate.c
@@ -62,6 +62,7 @@ static struct migrate_data data_versions[] = {
/* Handlers to call in source before sending data */
struct migrate_handler handlers_source_pre[] = {
+ { flow_migrate_source_pre },
{ 0 },
};
diff --git a/tcp.c b/tcp.c
index 7787381..0bd2a02 100644
--- a/tcp.c
+++ b/tcp.c
@@ -299,6 +299,7 @@
#include "log.h"
#include "inany.h"
#include "flow.h"
+#include "repair.h"
#include "linux_dep.h"
#include "flow_table.h"
@@ -868,6 +869,61 @@ void tcp_defer_handler(struct ctx *c)
tcp_payload_flush(c);
}
+/**
+ * tcp_flow_repair_on() - Enable repair mode for a single TCP flow
+ * @c: Execution context
+ * @conn: Pointer to the TCP connection structure
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+int tcp_flow_repair_on(struct ctx *c, const struct tcp_tap_conn *conn)
+{
+ int rc = 0;
+
+ if ((rc = repair_set(c, conn->sock, TCP_REPAIR_ON)))
+ err("Failed to set TCP_REPAIR");
+
+ return rc;
+}
+
+/**
+ * tcp_flow_dump_seq() - Dump sequences for send and receive queues
+ * @c: Execution context
+ * @conn: Pointer to the TCP connection structure
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+int tcp_flow_dump_seq(struct ctx *c, struct tcp_tap_conn *conn)
+{
+ int v, s = conn->sock;
+ socklen_t vlen;
+
+ (void)c;
+
+ vlen = sizeof(v);
+
+ v = TCP_SEND_QUEUE;
+ /* TODO: proper error management and prints */
+ if (setsockopt(s, SOL_TCP, TCP_REPAIR_QUEUE, &v, vlen))
+ return -errno;
+
+ if (getsockopt(s, SOL_TCP, TCP_QUEUE_SEQ, &conn->sock_seq_snd, &vlen))
+ return -errno;
+
+ debug("Send queue sequence %u for socket %i", conn->sock_seq_snd, s);
+
+ v = TCP_RECV_QUEUE;
+ if (setsockopt(s, SOL_TCP, TCP_REPAIR_QUEUE, &v, vlen))
+ return -errno;
+
+ if (getsockopt(s, SOL_TCP, TCP_QUEUE_SEQ, &conn->sock_seq_rcv, &vlen))
+ return -errno;
+
+ debug("Receive queue sequence %u for socket %i", conn->sock_seq_rcv, s);
+
+ return 0;
+}
+
/**
* tcp_fill_header() - Fill the TCP header fields for a given TCP segment.
*
diff --git a/tcp_conn.h b/tcp_conn.h
index d342680..0c3e197 100644
--- a/tcp_conn.h
+++ b/tcp_conn.h
@@ -94,6 +94,9 @@ struct tcp_tap_conn {
uint32_t seq_from_tap;
uint32_t seq_ack_to_tap;
uint32_t seq_init_from_tap;
+
+ uint32_t sock_seq_snd;
+ uint32_t sock_seq_rcv;
};
/**
@@ -140,6 +143,8 @@ extern int init_sock_pool4 [TCP_SOCK_POOL_SIZE];
extern int init_sock_pool6 [TCP_SOCK_POOL_SIZE];
bool tcp_flow_defer(const struct tcp_tap_conn *conn);
+int tcp_flow_repair_on(struct ctx *c, const struct tcp_tap_conn *conn);
+int tcp_flow_dump_seq(struct ctx *c, struct tcp_tap_conn *conn);
bool tcp_splice_flow_defer(struct tcp_splice_conn *conn);
void tcp_splice_timer(const struct ctx *c, struct tcp_splice_conn *conn);
int tcp_conn_pool_sock(int pool[]);
--
@@ -94,6 +94,9 @@ struct tcp_tap_conn {
uint32_t seq_from_tap;
uint32_t seq_ack_to_tap;
uint32_t seq_init_from_tap;
+
+ uint32_t sock_seq_snd;
+ uint32_t sock_seq_rcv;
};
/**
@@ -140,6 +143,8 @@ extern int init_sock_pool4 [TCP_SOCK_POOL_SIZE];
extern int init_sock_pool6 [TCP_SOCK_POOL_SIZE];
bool tcp_flow_defer(const struct tcp_tap_conn *conn);
+int tcp_flow_repair_on(struct ctx *c, const struct tcp_tap_conn *conn);
+int tcp_flow_dump_seq(struct ctx *c, struct tcp_tap_conn *conn);
bool tcp_splice_flow_defer(struct tcp_splice_conn *conn);
void tcp_splice_timer(const struct ctx *c, struct tcp_splice_conn *conn);
int tcp_conn_pool_sock(int pool[]);
--
2.43.0
^ permalink raw reply related [flat|nested] 21+ messages in thread
* [PATCH v3 11/20] migrate: vu_migrate_{source,target}() aren't actually vu speciic
2025-01-31 19:39 [PATCH v3 00/20] Draft, incomplete series introducing state migration Stefano Brivio
` (9 preceding siblings ...)
2025-01-31 19:39 ` [PATCH v3 10/20] flow, tcp: Basic pre-migration source handler to dump sequence numbers Stefano Brivio
@ 2025-01-31 19:39 ` Stefano Brivio
2025-01-31 19:39 ` [PATCH v3 12/20] migrate: Move repair_sock_init() to vu_init() Stefano Brivio
` (8 subsequent siblings)
19 siblings, 0 replies; 21+ messages in thread
From: Stefano Brivio @ 2025-01-31 19:39 UTC (permalink / raw)
To: passt-dev; +Cc: Laurent Vivier, David Gibson
From: David Gibson <david@gibson.dropbear.id.au>
vu_migrate_source() and vu_migrate_target() don't directly rely on anything
vhost-user specific - it's just that they'll only be called for vhost-user
so far. They are suitable as general top-level dispatchers for
migration. Move them to migrate.c, rename to migrate_{source,target}() and
make the lower-level functions they call local to migrate.c.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
migrate.c | 80 +++++++++++++++++++++++++++++++++++++++++++++++------
migrate.h | 10 ++-----
vu_common.c | 66 ++-----------------------------------------
3 files changed, 75 insertions(+), 81 deletions(-)
diff --git a/migrate.c b/migrate.c
index 10cb242..0e60475 100644
--- a/migrate.c
+++ b/migrate.c
@@ -98,7 +98,7 @@ struct migrate_target_handlers target_handlers[] = {
*
* Return: 0 on success, error code on failure
*/
-int migrate_source_pre(struct ctx *c, struct migrate_meta *m)
+static int migrate_source_pre(struct ctx *c, struct migrate_meta *m)
{
struct migrate_handler *h;
@@ -113,13 +113,13 @@ int migrate_source_pre(struct ctx *c, struct migrate_meta *m)
}
/**
- * migrate_source() - Perform migration as source: send state to hypervisor
+ * migrate_source_state() - Send device state as migration source
* @fd: Descriptor for state transfer
* @m: Migration metadata
*
* Return: 0 on success, error code on failure
*/
-int migrate_source(int fd, const struct migrate_meta *m)
+static int migrate_source_state(int fd, const struct migrate_meta *m)
{
static struct migrate_data *d;
int count, rc;
@@ -145,7 +145,7 @@ int migrate_source(int fd, const struct migrate_meta *m)
*
* Return: 0 on success, error code on failure
*/
-void migrate_source_post(struct ctx *c, struct migrate_meta *m)
+static void migrate_source_post(struct ctx *c, struct migrate_meta *m)
{
struct migrate_handler *h;
@@ -153,6 +153,34 @@ void migrate_source_post(struct ctx *c, struct migrate_meta *m)
h->fn(c, m);
}
+/**
+ * migrate_source() - Migration as source, send state to hypervisor
+ * @c: Execution context
+ * @fd: File descriptor for state transfer
+ *
+ * Return: 0 on success, positive error code on failure
+ */
+int migrate_source(struct ctx *c, int fd)
+{
+ struct migrate_meta m;
+ int rc;
+
+ if ((rc = migrate_source_pre(c, &m))) {
+ err("Source pre-migration failed: %s, abort", strerror_(rc));
+ return rc;
+ }
+
+ debug("Saving backend state");
+
+ rc = migrate_source_state(fd, &m);
+ if (rc)
+ err("Source migration failed: %s", strerror_(rc));
+ else
+ migrate_source_post(c, &m);
+
+ return rc;
+}
+
/**
* migrate_target_read_header() - Set metadata in target from source header
* @fd: Descriptor for state transfer
@@ -160,7 +188,7 @@ void migrate_source_post(struct ctx *c, struct migrate_meta *m)
*
* Return: 0 on success, error code on failure
*/
-int migrate_target_read_header(int fd, struct migrate_meta *m)
+static int migrate_target_read_header(int fd, struct migrate_meta *m)
{
static struct migrate_data *d;
union migrate_header h;
@@ -210,7 +238,7 @@ int migrate_target_read_header(int fd, struct migrate_meta *m)
*
* Return: 0 on success, error code on failure
*/
-int migrate_target_pre(struct ctx *c, struct migrate_meta *m)
+static int migrate_target_pre(struct ctx *c, struct migrate_meta *m)
{
struct migrate_target_handlers *th;
struct migrate_handler *h;
@@ -228,7 +256,7 @@ int migrate_target_pre(struct ctx *c, struct migrate_meta *m)
}
/**
- * migrate_target() - Perform migration as target: receive state from hypervisor
+ * migrate_target_state() - Receive device state as migration target
* @fd: Descriptor for state transfer
* @m: Migration metadata
*
@@ -236,7 +264,7 @@ int migrate_target_pre(struct ctx *c, struct migrate_meta *m)
*
* #syscalls:vu readv
*/
-int migrate_target(int fd, const struct migrate_meta *m)
+static int migrate_target_state(int fd, const struct migrate_meta *m)
{
static struct migrate_data *d;
unsigned cnt;
@@ -259,7 +287,7 @@ int migrate_target(int fd, const struct migrate_meta *m)
* @c: Execution context
* @m: Migration metadata
*/
-void migrate_target_post(struct ctx *c, struct migrate_meta *m)
+static void migrate_target_post(struct ctx *c, struct migrate_meta *m)
{
struct migrate_target_handlers *th;
struct migrate_handler *h;
@@ -269,3 +297,37 @@ void migrate_target_post(struct ctx *c, struct migrate_meta *m)
for (h = th->post; h->fn; h++)
h->fn(c, m);
}
+
+/**
+ * migrate_target() - Migration as target, receive state from hypervisor
+ * @c: Execution context
+ * @fd: File descriptor for state transfer
+ *
+ * Return: 0 on success, positive error code on failure
+ */
+int migrate_target(struct ctx *c, int fd)
+{
+ struct migrate_meta m;
+ int rc;
+
+ rc = migrate_target_read_header(fd, &m);
+ if (rc) {
+ err("Migration header check failed: %s, abort", strerror_(rc));
+ return rc;
+ }
+
+ if ((rc = migrate_target_pre(c, &m))) {
+ err("Target pre-migration failed: %s, abort", strerror_(rc));
+ return rc;
+ }
+
+ debug("Loading backend state");
+
+ rc = migrate_target_state(fd, &m);
+ if (rc)
+ err("Target migration failed: %s", strerror_(rc));
+ else
+ migrate_target_post(c, &m);
+
+ return rc;
+}
diff --git a/migrate.h b/migrate.h
index 9a68f17..21de70d 100644
--- a/migrate.h
+++ b/migrate.h
@@ -76,13 +76,7 @@ struct migrate_target_handlers {
struct migrate_handler *post;
};
-int migrate_source_pre(struct ctx *c, struct migrate_meta *m);
-int migrate_source(int fd, const struct migrate_meta *m);
-void migrate_source_post(struct ctx *c, struct migrate_meta *m);
-
-int migrate_target_read_header(int fd, struct migrate_meta *m);
-int migrate_target_pre(struct ctx *c, struct migrate_meta *m);
-int migrate_target(int fd, const struct migrate_meta *m);
-void migrate_target_post(struct ctx *c, struct migrate_meta *m);
+int migrate_source(struct ctx *c, int fd);
+int migrate_target(struct ctx *c, int fd);
#endif /* MIGRATE_H */
diff --git a/vu_common.c b/vu_common.c
index 6c346c8..4797ef9 100644
--- a/vu_common.c
+++ b/vu_common.c
@@ -306,68 +306,6 @@ err:
return -1;
}
-/**
- * vu_migrate_source() - Migration as source, send state to hypervisor
- * @c: Execution context
- * @fd: File descriptor for state transfer
- *
- * Return: 0 on success, positive error code on failure
- */
-static int vu_migrate_source(struct ctx *c, int fd)
-{
- struct migrate_meta m;
- int rc;
-
- if ((rc = migrate_source_pre(c, &m))) {
- err("Source pre-migration failed: %s, abort", strerror_(rc));
- return rc;
- }
-
- debug("Saving backend state");
-
- rc = migrate_source(fd, &m);
- if (rc)
- err("Source migration failed: %s", strerror_(rc));
- else
- migrate_source_post(c, &m);
-
- return rc;
-}
-
-/**
- * vu_migrate_target() - Migration as target, receive state from hypervisor
- * @c: Execution context
- * @fd: File descriptor for state transfer
- *
- * Return: 0 on success, positive error code on failure
- */
-static int vu_migrate_target(struct ctx *c, int fd)
-{
- struct migrate_meta m;
- int rc;
-
- rc = migrate_target_read_header(fd, &m);
- if (rc) {
- err("Migration header check failed: %s, abort", strerror_(rc));
- return rc;
- }
-
- if ((rc = migrate_target_pre(c, &m))) {
- err("Target pre-migration failed: %s, abort", strerror_(rc));
- return rc;
- }
-
- debug("Loading backend state");
-
- rc = migrate_target(fd, &m);
- if (rc)
- err("Target migration failed: %s", strerror_(rc));
- else
- migrate_target_post(c, &m);
-
- return rc;
-}
-
/**
* vu_migrate() - Send/receive passt internal state to/from QEMU
* @c: Execution context
@@ -381,9 +319,9 @@ void vu_migrate(struct ctx *c, uint32_t events)
debug("vu_migrate fd %d events %x", vdev->device_state_fd, events);
if (events & EPOLLOUT)
- rc = vu_migrate_source(c, vdev->device_state_fd);
+ rc = migrate_source(c, vdev->device_state_fd);
else if (events & EPOLLIN)
- rc = vu_migrate_target(c, vdev->device_state_fd);
+ rc = migrate_target(c, vdev->device_state_fd);
/* EPOLLHUP without EPOLLIN/EPOLLOUT, or EPOLLERR? Migration failed */
--
@@ -306,68 +306,6 @@ err:
return -1;
}
-/**
- * vu_migrate_source() - Migration as source, send state to hypervisor
- * @c: Execution context
- * @fd: File descriptor for state transfer
- *
- * Return: 0 on success, positive error code on failure
- */
-static int vu_migrate_source(struct ctx *c, int fd)
-{
- struct migrate_meta m;
- int rc;
-
- if ((rc = migrate_source_pre(c, &m))) {
- err("Source pre-migration failed: %s, abort", strerror_(rc));
- return rc;
- }
-
- debug("Saving backend state");
-
- rc = migrate_source(fd, &m);
- if (rc)
- err("Source migration failed: %s", strerror_(rc));
- else
- migrate_source_post(c, &m);
-
- return rc;
-}
-
-/**
- * vu_migrate_target() - Migration as target, receive state from hypervisor
- * @c: Execution context
- * @fd: File descriptor for state transfer
- *
- * Return: 0 on success, positive error code on failure
- */
-static int vu_migrate_target(struct ctx *c, int fd)
-{
- struct migrate_meta m;
- int rc;
-
- rc = migrate_target_read_header(fd, &m);
- if (rc) {
- err("Migration header check failed: %s, abort", strerror_(rc));
- return rc;
- }
-
- if ((rc = migrate_target_pre(c, &m))) {
- err("Target pre-migration failed: %s, abort", strerror_(rc));
- return rc;
- }
-
- debug("Loading backend state");
-
- rc = migrate_target(fd, &m);
- if (rc)
- err("Target migration failed: %s", strerror_(rc));
- else
- migrate_target_post(c, &m);
-
- return rc;
-}
-
/**
* vu_migrate() - Send/receive passt internal state to/from QEMU
* @c: Execution context
@@ -381,9 +319,9 @@ void vu_migrate(struct ctx *c, uint32_t events)
debug("vu_migrate fd %d events %x", vdev->device_state_fd, events);
if (events & EPOLLOUT)
- rc = vu_migrate_source(c, vdev->device_state_fd);
+ rc = migrate_source(c, vdev->device_state_fd);
else if (events & EPOLLIN)
- rc = vu_migrate_target(c, vdev->device_state_fd);
+ rc = migrate_target(c, vdev->device_state_fd);
/* EPOLLHUP without EPOLLIN/EPOLLOUT, or EPOLLERR? Migration failed */
--
2.43.0
^ permalink raw reply related [flat|nested] 21+ messages in thread
* [PATCH v3 12/20] migrate: Move repair_sock_init() to vu_init()
2025-01-31 19:39 [PATCH v3 00/20] Draft, incomplete series introducing state migration Stefano Brivio
` (10 preceding siblings ...)
2025-01-31 19:39 ` [PATCH v3 11/20] migrate: vu_migrate_{source,target}() aren't actually vu speciic Stefano Brivio
@ 2025-01-31 19:39 ` Stefano Brivio
2025-01-31 19:39 ` [PATCH v3 13/20] migrate: Make more handling common rather than vhost-user specific Stefano Brivio
` (7 subsequent siblings)
19 siblings, 0 replies; 21+ messages in thread
From: Stefano Brivio @ 2025-01-31 19:39 UTC (permalink / raw)
To: passt-dev; +Cc: Laurent Vivier, David Gibson
From: David Gibson <david@gibson.dropbear.id.au>
Currently we call repair_sock_init() immediately before
tap_sock_unix_init(). However, this means it will be skipped if the
vhost-user control fd is passed with --fd instead of being created at a
specific path. We still need the repair socket in that case.
Move it, instead, to vu_init(), which has the added advantage of moving
all migration related one-time init to the same place.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
tap.c | 3 ---
vhost_user.c | 3 +++
2 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/tap.c b/tap.c
index 3659aab..d1a9f52 100644
--- a/tap.c
+++ b/tap.c
@@ -56,7 +56,6 @@
#include "netlink.h"
#include "pasta.h"
#include "packet.h"
-#include "repair.h"
#include "tap.h"
#include "log.h"
#include "vhost_user.h"
@@ -1362,8 +1361,6 @@ void tap_backend_init(struct ctx *c)
tap_sock_tun_init(c);
break;
case MODE_VU:
- repair_sock_init(c);
- /* fall through */
case MODE_PASST:
tap_sock_unix_init(c);
diff --git a/vhost_user.c b/vhost_user.c
index bbbf504..5df29c4 100644
--- a/vhost_user.c
+++ b/vhost_user.c
@@ -44,6 +44,7 @@
#include "tap.h"
#include "vhost_user.h"
#include "pcap.h"
+#include "repair.h"
/* vhost-user version we are compatible with */
#define VHOST_USER_VERSION 1
@@ -1106,8 +1107,10 @@ void vu_init(struct ctx *c)
}
c->vdev->log_table = NULL;
c->vdev->log_call_fd = -1;
+
c->vdev->device_state_fd = -1;
c->vdev->device_state_result = -1;
+ repair_sock_init(c);
}
--
@@ -44,6 +44,7 @@
#include "tap.h"
#include "vhost_user.h"
#include "pcap.h"
+#include "repair.h"
/* vhost-user version we are compatible with */
#define VHOST_USER_VERSION 1
@@ -1106,8 +1107,10 @@ void vu_init(struct ctx *c)
}
c->vdev->log_table = NULL;
c->vdev->log_call_fd = -1;
+
c->vdev->device_state_fd = -1;
c->vdev->device_state_result = -1;
+ repair_sock_init(c);
}
--
2.43.0
^ permalink raw reply related [flat|nested] 21+ messages in thread
* [PATCH v3 13/20] migrate: Make more handling common rather than vhost-user specific
2025-01-31 19:39 [PATCH v3 00/20] Draft, incomplete series introducing state migration Stefano Brivio
` (11 preceding siblings ...)
2025-01-31 19:39 ` [PATCH v3 12/20] migrate: Move repair_sock_init() to vu_init() Stefano Brivio
@ 2025-01-31 19:39 ` Stefano Brivio
2025-01-31 19:39 ` [PATCH v3 14/20] migrate: Don't handle the migration channel through epoll Stefano Brivio
` (6 subsequent siblings)
19 siblings, 0 replies; 21+ messages in thread
From: Stefano Brivio @ 2025-01-31 19:39 UTC (permalink / raw)
To: passt-dev; +Cc: Laurent Vivier, David Gibson
From: David Gibson <david@gibson.dropbear.id.au>
A lot of the migration logic in vhost_user.c and vu_common.c isn't really
specific to vhost-user, but matches the overall structure of migration.
This applies to vu_migrate() and to the parts of of
vu_set_device_state_fd_exec() which aren't related to parsing the specific
vhost-user control request.
Move this logic to migrate.c, with matching renames.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
epoll_type.h | 4 +--
migrate.c | 92 ++++++++++++++++++++++++++++++++++++++++++++++++++--
migrate.h | 6 ++--
passt.c | 6 ++--
passt.h | 6 ++++
vhost_user.c | 59 ++++-----------------------------
virtio.h | 4 ---
vu_common.c | 27 ---------------
vu_common.h | 2 +-
9 files changed, 112 insertions(+), 94 deletions(-)
diff --git a/epoll_type.h b/epoll_type.h
index 706238a..b981d30 100644
--- a/epoll_type.h
+++ b/epoll_type.h
@@ -40,8 +40,8 @@ enum epoll_type {
EPOLL_TYPE_VHOST_CMD,
/* vhost-user kick event socket */
EPOLL_TYPE_VHOST_KICK,
- /* vhost-user migration socket */
- EPOLL_TYPE_VHOST_MIGRATION,
+ /* migration device state channel */
+ EPOLL_TYPE_DEVICE_STATE,
/* TCP_REPAIR helper listening socket */
EPOLL_TYPE_REPAIR_LISTEN,
/* TCP_REPAIR helper socket */
diff --git a/migrate.c b/migrate.c
index 0e60475..fc6a043 100644
--- a/migrate.c
+++ b/migrate.c
@@ -21,6 +21,7 @@
#include "inany.h"
#include "flow.h"
#include "flow_table.h"
+#include "repair.h"
#include "migrate.h"
@@ -160,7 +161,7 @@ static void migrate_source_post(struct ctx *c, struct migrate_meta *m)
*
* Return: 0 on success, positive error code on failure
*/
-int migrate_source(struct ctx *c, int fd)
+static int migrate_source(struct ctx *c, int fd)
{
struct migrate_meta m;
int rc;
@@ -305,7 +306,7 @@ static void migrate_target_post(struct ctx *c, struct migrate_meta *m)
*
* Return: 0 on success, positive error code on failure
*/
-int migrate_target(struct ctx *c, int fd)
+static int migrate_target(struct ctx *c, int fd)
{
struct migrate_meta m;
int rc;
@@ -331,3 +332,90 @@ int migrate_target(struct ctx *c, int fd)
return rc;
}
+
+/**
+ * set_migration_watch() - Add the migration file descriptor to epoll
+ * @c: Execution context
+ * @fd: File descriptor to add
+ * @target: Are we the target of the migration?
+ */
+static void set_migration_watch(const struct ctx *c, int fd, bool target)
+{
+ union epoll_ref ref = {
+ .type = EPOLL_TYPE_DEVICE_STATE,
+ .fd = fd,
+ };
+ struct epoll_event ev = { 0 };
+
+ ev.data.u64 = ref.u64;
+ ev.events = target ? EPOLLIN : EPOLLOUT;
+
+ epoll_ctl(c->epollfd, EPOLL_CTL_ADD, ref.fd, &ev);
+}
+
+/**
+ * migrate_init() - Set up things necessary for migration
+ * @c: Execution context
+ */
+void migrate_init(struct ctx *c)
+{
+ c->device_state_fd = -1;
+ c->device_state_result = -1;
+ repair_sock_init(c);
+}
+
+/**
+ * migrate_close() - Close migration channel
+ * @c: Execution context
+ */
+void migrate_close(struct ctx *c)
+{
+ if (c->device_state_fd != -1) {
+ debug("Closing migration channel, fd: %d", c->device_state_fd);
+ epoll_del(c, c->device_state_fd);
+ close(c->device_state_fd);
+ c->device_state_fd = -1;
+ c->device_state_result = -1;
+ }
+}
+
+/**
+ * migrate_request() - Request a migration of device state
+ * @c: Execution context
+ * @fd: fd to transfer state
+ * @target: Are we the target of the migration?
+ */
+void migrate_request(struct ctx *c, int fd, bool target)
+{
+ debug("Migration requested, fd: %d", c->device_state_fd);
+
+ if (c->device_state_fd != -1)
+ migrate_close(c);
+
+ c->device_state_fd = fd;
+ set_migration_watch(c, c->device_state_fd, target);
+
+}
+
+/**
+ * migrate_handler() - Send/receive passt internal state to/from QEMU
+ * @c: Execution context
+ * @events: epoll events
+ */
+void migrate_handler(struct ctx *c, uint32_t events)
+{
+ int rc = EIO;
+
+ debug("migrate_handler fd %d events %x", c->device_state_fd, events);
+
+ if (events & EPOLLOUT)
+ rc = migrate_source(c, c->device_state_fd);
+ else if (events & EPOLLIN)
+ rc = migrate_target(c, c->device_state_fd);
+
+ /* EPOLLHUP without EPOLLIN/EPOLLOUT, or EPOLLERR? Migration failed */
+
+ migrate_close(c);
+
+ c->device_state_result = rc;
+}
diff --git a/migrate.h b/migrate.h
index 21de70d..a222c48 100644
--- a/migrate.h
+++ b/migrate.h
@@ -76,7 +76,9 @@ struct migrate_target_handlers {
struct migrate_handler *post;
};
-int migrate_source(struct ctx *c, int fd);
-int migrate_target(struct ctx *c, int fd);
+void migrate_init(struct ctx *c);
+void migrate_close(struct ctx *c);
+void migrate_request(struct ctx *c, int fd, bool target);
+void migrate_handler(struct ctx *c, uint32_t events);
#endif /* MIGRATE_H */
diff --git a/passt.c b/passt.c
index 1fa2ddd..3c3a331 100644
--- a/passt.c
+++ b/passt.c
@@ -76,7 +76,7 @@ char *epoll_type_str[] = {
[EPOLL_TYPE_TAP_LISTEN] = "listening qemu socket",
[EPOLL_TYPE_VHOST_CMD] = "vhost-user command socket",
[EPOLL_TYPE_VHOST_KICK] = "vhost-user kick socket",
- [EPOLL_TYPE_VHOST_MIGRATION] = "vhost-user migration socket",
+ [EPOLL_TYPE_DEVICE_STATE] = "migration device state channel",
[EPOLL_TYPE_REPAIR_LISTEN] = "TCP_REPAIR helper listening socket",
[EPOLL_TYPE_REPAIR] = "TCP_REPAIR helper socket",
};
@@ -360,8 +360,8 @@ loop:
case EPOLL_TYPE_VHOST_KICK:
vu_kick_cb(c.vdev, ref, &now);
break;
- case EPOLL_TYPE_VHOST_MIGRATION:
- vu_migrate(&c, eventmask);
+ case EPOLL_TYPE_DEVICE_STATE:
+ migrate_handler(&c, eventmask);
break;
case EPOLL_TYPE_REPAIR_LISTEN:
repair_listen_handler(&c, eventmask);
diff --git a/passt.h b/passt.h
index 85b0a10..5992cbe 100644
--- a/passt.h
+++ b/passt.h
@@ -239,6 +239,8 @@ struct ip6_ctx {
* @low_wmem: Low probed net.core.wmem_max
* @low_rmem: Low probed net.core.rmem_max
* @vdev: vhost-user device
+ * @device_state_fd: Device state migration channel
+ * @device_state_result: Device state migration result
*/
struct ctx {
enum passt_modes mode;
@@ -307,6 +309,10 @@ struct ctx {
int low_rmem;
struct vu_dev *vdev;
+
+ /* Migration */
+ int device_state_fd;
+ int device_state_result;
};
void proto_update_l2_buf(const unsigned char *eth_d,
diff --git a/vhost_user.c b/vhost_user.c
index 5df29c4..2dde405 100644
--- a/vhost_user.c
+++ b/vhost_user.c
@@ -44,7 +44,6 @@
#include "tap.h"
#include "vhost_user.h"
#include "pcap.h"
-#include "repair.h"
/* vhost-user version we are compatible with */
#define VHOST_USER_VERSION 1
@@ -998,36 +997,6 @@ static bool vu_send_rarp_exec(struct vu_dev *vdev,
return false;
}
-/**
- * vu_set_migration_watch() - Add the migration file descriptor to epoll
- * @vdev: vhost-user device
- * @fd: File descriptor to add
- * @direction: Direction of the migration (save or load backend state)
- */
-static void vu_set_migration_watch(const struct vu_dev *vdev, int fd,
- uint32_t direction)
-{
- union epoll_ref ref = {
- .type = EPOLL_TYPE_VHOST_MIGRATION,
- .fd = fd,
- };
- struct epoll_event ev = { 0 };
-
- ev.data.u64 = ref.u64;
- switch (direction) {
- case VHOST_USER_TRANSFER_STATE_DIRECTION_SAVE:
- ev.events = EPOLLOUT;
- break;
- case VHOST_USER_TRANSFER_STATE_DIRECTION_LOAD:
- ev.events = EPOLLIN;
- break;
- default:
- ASSERT(0);
- }
-
- epoll_ctl(vdev->context->epollfd, EPOLL_CTL_ADD, ref.fd, &ev);
-}
-
/**
* vu_set_device_state_fd_exec() - Set the device state migration channel
* @vdev: vhost-user device
@@ -1052,16 +1021,8 @@ static bool vu_set_device_state_fd_exec(struct vu_dev *vdev,
direction != VHOST_USER_TRANSFER_STATE_DIRECTION_LOAD)
die("Invalide device_state_fd direction: %d", direction);
- if (vdev->device_state_fd != -1) {
- epoll_del(vdev->context, vdev->device_state_fd);
- close(vdev->device_state_fd);
- }
-
- vdev->device_state_fd = msg->fds[0];
- vdev->device_state_result = -1;
- vu_set_migration_watch(vdev, vdev->device_state_fd, direction);
-
- debug("Got device_state_fd: %d", vdev->device_state_fd);
+ migrate_request(vdev->context, msg->fds[0],
+ direction == VHOST_USER_TRANSFER_STATE_DIRECTION_LOAD);
/* We don't provide a new fd for the data transfer */
vmsg_set_reply_u64(msg, VHOST_USER_VRING_NOFD_MASK);
@@ -1079,9 +1040,7 @@ static bool vu_set_device_state_fd_exec(struct vu_dev *vdev,
static bool vu_check_device_state_exec(struct vu_dev *vdev,
struct vhost_user_msg *msg)
{
- (void)vdev;
-
- vmsg_set_reply_u64(msg, vdev->device_state_result);
+ vmsg_set_reply_u64(msg, vdev->context->device_state_result);
return true;
}
@@ -1108,9 +1067,7 @@ void vu_init(struct ctx *c)
c->vdev->log_table = NULL;
c->vdev->log_call_fd = -1;
- c->vdev->device_state_fd = -1;
- c->vdev->device_state_result = -1;
- repair_sock_init(c);
+ migrate_init(c);
}
@@ -1160,12 +1117,8 @@ void vu_cleanup(struct vu_dev *vdev)
vu_close_log(vdev);
- if (vdev->device_state_fd != -1) {
- epoll_del(vdev->context, vdev->device_state_fd);
- close(vdev->device_state_fd);
- vdev->device_state_fd = -1;
- vdev->device_state_result = -1;
- }
+ /* If we lose the VU dev, we also lose our migration channel */
+ migrate_close(vdev->context);
}
/**
diff --git a/virtio.h b/virtio.h
index 7bef2d2..0a59441 100644
--- a/virtio.h
+++ b/virtio.h
@@ -106,8 +106,6 @@ struct vu_dev_region {
* @log_call_fd: Eventfd to report logging update
* @log_size: Size of the logging memory region
* @log_table: Base of the logging memory region
- * @device_state_fd: Device state migration channel
- * @device_state_result: Device state migration result
*/
struct vu_dev {
struct ctx *context;
@@ -119,8 +117,6 @@ struct vu_dev {
int log_call_fd;
uint64_t log_size;
uint8_t *log_table;
- int device_state_fd;
- int device_state_result;
};
/**
diff --git a/vu_common.c b/vu_common.c
index 4797ef9..78d1c1b 100644
--- a/vu_common.c
+++ b/vu_common.c
@@ -305,30 +305,3 @@ err:
return -1;
}
-
-/**
- * vu_migrate() - Send/receive passt internal state to/from QEMU
- * @c: Execution context
- * @events: epoll events
- */
-void vu_migrate(struct ctx *c, uint32_t events)
-{
- struct vu_dev *vdev = c->vdev;
- int rc = EIO;
-
- debug("vu_migrate fd %d events %x", vdev->device_state_fd, events);
-
- if (events & EPOLLOUT)
- rc = migrate_source(c, vdev->device_state_fd);
- else if (events & EPOLLIN)
- rc = migrate_target(c, vdev->device_state_fd);
-
- /* EPOLLHUP without EPOLLIN/EPOLLOUT, or EPOLLERR? Migration failed */
-
- vdev->device_state_result = rc;
-
- epoll_ctl(c->epollfd, EPOLL_CTL_DEL, vdev->device_state_fd, NULL);
- debug("Closing migration channel");
- close(vdev->device_state_fd);
- vdev->device_state_fd = -1;
-}
diff --git a/vu_common.h b/vu_common.h
index 69c4006..f538f23 100644
--- a/vu_common.h
+++ b/vu_common.h
@@ -57,5 +57,5 @@ void vu_flush(const struct vu_dev *vdev, struct vu_virtq *vq,
void vu_kick_cb(struct vu_dev *vdev, union epoll_ref ref,
const struct timespec *now);
int vu_send_single(const struct ctx *c, const void *buf, size_t size);
-void vu_migrate(struct ctx *c, uint32_t events);
+
#endif /* VU_COMMON_H */
--
@@ -57,5 +57,5 @@ void vu_flush(const struct vu_dev *vdev, struct vu_virtq *vq,
void vu_kick_cb(struct vu_dev *vdev, union epoll_ref ref,
const struct timespec *now);
int vu_send_single(const struct ctx *c, const void *buf, size_t size);
-void vu_migrate(struct ctx *c, uint32_t events);
+
#endif /* VU_COMMON_H */
--
2.43.0
^ permalink raw reply related [flat|nested] 21+ messages in thread
* [PATCH v3 14/20] migrate: Don't handle the migration channel through epoll
2025-01-31 19:39 [PATCH v3 00/20] Draft, incomplete series introducing state migration Stefano Brivio
` (12 preceding siblings ...)
2025-01-31 19:39 ` [PATCH v3 13/20] migrate: Make more handling common rather than vhost-user specific Stefano Brivio
@ 2025-01-31 19:39 ` Stefano Brivio
2025-01-31 19:39 ` [PATCH v3 15/20] flow, flow_table: Export declaration of hash table Stefano Brivio
` (5 subsequent siblings)
19 siblings, 0 replies; 21+ messages in thread
From: Stefano Brivio @ 2025-01-31 19:39 UTC (permalink / raw)
To: passt-dev; +Cc: Laurent Vivier, David Gibson
From: David Gibson <david@gibson.dropbear.id.au>
Currently, once a migration device state fd is assigned, we wait for
EPOLLIN or EPOLLOUT events on it to actually perform the migration. Change
it so that once a migration is requested it we complete it synchronously
at the end of the current epoll cycle. This has several advantages:
1. It makes it clear that everything about the migration must be dealt
with at once, not split between multiple epoll events on the channel
2. It ensures the migration always takes place between epoll cycles,
rather than, for example, between handling TCP events and their
deferred handling in post_handler().
3. It reduces code setting up the epoll watch on the fd.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
epoll_type.h | 2 --
migrate.c | 44 +++++++++++---------------------------------
migrate.h | 2 +-
passt.c | 6 ++----
passt.h | 2 ++
vu_common.c | 27 +++++++++++++++++++++++++++
6 files changed, 43 insertions(+), 40 deletions(-)
diff --git a/epoll_type.h b/epoll_type.h
index b981d30..7f2a121 100644
--- a/epoll_type.h
+++ b/epoll_type.h
@@ -40,8 +40,6 @@ enum epoll_type {
EPOLL_TYPE_VHOST_CMD,
/* vhost-user kick event socket */
EPOLL_TYPE_VHOST_KICK,
- /* migration device state channel */
- EPOLL_TYPE_DEVICE_STATE,
/* TCP_REPAIR helper listening socket */
EPOLL_TYPE_REPAIR_LISTEN,
/* TCP_REPAIR helper socket */
diff --git a/migrate.c b/migrate.c
index fc6a043..faa7841 100644
--- a/migrate.c
+++ b/migrate.c
@@ -50,7 +50,6 @@ static union migrate_header header = {
/* Data sections for version 1 */
static struct iovec sections_v1[] = {
- { &header, sizeof(header) },
};
/* Set of data versions */
@@ -333,26 +332,6 @@ static int migrate_target(struct ctx *c, int fd)
return rc;
}
-/**
- * set_migration_watch() - Add the migration file descriptor to epoll
- * @c: Execution context
- * @fd: File descriptor to add
- * @target: Are we the target of the migration?
- */
-static void set_migration_watch(const struct ctx *c, int fd, bool target)
-{
- union epoll_ref ref = {
- .type = EPOLL_TYPE_DEVICE_STATE,
- .fd = fd,
- };
- struct epoll_event ev = { 0 };
-
- ev.data.u64 = ref.u64;
- ev.events = target ? EPOLLIN : EPOLLOUT;
-
- epoll_ctl(c->epollfd, EPOLL_CTL_ADD, ref.fd, &ev);
-}
-
/**
* migrate_init() - Set up things necessary for migration
* @c: Execution context
@@ -372,7 +351,6 @@ void migrate_close(struct ctx *c)
{
if (c->device_state_fd != -1) {
debug("Closing migration channel, fd: %d", c->device_state_fd);
- epoll_del(c, c->device_state_fd);
close(c->device_state_fd);
c->device_state_fd = -1;
c->device_state_result = -1;
@@ -393,27 +371,27 @@ void migrate_request(struct ctx *c, int fd, bool target)
migrate_close(c);
c->device_state_fd = fd;
- set_migration_watch(c, c->device_state_fd, target);
-
+ c->migrate_target = target;
}
/**
* migrate_handler() - Send/receive passt internal state to/from QEMU
* @c: Execution context
- * @events: epoll events
*/
-void migrate_handler(struct ctx *c, uint32_t events)
+void migrate_handler(struct ctx *c)
{
- int rc = EIO;
+ int rc;
- debug("migrate_handler fd %d events %x", c->device_state_fd, events);
+ if (c->device_state_fd < 0)
+ return;
- if (events & EPOLLOUT)
- rc = migrate_source(c, c->device_state_fd);
- else if (events & EPOLLIN)
- rc = migrate_target(c, c->device_state_fd);
+ debug("migrate_handler fd %d target %d",
+ c->device_state_fd, c->migrate_target);
- /* EPOLLHUP without EPOLLIN/EPOLLOUT, or EPOLLERR? Migration failed */
+ if (c->migrate_target)
+ rc = migrate_target(c, c->device_state_fd);
+ else
+ rc = migrate_source(c, c->device_state_fd);
migrate_close(c);
diff --git a/migrate.h b/migrate.h
index a222c48..158241f 100644
--- a/migrate.h
+++ b/migrate.h
@@ -79,6 +79,6 @@ struct migrate_target_handlers {
void migrate_init(struct ctx *c);
void migrate_close(struct ctx *c);
void migrate_request(struct ctx *c, int fd, bool target);
-void migrate_handler(struct ctx *c, uint32_t events);
+void migrate_handler(struct ctx *c);
#endif /* MIGRATE_H */
diff --git a/passt.c b/passt.c
index 3c3a331..1938290 100644
--- a/passt.c
+++ b/passt.c
@@ -76,7 +76,6 @@ char *epoll_type_str[] = {
[EPOLL_TYPE_TAP_LISTEN] = "listening qemu socket",
[EPOLL_TYPE_VHOST_CMD] = "vhost-user command socket",
[EPOLL_TYPE_VHOST_KICK] = "vhost-user kick socket",
- [EPOLL_TYPE_DEVICE_STATE] = "migration device state channel",
[EPOLL_TYPE_REPAIR_LISTEN] = "TCP_REPAIR helper listening socket",
[EPOLL_TYPE_REPAIR] = "TCP_REPAIR helper socket",
};
@@ -360,9 +359,6 @@ loop:
case EPOLL_TYPE_VHOST_KICK:
vu_kick_cb(c.vdev, ref, &now);
break;
- case EPOLL_TYPE_DEVICE_STATE:
- migrate_handler(&c, eventmask);
- break;
case EPOLL_TYPE_REPAIR_LISTEN:
repair_listen_handler(&c, eventmask);
break;
@@ -377,5 +373,7 @@ loop:
post_handler(&c, &now);
+ migrate_handler(&c);
+
goto loop;
}
diff --git a/passt.h b/passt.h
index 5992cbe..4189a4a 100644
--- a/passt.h
+++ b/passt.h
@@ -241,6 +241,7 @@ struct ip6_ctx {
* @vdev: vhost-user device
* @device_state_fd: Device state migration channel
* @device_state_result: Device state migration result
+ * @migrate_target: Is this the target for next migration?
*/
struct ctx {
enum passt_modes mode;
@@ -313,6 +314,7 @@ struct ctx {
/* Migration */
int device_state_fd;
int device_state_result;
+ bool migrate_target;
};
void proto_update_l2_buf(const unsigned char *eth_d,
diff --git a/vu_common.c b/vu_common.c
index 78d1c1b..4797ef9 100644
--- a/vu_common.c
+++ b/vu_common.c
@@ -305,3 +305,30 @@ err:
return -1;
}
+
+/**
+ * vu_migrate() - Send/receive passt internal state to/from QEMU
+ * @c: Execution context
+ * @events: epoll events
+ */
+void vu_migrate(struct ctx *c, uint32_t events)
+{
+ struct vu_dev *vdev = c->vdev;
+ int rc = EIO;
+
+ debug("vu_migrate fd %d events %x", vdev->device_state_fd, events);
+
+ if (events & EPOLLOUT)
+ rc = migrate_source(c, vdev->device_state_fd);
+ else if (events & EPOLLIN)
+ rc = migrate_target(c, vdev->device_state_fd);
+
+ /* EPOLLHUP without EPOLLIN/EPOLLOUT, or EPOLLERR? Migration failed */
+
+ vdev->device_state_result = rc;
+
+ epoll_ctl(c->epollfd, EPOLL_CTL_DEL, vdev->device_state_fd, NULL);
+ debug("Closing migration channel");
+ close(vdev->device_state_fd);
+ vdev->device_state_fd = -1;
+}
--
@@ -305,3 +305,30 @@ err:
return -1;
}
+
+/**
+ * vu_migrate() - Send/receive passt internal state to/from QEMU
+ * @c: Execution context
+ * @events: epoll events
+ */
+void vu_migrate(struct ctx *c, uint32_t events)
+{
+ struct vu_dev *vdev = c->vdev;
+ int rc = EIO;
+
+ debug("vu_migrate fd %d events %x", vdev->device_state_fd, events);
+
+ if (events & EPOLLOUT)
+ rc = migrate_source(c, vdev->device_state_fd);
+ else if (events & EPOLLIN)
+ rc = migrate_target(c, vdev->device_state_fd);
+
+ /* EPOLLHUP without EPOLLIN/EPOLLOUT, or EPOLLERR? Migration failed */
+
+ vdev->device_state_result = rc;
+
+ epoll_ctl(c->epollfd, EPOLL_CTL_DEL, vdev->device_state_fd, NULL);
+ debug("Closing migration channel");
+ close(vdev->device_state_fd);
+ vdev->device_state_fd = -1;
+}
--
2.43.0
^ permalink raw reply related [flat|nested] 21+ messages in thread
* [PATCH v3 15/20] flow, flow_table: Export declaration of hash table
2025-01-31 19:39 [PATCH v3 00/20] Draft, incomplete series introducing state migration Stefano Brivio
` (13 preceding siblings ...)
2025-01-31 19:39 ` [PATCH v3 14/20] migrate: Don't handle the migration channel through epoll Stefano Brivio
@ 2025-01-31 19:39 ` Stefano Brivio
2025-01-31 19:39 ` [PATCH v3 16/20] vhost_user: Turn vhost-user message reports to trace() Stefano Brivio
` (4 subsequent siblings)
19 siblings, 0 replies; 21+ messages in thread
From: Stefano Brivio @ 2025-01-31 19:39 UTC (permalink / raw)
To: passt-dev; +Cc: Laurent Vivier, David Gibson
We need this to transfer it, so we can throw away this change soon,
I guess.
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
flow.c | 6 +-----
flow_table.h | 3 +++
2 files changed, 4 insertions(+), 5 deletions(-)
diff --git a/flow.c b/flow.c
index e7148b2..5638ff1 100644
--- a/flow.c
+++ b/flow.c
@@ -110,12 +110,8 @@ unsigned flow_first_free;
union flow flowtab[FLOW_MAX];
static const union flow *flow_new_entry; /* = NULL */
-/* Hash table to index it */
-#define FLOW_HASH_LOAD 70 /* % */
-#define FLOW_HASH_SIZE ((2 * FLOW_MAX * 100 / FLOW_HASH_LOAD))
-
/* Table for lookup from flowside information */
-static flow_sidx_t flow_hashtab[FLOW_HASH_SIZE];
+flow_sidx_t flow_hashtab[FLOW_HASH_SIZE];
static_assert(ARRAY_SIZE(flow_hashtab) >= 2 * FLOW_MAX,
"Safe linear probing requires hash table with more entries than the number of sides in the flow table");
diff --git a/flow_table.h b/flow_table.h
index a85cab5..633805d 100644
--- a/flow_table.h
+++ b/flow_table.h
@@ -49,6 +49,9 @@ static_assert(sizeof(union flow) == 128, "union flow should be 128-byte wide");
/* Global Flow Table */
extern unsigned flow_first_free;
extern union flow flowtab[FLOW_MAX];
+#define FLOW_HASH_LOAD 70 /* % */
+#define FLOW_HASH_SIZE ((2 * FLOW_MAX * 100 / FLOW_HASH_LOAD))
+extern flow_sidx_t flow_hashtab[FLOW_HASH_SIZE];
/**
* flow_foreach_sidei() - 'for' type macro to step through each side of flow
--
@@ -49,6 +49,9 @@ static_assert(sizeof(union flow) == 128, "union flow should be 128-byte wide");
/* Global Flow Table */
extern unsigned flow_first_free;
extern union flow flowtab[FLOW_MAX];
+#define FLOW_HASH_LOAD 70 /* % */
+#define FLOW_HASH_SIZE ((2 * FLOW_MAX * 100 / FLOW_HASH_LOAD))
+extern flow_sidx_t flow_hashtab[FLOW_HASH_SIZE];
/**
* flow_foreach_sidei() - 'for' type macro to step through each side of flow
--
2.43.0
^ permalink raw reply related [flat|nested] 21+ messages in thread
* [PATCH v3 16/20] vhost_user: Turn vhost-user message reports to trace()
2025-01-31 19:39 [PATCH v3 00/20] Draft, incomplete series introducing state migration Stefano Brivio
` (14 preceding siblings ...)
2025-01-31 19:39 ` [PATCH v3 15/20] flow, flow_table: Export declaration of hash table Stefano Brivio
@ 2025-01-31 19:39 ` Stefano Brivio
2025-01-31 19:39 ` [PATCH v3 17/20] vhost_user: Make source quit after reporting migration state Stefano Brivio
` (3 subsequent siblings)
19 siblings, 0 replies; 21+ messages in thread
From: Stefano Brivio @ 2025-01-31 19:39 UTC (permalink / raw)
To: passt-dev; +Cc: Laurent Vivier, David Gibson
Having every vhost-user message printed as part of debug output makes
debugging anything else a bit complicated.
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
vhost_user.c | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/vhost_user.c b/vhost_user.c
index 2dde405..1092387 100644
--- a/vhost_user.c
+++ b/vhost_user.c
@@ -640,8 +640,8 @@ static bool vu_set_vring_num_exec(struct vu_dev *vdev,
unsigned int idx = msg->payload.state.index;
unsigned int num = msg->payload.state.num;
- debug("State.index: %u", idx);
- debug("State.num: %u", num);
+ trace("State.index: %u", idx);
+ trace("State.num: %u", num);
vdev->vq[idx].vring.num = num;
return false;
@@ -1176,11 +1176,11 @@ void vu_control_handler(struct vu_dev *vdev, int fd, uint32_t events)
vu_sock_reset(vdev);
return;
}
- debug("================ Vhost user message ================");
- debug("Request: %s (%d)", vu_request_to_string(msg.hdr.request),
+ trace("================ Vhost user message ================");
+ trace("Request: %s (%d)", vu_request_to_string(msg.hdr.request),
msg.hdr.request);
- debug("Flags: 0x%x", msg.hdr.flags);
- debug("Size: %u", msg.hdr.size);
+ trace("Flags: 0x%x", msg.hdr.flags);
+ trace("Size: %u", msg.hdr.size);
need_reply = msg.hdr.flags & VHOST_USER_NEED_REPLY_MASK;
--
@@ -640,8 +640,8 @@ static bool vu_set_vring_num_exec(struct vu_dev *vdev,
unsigned int idx = msg->payload.state.index;
unsigned int num = msg->payload.state.num;
- debug("State.index: %u", idx);
- debug("State.num: %u", num);
+ trace("State.index: %u", idx);
+ trace("State.num: %u", num);
vdev->vq[idx].vring.num = num;
return false;
@@ -1176,11 +1176,11 @@ void vu_control_handler(struct vu_dev *vdev, int fd, uint32_t events)
vu_sock_reset(vdev);
return;
}
- debug("================ Vhost user message ================");
- debug("Request: %s (%d)", vu_request_to_string(msg.hdr.request),
+ trace("================ Vhost user message ================");
+ trace("Request: %s (%d)", vu_request_to_string(msg.hdr.request),
msg.hdr.request);
- debug("Flags: 0x%x", msg.hdr.flags);
- debug("Size: %u", msg.hdr.size);
+ trace("Flags: 0x%x", msg.hdr.flags);
+ trace("Size: %u", msg.hdr.size);
need_reply = msg.hdr.flags & VHOST_USER_NEED_REPLY_MASK;
--
2.43.0
^ permalink raw reply related [flat|nested] 21+ messages in thread
* [PATCH v3 17/20] vhost_user: Make source quit after reporting migration state
2025-01-31 19:39 [PATCH v3 00/20] Draft, incomplete series introducing state migration Stefano Brivio
` (15 preceding siblings ...)
2025-01-31 19:39 ` [PATCH v3 16/20] vhost_user: Turn vhost-user message reports to trace() Stefano Brivio
@ 2025-01-31 19:39 ` Stefano Brivio
2025-01-31 19:39 ` [PATCH v3 18/20] tcp: Get our socket port using getsockname() when connecting from guest Stefano Brivio
` (2 subsequent siblings)
19 siblings, 0 replies; 21+ messages in thread
From: Stefano Brivio @ 2025-01-31 19:39 UTC (permalink / raw)
To: passt-dev; +Cc: Laurent Vivier, David Gibson
On migration, the source process asks passt-helper to set TCP sockets
in repair mode, dumps the information we need to migrate connections,
and closes them.
At this point, we can't pass them back to passt-helper using
SCM_RIGHTS, because they are closed, from that perspective, and
sendmsg() will give us EBADF. But if we don't clear repair mode, the
port they are bound to will not be available for binding in the
target.
Terminate once we're done with the migration and we reported the
state. This is equivalent to clearing repair mode on the sockets we
just closed.
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
vhost_user.c | 11 +++++++++++
1 file changed, 11 insertions(+)
diff --git a/vhost_user.c b/vhost_user.c
index 1092387..19ede8a 100644
--- a/vhost_user.c
+++ b/vhost_user.c
@@ -997,6 +997,8 @@ static bool vu_send_rarp_exec(struct vu_dev *vdev,
return false;
}
+static bool quit_on_device_state = false;
+
/**
* vu_set_device_state_fd_exec() - Set the device state migration channel
* @vdev: vhost-user device
@@ -1024,6 +1026,9 @@ static bool vu_set_device_state_fd_exec(struct vu_dev *vdev,
migrate_request(vdev->context, msg->fds[0],
direction == VHOST_USER_TRANSFER_STATE_DIRECTION_LOAD);
+ if (direction == VHOST_USER_TRANSFER_STATE_DIRECTION_SAVE)
+ quit_on_device_state = true;
+
/* We don't provide a new fd for the data transfer */
vmsg_set_reply_u64(msg, VHOST_USER_VRING_NOFD_MASK);
@@ -1201,4 +1206,10 @@ void vu_control_handler(struct vu_dev *vdev, int fd, uint32_t events)
if (reply_requested)
vu_send_reply(fd, &msg);
+
+ if (quit_on_device_state &&
+ msg.hdr.request == VHOST_USER_CHECK_DEVICE_STATE) {
+ info("Migration complete, exiting");
+ exit(EXIT_SUCCESS);
+ }
}
--
@@ -997,6 +997,8 @@ static bool vu_send_rarp_exec(struct vu_dev *vdev,
return false;
}
+static bool quit_on_device_state = false;
+
/**
* vu_set_device_state_fd_exec() - Set the device state migration channel
* @vdev: vhost-user device
@@ -1024,6 +1026,9 @@ static bool vu_set_device_state_fd_exec(struct vu_dev *vdev,
migrate_request(vdev->context, msg->fds[0],
direction == VHOST_USER_TRANSFER_STATE_DIRECTION_LOAD);
+ if (direction == VHOST_USER_TRANSFER_STATE_DIRECTION_SAVE)
+ quit_on_device_state = true;
+
/* We don't provide a new fd for the data transfer */
vmsg_set_reply_u64(msg, VHOST_USER_VRING_NOFD_MASK);
@@ -1201,4 +1206,10 @@ void vu_control_handler(struct vu_dev *vdev, int fd, uint32_t events)
if (reply_requested)
vu_send_reply(fd, &msg);
+
+ if (quit_on_device_state &&
+ msg.hdr.request == VHOST_USER_CHECK_DEVICE_STATE) {
+ info("Migration complete, exiting");
+ exit(EXIT_SUCCESS);
+ }
}
--
2.43.0
^ permalink raw reply related [flat|nested] 21+ messages in thread
* [PATCH v3 18/20] tcp: Get our socket port using getsockname() when connecting from guest
2025-01-31 19:39 [PATCH v3 00/20] Draft, incomplete series introducing state migration Stefano Brivio
` (16 preceding siblings ...)
2025-01-31 19:39 ` [PATCH v3 17/20] vhost_user: Make source quit after reporting migration state Stefano Brivio
@ 2025-01-31 19:39 ` Stefano Brivio
2025-01-31 19:39 ` [PATCH v3 19/20] tcp: Add HOSTSIDE(x), HOSTFLOW(x) macros Stefano Brivio
2025-01-31 19:39 ` [PATCH v3 20/20] Implement target side of migration Stefano Brivio
19 siblings, 0 replies; 21+ messages in thread
From: Stefano Brivio @ 2025-01-31 19:39 UTC (permalink / raw)
To: passt-dev; +Cc: Laurent Vivier, David Gibson
For migration only: we need to store 'oport', our socket-side port,
as we establish a connection from the guest, so that we can bind the
same oport as source port in the migration target.
Use getsockname() to fetch that.
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
flow.c | 4 ++--
flow_table.h | 4 ++--
tcp.c | 24 +++++++++++++++++++++++-
3 files changed, 27 insertions(+), 5 deletions(-)
diff --git a/flow.c b/flow.c
index 5638ff1..506cbac 100644
--- a/flow.c
+++ b/flow.c
@@ -411,8 +411,8 @@ const struct flowside *flow_initiate_sa(union flow *flow, uint8_t pif,
*
* Return: pointer to the target flowside information
*/
-const struct flowside *flow_target(const struct ctx *c, union flow *flow,
- uint8_t proto)
+struct flowside *flow_target(const struct ctx *c, union flow *flow,
+ uint8_t proto)
{
char estr[INANY_ADDRSTRLEN], fstr[INANY_ADDRSTRLEN];
struct flow_common *f = &flow->f;
diff --git a/flow_table.h b/flow_table.h
index 633805d..b107107 100644
--- a/flow_table.h
+++ b/flow_table.h
@@ -178,8 +178,8 @@ const struct flowside *flow_target_af(union flow *flow, uint8_t pif,
sa_family_t af,
const void *saddr, in_port_t sport,
const void *daddr, in_port_t dport);
-const struct flowside *flow_target(const struct ctx *c, union flow *flow,
- uint8_t proto);
+struct flowside *flow_target(const struct ctx *c, union flow *flow,
+ uint8_t proto);
union flow *flow_set_type(union flow *flow, enum flow_type type);
#define FLOW_SET_TYPE(flow_, t_, var_) (&flow_set_type((flow_), (t_))->var_)
diff --git a/tcp.c b/tcp.c
index 0bd2a02..4fd405b 100644
--- a/tcp.c
+++ b/tcp.c
@@ -1471,6 +1471,8 @@ static void tcp_bind_outbound(const struct ctx *c,
* @opts: Pointer to start of options
* @optlen: Bytes in options: caller MUST ensure available length
* @now: Current timestamp
+ *
+ * #syscalls:vu getsockname
*/
static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af,
const void *saddr, const void *daddr,
@@ -1479,9 +1481,10 @@ static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af,
{
in_port_t srcport = ntohs(th->source);
in_port_t dstport = ntohs(th->dest);
- const struct flowside *ini, *tgt;
+ const struct flowside *ini;
struct tcp_tap_conn *conn;
union sockaddr_inany sa;
+ struct flowside *tgt;
union flow *flow;
int s = -1, mss;
uint64_t hash;
@@ -1586,6 +1589,25 @@ static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af,
}
tcp_epoll_ctl(c, conn);
+
+ if (c->mode == MODE_VU) { /* To rebind to same oport after migration */
+ if (af == AF_INET) {
+ struct sockaddr_in s_in;
+ socklen_t sl;
+
+ sl = sizeof(s_in);
+ getsockname(s, (struct sockaddr *)&s_in, &sl);
+ tgt->oport = ntohs(s_in.sin_port);
+ } else {
+ struct sockaddr_in6 s_in6;
+ socklen_t sl;
+
+ sl = sizeof(s_in6);
+ getsockname(s, (struct sockaddr *)&s_in6, &sl);
+ tgt->oport = ntohs(s_in6.sin6_port);
+ }
+ }
+
FLOW_ACTIVATE(conn);
return;
--
@@ -1471,6 +1471,8 @@ static void tcp_bind_outbound(const struct ctx *c,
* @opts: Pointer to start of options
* @optlen: Bytes in options: caller MUST ensure available length
* @now: Current timestamp
+ *
+ * #syscalls:vu getsockname
*/
static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af,
const void *saddr, const void *daddr,
@@ -1479,9 +1481,10 @@ static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af,
{
in_port_t srcport = ntohs(th->source);
in_port_t dstport = ntohs(th->dest);
- const struct flowside *ini, *tgt;
+ const struct flowside *ini;
struct tcp_tap_conn *conn;
union sockaddr_inany sa;
+ struct flowside *tgt;
union flow *flow;
int s = -1, mss;
uint64_t hash;
@@ -1586,6 +1589,25 @@ static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af,
}
tcp_epoll_ctl(c, conn);
+
+ if (c->mode == MODE_VU) { /* To rebind to same oport after migration */
+ if (af == AF_INET) {
+ struct sockaddr_in s_in;
+ socklen_t sl;
+
+ sl = sizeof(s_in);
+ getsockname(s, (struct sockaddr *)&s_in, &sl);
+ tgt->oport = ntohs(s_in.sin_port);
+ } else {
+ struct sockaddr_in6 s_in6;
+ socklen_t sl;
+
+ sl = sizeof(s_in6);
+ getsockname(s, (struct sockaddr *)&s_in6, &sl);
+ tgt->oport = ntohs(s_in6.sin6_port);
+ }
+ }
+
FLOW_ACTIVATE(conn);
return;
--
2.43.0
^ permalink raw reply related [flat|nested] 21+ messages in thread
* [PATCH v3 19/20] tcp: Add HOSTSIDE(x), HOSTFLOW(x) macros
2025-01-31 19:39 [PATCH v3 00/20] Draft, incomplete series introducing state migration Stefano Brivio
` (17 preceding siblings ...)
2025-01-31 19:39 ` [PATCH v3 18/20] tcp: Get our socket port using getsockname() when connecting from guest Stefano Brivio
@ 2025-01-31 19:39 ` Stefano Brivio
2025-01-31 19:39 ` [PATCH v3 20/20] Implement target side of migration Stefano Brivio
19 siblings, 0 replies; 21+ messages in thread
From: Stefano Brivio @ 2025-01-31 19:39 UTC (permalink / raw)
To: passt-dev; +Cc: Laurent Vivier, David Gibson
Those are symmetric to TAPSIDE(x)/TAPFLOW(x) and I'll use them in
the next patch to extract 'oport' in order to re-bind sockets to
the original socket-side local port.
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
tcp_internal.h | 10 +++++++---
1 file changed, 7 insertions(+), 3 deletions(-)
diff --git a/tcp_internal.h b/tcp_internal.h
index 94e5780..9cf31f5 100644
--- a/tcp_internal.h
+++ b/tcp_internal.h
@@ -38,9 +38,13 @@
#define OPT_SACK 5
#define OPT_TS 8
-#define TAPSIDE(conn_) ((conn_)->f.pif[1] == PIF_TAP)
-#define TAPFLOW(conn_) (&((conn_)->f.side[TAPSIDE(conn_)]))
-#define TAP_SIDX(conn_) (FLOW_SIDX((conn_), TAPSIDE(conn_)))
+#define TAPSIDE(conn_) ((conn_)->f.pif[1] == PIF_TAP)
+#define TAPFLOW(conn_) (&((conn_)->f.side[TAPSIDE(conn_)]))
+#define TAP_SIDX(conn_) (FLOW_SIDX((conn_), TAPSIDE(conn_)))
+
+#define HOSTSIDE(conn_) ((conn_)->f.pif[1] == PIF_HOST)
+#define HOSTFLOW(conn_) (&((conn_)->f.side[HOSTSIDE(conn_)]))
+#define HOST_SIDX(conn_) (FLOW_SIDX((conn_), TAPSIDE(conn_)))
#define CONN_V4(conn) (!!inany_v4(&TAPFLOW(conn)->oaddr))
#define CONN_V6(conn) (!CONN_V4(conn))
--
@@ -38,9 +38,13 @@
#define OPT_SACK 5
#define OPT_TS 8
-#define TAPSIDE(conn_) ((conn_)->f.pif[1] == PIF_TAP)
-#define TAPFLOW(conn_) (&((conn_)->f.side[TAPSIDE(conn_)]))
-#define TAP_SIDX(conn_) (FLOW_SIDX((conn_), TAPSIDE(conn_)))
+#define TAPSIDE(conn_) ((conn_)->f.pif[1] == PIF_TAP)
+#define TAPFLOW(conn_) (&((conn_)->f.side[TAPSIDE(conn_)]))
+#define TAP_SIDX(conn_) (FLOW_SIDX((conn_), TAPSIDE(conn_)))
+
+#define HOSTSIDE(conn_) ((conn_)->f.pif[1] == PIF_HOST)
+#define HOSTFLOW(conn_) (&((conn_)->f.side[HOSTSIDE(conn_)]))
+#define HOST_SIDX(conn_) (FLOW_SIDX((conn_), TAPSIDE(conn_)))
#define CONN_V4(conn) (!!inany_v4(&TAPFLOW(conn)->oaddr))
#define CONN_V6(conn) (!CONN_V4(conn))
--
2.43.0
^ permalink raw reply related [flat|nested] 21+ messages in thread
* [PATCH v3 20/20] Implement target side of migration
2025-01-31 19:39 [PATCH v3 00/20] Draft, incomplete series introducing state migration Stefano Brivio
` (18 preceding siblings ...)
2025-01-31 19:39 ` [PATCH v3 19/20] tcp: Add HOSTSIDE(x), HOSTFLOW(x) macros Stefano Brivio
@ 2025-01-31 19:39 ` Stefano Brivio
19 siblings, 0 replies; 21+ messages in thread
From: Stefano Brivio @ 2025-01-31 19:39 UTC (permalink / raw)
To: passt-dev; +Cc: Laurent Vivier, David Gibson
It's draft quality, with a number of hacks, and it will need a partial
rewrite. Add:
- flow_migrate_target_post(), to open target-side sockets and bind
them, switch them to repair mode, connect them, and make them leave
repair mode again
- copies of flow table, 'flow_first_free' pointer, related hash table,
and hash secret. The copy of the hash secret shows that the current
declarative approach to data sections has some drawbacks
Change tcp_flow_dump_seq() into tcp_flow_repair_seq(), which can dump
as well as restore sequences (used before connecting sockets).
Once we connect sockets, before we take them out of repair mode, we
need to restore MSS and window scaling information (what would be
determined by TCP options on handshake). I'm using hardcoded values as
we don't have a way to transfer these bits of socket-side information.
Before we turn repair mode off, add sockets to the epoll list and set
up per-socket timerfd descriptors, with initial timer scheduling.
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
flow.c | 46 ++++++++++++++++++-
flow.h | 1 +
migrate.c | 9 ++++
passt.c | 4 ++
passt.h | 2 +
tcp.c | 132 +++++++++++++++++++++++++++++++++++++++++++++++++----
tcp_conn.h | 4 +-
7 files changed, 187 insertions(+), 11 deletions(-)
diff --git a/flow.c b/flow.c
index 506cbac..8fcf8c4 100644
--- a/flow.c
+++ b/flow.c
@@ -907,12 +907,56 @@ int flow_migrate_source_pre(struct ctx *c, struct migrate_meta *m)
i += flow->free.n - 1;
else if (flow->f.state == FLOW_STATE_ACTIVE &&
flow->f.type == FLOW_TCP)
- tcp_flow_dump_seq(c, &flow->tcp);
+ tcp_flow_repair_seq(c, &flow->tcp, false);
}
return 0;
}
+/**
+ * flow_migrate_target_post() - Restore all flows after migration
+ * @c: Execution context
+ * @m: Migration metadata
+ *
+ * Return: 0 on success
+ */
+int flow_migrate_target_post(struct ctx *c, struct migrate_meta *m)
+{
+ unsigned i;
+ int rc;
+
+ (void)m;
+
+ for (i = 0; i < FLOW_MAX; i++) { /* TODO: iterator with skip */
+ union flow *flow = &flowtab[i];
+
+ if (flow->f.state == FLOW_STATE_FREE)
+ i += flow->free.n - 1;
+ else if (flow->f.state == FLOW_STATE_ACTIVE &&
+ flow->f.type == FLOW_TCP)
+ rc = tcp_flow_repair_socket(c, &flow->tcp);
+
+ if (rc)
+ return rc; /* TODO: rollback */
+ }
+
+ repair_flush(c); /* TODO: move to TCP logic */
+
+ for (i = 0; i < FLOW_MAX; i++) { /* TODO: iterator with skip */
+ union flow *flow = &flowtab[i];
+
+ if (flow->f.state == FLOW_STATE_FREE)
+ i += flow->free.n - 1;
+ else if (flow->f.state == FLOW_STATE_ACTIVE &&
+ flow->f.type == FLOW_TCP)
+ tcp_flow_repair_connect(c, &flow->tcp);
+ }
+
+ repair_flush(c); /* TODO: move to TCP logic */
+
+ return 0;
+}
+
/**
* flow_init() - Initialise flow related data structures
*/
diff --git a/flow.h b/flow.h
index ff390a6..43fb507 100644
--- a/flow.h
+++ b/flow.h
@@ -256,6 +256,7 @@ union flow;
void flow_init(void);
void flow_defer_handler(const struct ctx *c, const struct timespec *now);
int flow_migrate_source_pre(struct ctx *c, struct migrate_meta *m);
+int flow_migrate_target_post(struct ctx *c, struct migrate_meta *m);
void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...)
__attribute__((format(printf, 3, 4)));
diff --git a/migrate.c b/migrate.c
index faa7841..d47c44b 100644
--- a/migrate.c
+++ b/migrate.c
@@ -50,6 +50,12 @@ static union migrate_header header = {
/* Data sections for version 1 */
static struct iovec sections_v1[] = {
+ { &header, sizeof(header) },
+ { &flow_first_free, sizeof(flow_first_free) },
+ { flowtab, sizeof(flowtab) },
+ { flow_hashtab, sizeof(flow_hashtab) },
+ { g_hash_secret, sizeof(g_hash_secret) },
+ { 0 },
};
/* Set of data versions */
@@ -78,6 +84,7 @@ struct migrate_handler handlers_target_pre_v1[] = {
/* Handlers to call in target after receiving data with version 1 */
struct migrate_handler handlers_target_post_v1[] = {
+ { flow_migrate_target_post },
{ 0 },
};
@@ -292,6 +299,8 @@ static void migrate_target_post(struct ctx *c, struct migrate_meta *m)
struct migrate_target_handlers *th;
struct migrate_handler *h;
+ memcpy(c->hash_secret, g_hash_secret, sizeof(g_hash_secret));
+
for (th = target_handlers; th->v != m->v && th->v; th++);
for (h = th->post; h->fn; h++)
diff --git a/passt.c b/passt.c
index 1938290..65e9126 100644
--- a/passt.c
+++ b/passt.c
@@ -119,6 +119,8 @@ static void post_handler(struct ctx *c, const struct timespec *now)
ndp_timer(c, now);
}
+uint64_t g_hash_secret[2];
+
/**
* random_init() - Initialise things based on random data
* @c: Execution context
@@ -130,6 +132,8 @@ static void random_init(struct ctx *c)
/* Create secret value for SipHash calculations */
raw_random(&c->hash_secret, sizeof(c->hash_secret));
+ memcpy(g_hash_secret, c->hash_secret, sizeof(g_hash_secret));
+
/* Seed pseudo-RNG for things that need non-cryptographic random */
raw_random(&seed, sizeof(seed));
srandom(seed);
diff --git a/passt.h b/passt.h
index 4189a4a..6010f92 100644
--- a/passt.h
+++ b/passt.h
@@ -317,6 +317,8 @@ struct ctx {
bool migrate_target;
};
+extern uint64_t g_hash_secret[2];
+
void proto_update_l2_buf(const unsigned char *eth_d,
const unsigned char *eth_s);
diff --git a/tcp.c b/tcp.c
index 4fd405b..d45edaf 100644
--- a/tcp.c
+++ b/tcp.c
@@ -887,13 +887,31 @@ int tcp_flow_repair_on(struct ctx *c, const struct tcp_tap_conn *conn)
}
/**
- * tcp_flow_dump_seq() - Dump sequences for send and receive queues
+ * tcp_flow_repair_off() - Clear repair mode for a single TCP flow
* @c: Execution context
* @conn: Pointer to the TCP connection structure
*
* Return: 0 on success, negative error code on failure
*/
-int tcp_flow_dump_seq(struct ctx *c, struct tcp_tap_conn *conn)
+static int tcp_flow_repair_off(struct ctx *c, const struct tcp_tap_conn *conn)
+{
+ int rc = 0;
+
+ if ((rc = repair_set(c, conn->sock, TCP_REPAIR_OFF)))
+ err("Failed to clear TCP_REPAIR");
+
+ return rc;
+}
+
+/**
+ * tcp_flow_repair_seq() - Dump or set sequences for socket queues
+ * @c: Execution context
+ * @conn: Pointer to the TCP connection structure
+ * @set: Set if true, dump if false
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+int tcp_flow_repair_seq(struct ctx *c, struct tcp_tap_conn *conn, bool set)
{
int v, s = conn->sock;
socklen_t vlen;
@@ -902,28 +920,124 @@ int tcp_flow_dump_seq(struct ctx *c, struct tcp_tap_conn *conn)
vlen = sizeof(v);
- v = TCP_SEND_QUEUE;
/* TODO: proper error management and prints */
- if (setsockopt(s, SOL_TCP, TCP_REPAIR_QUEUE, &v, vlen))
- return -errno;
- if (getsockopt(s, SOL_TCP, TCP_QUEUE_SEQ, &conn->sock_seq_snd, &vlen))
+ v = TCP_SEND_QUEUE;
+ if (setsockopt(s, SOL_TCP, TCP_REPAIR_QUEUE, &v, vlen))
return -errno;
- debug("Send queue sequence %u for socket %i", conn->sock_seq_snd, s);
+ if (set) {
+ if (setsockopt(s, SOL_TCP, TCP_QUEUE_SEQ, &conn->sock_seq_snd,
+ vlen))
+ return -errno;
+ debug("Set send queue sequence for socket %i to %u",
+ s, conn->sock_seq_snd);
+ } else {
+ if (getsockopt(s, SOL_TCP, TCP_QUEUE_SEQ, &conn->sock_seq_snd,
+ &vlen))
+ return -errno;
+ debug("Dumped send queue sequence for socket %i: %u",
+ s, conn->sock_seq_snd);
+ }
v = TCP_RECV_QUEUE;
if (setsockopt(s, SOL_TCP, TCP_REPAIR_QUEUE, &v, vlen))
return -errno;
- if (getsockopt(s, SOL_TCP, TCP_QUEUE_SEQ, &conn->sock_seq_rcv, &vlen))
+ if (set) {
+ if (setsockopt(s, SOL_TCP, TCP_QUEUE_SEQ, &conn->sock_seq_rcv,
+ vlen))
+ return -errno;
+ debug("Set receive queue sequence for socket %i to %u",
+ s, conn->sock_seq_rcv);
+ } else {
+ if (getsockopt(s, SOL_TCP, TCP_QUEUE_SEQ, &conn->sock_seq_rcv,
+ &vlen))
+ return -errno;
+ debug("Dumped receive queue sequence for socket %i: %u",
+ s, conn->sock_seq_rcv);
+ }
+
+ return 0;
+}
+
+/**
+ * tcp_flow_repair_socket() - Open and bind socket, request repair mode
+ * @c: Execution context
+ * @conn: Pointer to the TCP connection structure
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+int tcp_flow_repair_socket(struct ctx *c, struct tcp_tap_conn *conn)
+{
+ sa_family_t af = CONN_V4(conn) ? AF_INET : AF_INET6;
+ const struct flowside *sockside = HOSTFLOW(conn);
+ struct sockaddr_in a;
+ int rc;
+
+ a = (struct sockaddr_in){ af, htons(sockside->oport), { 0 }, { 0 } };
+
+ if ((conn->sock = socket(af, SOCK_STREAM, IPPROTO_TCP)) < 0)
+ return -errno;
+
+ /* On the same host, source socket can be in TIME_WAIT */
+ setsockopt(conn->sock, SOL_SOCKET, SO_REUSEADDR,
+ &((int){ 1 }), sizeof(int));
+
+ if (bind(conn->sock, (struct sockaddr *)&a, sizeof(a)) < 0) {
+ close(conn->sock);
+ conn->sock = -1;
return -errno;
+ }
- debug("Receive queue sequence %u for socket %i", conn->sock_seq_rcv, s);
+ rc = tcp_flow_repair_on(c, conn);
+ if (rc) {
+ close(conn->sock);
+ conn->sock = -1;
+ return rc;
+ }
return 0;
}
+/**
+ * tcp_flow_repair_connect() - Connect sockets in repair mode, then turn it off
+ * @c: Execution context
+ * @conn: Pointer to the TCP connection structure
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+int tcp_flow_repair_connect(struct ctx *c, struct tcp_tap_conn *conn)
+{
+ struct flowside *tgt = &conn->f.side[TGTSIDE];
+ struct tcp_repair_opt opts[2];
+
+ tcp_flow_repair_seq(c, conn, true);
+
+ flowside_connect(c, conn->sock, PIF_HOST, tgt);
+
+ /* FIXME: Fetch those with TCP_REPAIR_OPTIONS and store in migration
+ * data. These hardcoded values just happen to be good enough.
+ *
+ * On top of these, to seamlessly restore the window, we also need to
+ * dump and restore struct tcp_repair_window via TCP_REPAIR_WINDOW.
+ */
+ opts[0].opt_code = TCPOPT_WINDOW;
+ opts[0].opt_val = 8 + (8 << 16);
+
+ opts[1].opt_code = TCPOPT_MAXSEG;
+ opts[1].opt_val = 65495;
+
+ setsockopt(conn->sock, SOL_TCP, TCP_REPAIR_OPTIONS,
+ opts, 2 * sizeof(struct tcp_repair_opt));
+
+ conn->in_epoll = 0;
+ conn->timer = -1;
+ tcp_epoll_ctl(c, conn);
+
+ return tcp_flow_repair_off(c, conn);
+}
+
/**
* tcp_fill_header() - Fill the TCP header fields for a given TCP segment.
*
diff --git a/tcp_conn.h b/tcp_conn.h
index 0c3e197..3bf8837 100644
--- a/tcp_conn.h
+++ b/tcp_conn.h
@@ -144,7 +144,9 @@ extern int init_sock_pool6 [TCP_SOCK_POOL_SIZE];
bool tcp_flow_defer(const struct tcp_tap_conn *conn);
int tcp_flow_repair_on(struct ctx *c, const struct tcp_tap_conn *conn);
-int tcp_flow_dump_seq(struct ctx *c, struct tcp_tap_conn *conn);
+int tcp_flow_repair_seq(struct ctx *c, struct tcp_tap_conn *conn, bool set);
+int tcp_flow_repair_socket(struct ctx *c, struct tcp_tap_conn *conn);
+int tcp_flow_repair_connect(struct ctx *c, struct tcp_tap_conn *conn);
bool tcp_splice_flow_defer(struct tcp_splice_conn *conn);
void tcp_splice_timer(const struct ctx *c, struct tcp_splice_conn *conn);
int tcp_conn_pool_sock(int pool[]);
--
@@ -144,7 +144,9 @@ extern int init_sock_pool6 [TCP_SOCK_POOL_SIZE];
bool tcp_flow_defer(const struct tcp_tap_conn *conn);
int tcp_flow_repair_on(struct ctx *c, const struct tcp_tap_conn *conn);
-int tcp_flow_dump_seq(struct ctx *c, struct tcp_tap_conn *conn);
+int tcp_flow_repair_seq(struct ctx *c, struct tcp_tap_conn *conn, bool set);
+int tcp_flow_repair_socket(struct ctx *c, struct tcp_tap_conn *conn);
+int tcp_flow_repair_connect(struct ctx *c, struct tcp_tap_conn *conn);
bool tcp_splice_flow_defer(struct tcp_splice_conn *conn);
void tcp_splice_timer(const struct ctx *c, struct tcp_splice_conn *conn);
int tcp_conn_pool_sock(int pool[]);
--
2.43.0
^ permalink raw reply related [flat|nested] 21+ messages in thread