* [PATCH v23 0/4] State migration
@ 2025-02-14 9:08 David Gibson
2025-02-14 9:08 ` [PATCH v23 1/4] flow: Flow table traversing macros David Gibson
` (3 more replies)
0 siblings, 4 replies; 5+ messages in thread
From: David Gibson @ 2025-02-14 9:08 UTC (permalink / raw)
To: Stefano Brivio, passt-dev; +Cc: David Gibson
Brain dead today, so I didn't make much progress on bugs. In particular,
migrate/iperf3_bidir6 still fails for me with an EAGAIN when trying to
restore the sndq (the sent-but-not-yet-acked part of the send queue, in repair mode).
I did, however, do a substantial restructure of how we handle flow
migration, which should make it easier to tweak our "pre" and "post"
steps for each flow. I *think* disabling/re-enabling nonblocking mode
on our sockets in the new freeze/thaw handlers will fix the EAGAIN
problem, but I ran out of time before trying it.
David Gibson (2):
flow: Flow table traversing macros
flow, migrate: Flow migration skeleton
Stefano Brivio (2):
migrate: Migrate TCP flows
test: Add migration tests
contrib/selinux/passt.te | 4 +-
flow.c | 309 +++++++++++-
flow.h | 13 +
flow_table.h | 52 ++
migrate.c | 47 +-
migrate.h | 2 +
passt.c | 6 +-
repair.c | 1 -
tcp.c | 918 ++++++++++++++++++++++++++++++++++++
tcp_conn.h | 99 ++++
test/lib/layout | 55 ++-
test/lib/setup | 138 +++++-
test/lib/test | 48 ++
test/migrate/basic | 59 +++
test/migrate/bidirectional | 64 +++
test/migrate/iperf3_bidir6 | 58 +++
test/migrate/iperf3_in4 | 50 ++
test/migrate/iperf3_in6 | 58 +++
test/migrate/iperf3_out4 | 50 ++
test/migrate/iperf3_out6 | 58 +++
test/migrate/rampstream_in | 10 +-
test/migrate/rampstream_out | 6 +-
test/run | 34 +-
23 files changed, 2108 insertions(+), 31 deletions(-)
create mode 100644 test/migrate/basic
create mode 100644 test/migrate/bidirectional
create mode 100644 test/migrate/iperf3_bidir6
create mode 100644 test/migrate/iperf3_in4
create mode 100644 test/migrate/iperf3_in6
create mode 100644 test/migrate/iperf3_out4
create mode 100644 test/migrate/iperf3_out6
--
2.48.1
^ permalink raw reply [flat|nested] 5+ messages in thread
* [PATCH v23 1/4] flow: Flow table traversing macros
2025-02-14 9:08 [PATCH v23 0/4] State migration David Gibson
@ 2025-02-14 9:08 ` David Gibson
2025-02-14 9:08 ` [PATCH v23 2/4] flow, migrate: Flow migration skeleton David Gibson
` (2 subsequent siblings)
3 siblings, 0 replies; 5+ messages in thread
From: David Gibson @ 2025-02-14 9:08 UTC (permalink / raw)
To: Stefano Brivio, passt-dev; +Cc: David Gibson
The migration code adds more places that need to iterate through the flow
table. Introduce some macros to make that easier.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
flow.c | 14 +++++++-------
flow_table.h | 31 +++++++++++++++++++++++++++++++
2 files changed, 38 insertions(+), 7 deletions(-)
diff --git a/flow.c b/flow.c
index 3ac551bd..d9b888ce 100644
--- a/flow.c
+++ b/flow.c
@@ -771,7 +771,7 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now)
struct flow_free_cluster *free_head = NULL;
unsigned *last_next = &flow_first_free;
bool timer = false;
- unsigned idx;
+ union flow *flow;
if (timespec_diff_ms(now, &flow_timer_run) >= FLOW_TIMER_INTERVAL) {
timer = true;
@@ -780,8 +780,7 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now)
ASSERT(!flow_new_entry); /* Incomplete flow at end of cycle */
- for (idx = 0; idx < FLOW_MAX; idx++) {
- union flow *flow = &flowtab[idx];
+ flow_foreach_slot(flow) {
bool closed = false;
switch (flow->f.state) {
@@ -798,12 +797,12 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now)
} else {
/* New free cluster, add to chain */
free_head = &flow->free;
- *last_next = idx;
+ *last_next = FLOW_IDX(flow);
last_next = &free_head->next;
}
/* Skip remaining empty entries */
- idx += skip - 1;
+ flow += skip - 1;
continue;
}
@@ -856,14 +855,15 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now)
if (free_head) {
/* Add slot to current free cluster */
- ASSERT(idx == FLOW_IDX(free_head) + free_head->n);
+ ASSERT(FLOW_IDX(flow) ==
+ FLOW_IDX(free_head) + free_head->n);
free_head->n++;
flow->free.n = flow->free.next = 0;
} else {
/* Create new free cluster */
free_head = &flow->free;
free_head->n = 1;
- *last_next = idx;
+ *last_next = FLOW_IDX(flow);
last_next = &free_head->next;
}
} else {
diff --git a/flow_table.h b/flow_table.h
index 9a2ff24a..6ff6d0f6 100644
--- a/flow_table.h
+++ b/flow_table.h
@@ -74,6 +74,37 @@ static inline unsigned flow_idx(const struct flow_common *f)
*/
#define FLOW(idx) (&flowtab[(idx)])
+/**
+ * flow_foreach_slot() - 'for' type macro to step through every flow slot
+ * @flow_: Points to each flow entry in order, including free slots
+ */
+#define flow_foreach_slot(flow_) \
+ for ((flow_) = flowtab; FLOW_IDX(flow_) < FLOW_MAX; (flow_)++)
+
+/**
+ * flow_foreach() - 'for' type macro to step through every active flow
+ * @flow_: Iteration variable (union flow *), points to each active flow in order
+ */
+#define flow_foreach(flow_) \
+ flow_foreach_slot(flow_) \
+ if ((flow_)->f.state == FLOW_STATE_FREE) { \
+ (flow_) += (flow_)->free.n - 1; \
+ } else if ((flow_)->f.state != FLOW_STATE_ACTIVE) { \
+ flow_err((flow_), "BUG: Traversing non-active flow"); \
+ continue; \
+ } else
+
+/**
+ * flow_foreach_of_type() - step through active flows of one type
+ * @flow_: Points to each active flow in order
+ * @type_: Flow type to select
+ */
+#define flow_foreach_of_type(flow_, type_) \
+ flow_foreach(flow_) \
+ if ((flow_)->f.type != (type_)) \
+ continue; \
+ else
+
/** flow_at_sidx() - Flow entry for a given sidx
* @sidx: Flow & side index
*
--
@@ -74,6 +74,37 @@ static inline unsigned flow_idx(const struct flow_common *f)
*/
#define FLOW(idx) (&flowtab[(idx)])
+/**
+ * flow_foreach_slot() - 'for' type macro to step through every flow slot
+ * @flow_: Points to each flow entry in order, including free slots
+ */
+#define flow_foreach_slot(flow_) \
+ for ((flow_) = flowtab; FLOW_IDX(flow_) < FLOW_MAX; (flow_)++)
+
+/**
+ * flow_foreach() - 'for' type macro to step through every active flow
+ * @flow_: Iteration variable (union flow *), points to each active flow in order
+ */
+#define flow_foreach(flow_) \
+ flow_foreach_slot(flow_) \
+ if ((flow_)->f.state == FLOW_STATE_FREE) { \
+ (flow_) += (flow_)->free.n - 1; \
+ } else if ((flow_)->f.state != FLOW_STATE_ACTIVE) { \
+ flow_err((flow_), "BUG: Traversing non-active flow"); \
+ continue; \
+ } else
+
+/**
+ * flow_foreach_of_type() - step through active flows of one type
+ * @flow_: Points to each active flow in order
+ * @type_: Flow type to select
+ */
+#define flow_foreach_of_type(flow_, type_) \
+ flow_foreach(flow_) \
+ if ((flow_)->f.type != (type_)) \
+ continue; \
+ else
+
/** flow_at_sidx() - Flow entry for a given sidx
* @sidx: Flow & side index
*
--
2.48.1
^ permalink raw reply related [flat|nested] 5+ messages in thread
* [PATCH v23 2/4] flow, migrate: Flow migration skeleton
2025-02-14 9:08 [PATCH v23 0/4] State migration David Gibson
2025-02-14 9:08 ` [PATCH v23 1/4] flow: Flow table traversing macros David Gibson
@ 2025-02-14 9:08 ` David Gibson
2025-02-14 9:08 ` [PATCH v23 3/4] migrate: Migrate TCP flows David Gibson
2025-02-14 9:08 ` [PATCH v23 4/4] test: Add migration tests David Gibson
3 siblings, 0 replies; 5+ messages in thread
From: David Gibson @ 2025-02-14 9:08 UTC (permalink / raw)
To: Stefano Brivio, passt-dev; +Cc: David Gibson
Implement an outline of how to transfer the state of individual flows
during migration. We split the handling into three stages:
1) freeze
This selects which flows are eligible to be migrated, marking them with a
new flow state 'MIGRATING'. It makes any preparations for the flow to be
migrated, but writes no data to the state stream.
If migration fails, we roll back by executing step (3) on the source
instead of on the target.
2) transfer
This actually transfers data about the flows. Flows are created on the
target in MIGRATING state.
3) thaw
This "activates" the transferred flows, letting them resume normal
operation, but reads no additional data from the state stream. If this
fails for a flow, generally that flow is terminated, but we otherwise
carry on.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
flow.c | 173 +++++++++++++++++++++++++++++++++++++++++++++++++++
flow.h | 5 ++
flow_table.h | 20 ++++++
migrate.c | 38 ++++++++---
migrate.h | 2 +
5 files changed, 231 insertions(+), 7 deletions(-)
diff --git a/flow.c b/flow.c
index d9b888ce..197cc116 100644
--- a/flow.c
+++ b/flow.c
@@ -27,6 +27,7 @@ const char *flow_state_str[] = {
[FLOW_STATE_TGT] = "TGT",
[FLOW_STATE_TYPED] = "TYPED",
[FLOW_STATE_ACTIVE] = "ACTIVE",
+ [FLOW_STATE_MIGRATING] = "MIGRATING",
};
static_assert(ARRAY_SIZE(flow_state_str) == FLOW_NUM_STATES,
"flow_state_str[] doesn't match enum flow_state");
@@ -874,6 +875,178 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now)
*last_next = FLOW_MAX;
}
+/**
+ * flow_freeze() - Select and prepare flows for migration
+ * @c: Execution context
+ * @stage: Migration stage information, unused
+ * @fd: Migration file descriptor, unused
+ *
+ * MUST NOT READ OR WRITE MIGRATION STREAM
+ *
+ * Return: 0 on success, positive error code on failure
+ */
+int flow_freeze(struct ctx *c, const struct migrate_stage *stage, int fd)
+{
+ union flow *flow;
+
+ (void)stage;
+ (void)fd;
+ (void)c;
+
+ flow_foreach(flow) {
+ /* rc == 0 : not a migration candidate
+ * rc > 0 : migration candidate
+ * rc < 0 : error, fail migration
+ */
+ int rc;
+
+ switch (flow->f.type) {
+ default:
+ /* Otherwise assume it doesn't migrate */
+ rc = 0;
+ }
+
+ if (rc < 0) {
+ /* FIXME: rollback */
+ return -rc;
+ }
+ if (rc > 0)
+ flow_set_state(&flow->f, FLOW_STATE_MIGRATING);
+ }
+
+ return 0;
+}
+
+/**
+ * flow_thaw() - Resume flow operation after migration
+ * @c: Execution context
+ * @stage: Migration stage information, unused
+ * @fd: Migration file descriptor, unused
+ *
+ * MUST NOT READ OR WRITE MIGRATION STREAM
+ *
+ * Return: 0 on success, positive error code on failure
+ */
+int flow_thaw(struct ctx *c, const struct migrate_stage *stage, int fd)
+{
+ struct flow_free_cluster *free_head = NULL;
+ unsigned *last_next = &flow_first_free;
+ union flow *flow;
+
+ (void)stage;
+ (void)fd;
+ (void)c;
+
+ /* FIXME: Share logic with flow_defer_handler to rebuild free list */
+ flow_foreach_slot(flow) {
+ int rc;
+
+ if (flow->f.state == FLOW_STATE_FREE) {
+ unsigned skip = flow->free.n;
+
+ /* First entry of a free cluster must have n >= 1 */
+ ASSERT(skip);
+
+ if (free_head) {
+ /* Merge into preceding free cluster */
+ free_head->n += flow->free.n;
+ flow->free.n = flow->free.next = 0;
+ } else {
+ /* New free cluster, add to chain */
+ free_head = &flow->free;
+ *last_next = FLOW_IDX(flow);
+ last_next = &free_head->next;
+ }
+
+ /* Skip remaining empty entries */
+ flow += skip - 1;
+ continue;
+ }
+
+ if (flow->f.state == FLOW_STATE_ACTIVE) {
+ free_head = NULL;
+ continue;
+ }
+
+ ASSERT(flow->f.state == FLOW_STATE_MIGRATING);
+
+ rc = 0;
+ switch (flow->f.type) {
+ default:
+ /* Bug. We marked a flow as migrating, but we don't
+ * know how to resume it */
+ ASSERT(0);
+ }
+
+ if (rc == 0) {
+ /* Successfully resumed flow */
+ flow_set_state(&flow->f, FLOW_STATE_ACTIVE);
+ free_head = NULL;
+ continue;
+ }
+
+ flow_err(flow, "Failed to unfreeze resume flow: %s",
+ strerror_(-rc));
+
+ flow_set_state(&flow->f, FLOW_STATE_FREE);
+ memset(flow, 0, sizeof(*flow));
+
+ if (free_head) {
+ /* Add slot to current free cluster */
+ ASSERT(FLOW_IDX(flow) ==
+ FLOW_IDX(free_head) + free_head->n);
+ free_head->n++;
+ flow->free.n = flow->free.next = 0;
+ } else {
+ /* Create new free cluster */
+ free_head = &flow->free;
+ free_head->n = 1;
+ *last_next = FLOW_IDX(flow);
+ last_next = &free_head->next;
+ }
+ }
+
+ *last_next = FLOW_MAX;
+
+ return 0;
+}
+
+/**
+ * flow_migrate_source() - Transfer migrating flows to device state stream
+ * @c: Execution context
+ * @stage: Migration stage information, unused
+ * @fd: Migration file descriptor
+ *
+ * Return: 0 on success, positive error code on failure
+ */
+int flow_migrate_source(struct ctx *c, const struct migrate_stage *stage, int fd)
+{
+ (void)c;
+ (void)stage;
+ (void)fd;
+
+ /* FIXME: todo */
+ return ENOTSUP;
+}
+
+/**
+ * flow_migrate_target() - Build flows from device state stream
+ * @c: Execution context
+ * @stage: Migration stage information, unused
+ * @fd: Migration file descriptor
+ *
+ * Return: 0 on success, positive error code on failure
+ */
+int flow_migrate_target(struct ctx *c, const struct migrate_stage *stage, int fd)
+{
+ (void)c;
+ (void)stage;
+ (void)fd;
+
+ /* FIXME: todo */
+ return ENOTSUP;
+}
+
/**
* flow_init() - Initialise flow related data structures
*/
diff --git a/flow.h b/flow.h
index 24ba3ef0..deb70eb1 100644
--- a/flow.h
+++ b/flow.h
@@ -82,6 +82,10 @@
* 'true' from flow type specific deferred or timer handler
* Caveats:
* - flow_alloc_cancel() may not be called on it
+ *
+ * MIGRATING - A flow in the process of live migration
+ * Caveats:
+ * - Must only exist during live migration process
*/
enum flow_state {
FLOW_STATE_FREE,
@@ -90,6 +94,7 @@ enum flow_state {
FLOW_STATE_TGT,
FLOW_STATE_TYPED,
FLOW_STATE_ACTIVE,
+ FLOW_STATE_MIGRATING,
FLOW_NUM_STATES,
};
diff --git a/flow_table.h b/flow_table.h
index 6ff6d0f6..bf7ee2ea 100644
--- a/flow_table.h
+++ b/flow_table.h
@@ -94,6 +94,21 @@ static inline unsigned flow_idx(const struct flow_common *f)
continue; \
} else
+/**
+ * flow_foreach_migrating() - step through migrating flows
+ * @flow_: Iteration variable (union flow *), points to each migrating flow in order
+ */
+#define flow_foreach_migrating(flow_) \
+ flow_foreach_slot(flow_) \
+ if ((flow_)->f.state == FLOW_STATE_FREE) { \
+ (flow_) += (flow_)->free.n - 1; \
+ } else if ((flow_)->f.state == FLOW_STATE_ACTIVE) { \
+ continue; \
+ } else if ((flow_)->f.state != FLOW_STATE_MIGRATING) { \
+ flow_err((flow_), "BUG: Traversing non-active flow"); \
+ continue; \
+ } else
+
/**
* flow_foreach_of_type() - step through active flows of one type
* @flow_: Points to each active flow in order
@@ -209,4 +224,9 @@ void flow_activate(struct flow_common *f);
#define FLOW_ACTIVATE(flow_) \
(flow_activate(&(flow_)->f))
+int flow_freeze(struct ctx *c, const struct migrate_stage *stage, int fd);
+int flow_thaw(struct ctx *c, const struct migrate_stage *stage, int fd);
+int flow_migrate_source(struct ctx *c, const struct migrate_stage *stage, int fd);
+int flow_migrate_target(struct ctx *c, const struct migrate_stage *stage, int fd);
+
#endif /* FLOW_TABLE_H */
diff --git a/migrate.c b/migrate.c
index 1c590160..ca53494c 100644
--- a/migrate.c
+++ b/migrate.c
@@ -98,11 +98,27 @@ static int seen_addrs_target_v1(struct ctx *c,
/* Stages for version 1 */
static const struct migrate_stage stages_v1[] = {
+ {
+ .name = "freeze flows",
+ .source = flow_freeze,
+ .rollback = flow_thaw,
+ .target = NULL,
+ },
{
.name = "observed addresses",
.source = seen_addrs_source_v1,
.target = seen_addrs_target_v1,
},
+ {
+ .name = "transfer flows",
+ .source = flow_migrate_source,
+ .target = flow_migrate_target,
+ },
+ {
+ .name = "thaw flows",
+ .source = NULL,
+ .target = flow_thaw,
+ },
{ 0 },
};
@@ -145,14 +161,22 @@ static int migrate_source(struct ctx *c, int fd)
debug("Source side migration stage: %s", s->name);
- if ((ret = s->source(c, s, fd))) {
- err("Source migration stage: %s: %s, abort", s->name,
- strerror_(ret));
- return ret;
- }
+ if ((ret = s->source(c, s, fd)))
+ goto rollback;
}
return 0;
+
+rollback:
+ err("Source migration stage: %s: %s, aborting", s->name,
+ strerror_(ret));
+
+ while (s-- > v->s) {
+ if (s->rollback(c, s, fd))
+ die("Unable to roll back migration");
+ }
+
+ return ret;
}
/**
@@ -214,9 +238,9 @@ static int migrate_target(struct ctx *c, int fd)
debug("Target side migration stage: %s", s->name);
if ((ret = s->target(c, s, fd))) {
- err("Target migration stage: %s: %s, abort", s->name,
+ /* FIXME: Less brutal failure */
+ die("Target migration stage: %s: %s, abort", s->name,
strerror_(ret));
- return ret;
}
}
diff --git a/migrate.h b/migrate.h
index 2c51cd97..eca65226 100644
--- a/migrate.h
+++ b/migrate.h
@@ -23,11 +23,13 @@ struct migrate_header {
* struct migrate_stage - Callbacks and parameters for one stage of migration
* @name: Stage name (for debugging)
* @source: Callback to implement this stage on the source
+ * @rollback: Callback to roll back this stage on the source
* @target: Callback to implement this stage on the target
*/
struct migrate_stage {
const char *name;
int (*source)(struct ctx *c, const struct migrate_stage *stage, int fd);
+ int (*rollback)(struct ctx *c, const struct migrate_stage *stage, int fd);
int (*target)(struct ctx *c, const struct migrate_stage *stage, int fd);
/* Add here separate rollback callbacks if needed */
--
@@ -23,11 +23,13 @@ struct migrate_header {
* struct migrate_stage - Callbacks and parameters for one stage of migration
* @name: Stage name (for debugging)
* @source: Callback to implement this stage on the source
+ * @rollback: Callback to roll back this stage on the source
* @target: Callback to implement this stage on the target
*/
struct migrate_stage {
const char *name;
int (*source)(struct ctx *c, const struct migrate_stage *stage, int fd);
+ int (*rollback)(struct ctx *c, const struct migrate_stage *stage, int fd);
int (*target)(struct ctx *c, const struct migrate_stage *stage, int fd);
/* Add here separate rollback callbacks if needed */
--
2.48.1
^ permalink raw reply related [flat|nested] 5+ messages in thread
* [PATCH v23 3/4] migrate: Migrate TCP flows
2025-02-14 9:08 [PATCH v23 0/4] State migration David Gibson
2025-02-14 9:08 ` [PATCH v23 1/4] flow: Flow table traversing macros David Gibson
2025-02-14 9:08 ` [PATCH v23 2/4] flow, migrate: Flow migration skeleton David Gibson
@ 2025-02-14 9:08 ` David Gibson
2025-02-14 9:08 ` [PATCH v23 4/4] test: Add migration tests David Gibson
3 siblings, 0 replies; 5+ messages in thread
From: David Gibson @ 2025-02-14 9:08 UTC (permalink / raw)
To: Stefano Brivio, passt-dev; +Cc: David Gibson
From: Stefano Brivio <sbrivio@redhat.com>
This implements flow preparation on the source, transfer of data with
a format roughly inspired by struct tcp_tap_conn, and flow insertion
on the target, with all the appropriate window options, window
scaling, MSS, etc.
The target side is rather convoluted because we first need to create
sockets and switch them to repair mode, before we can apply options
that are *not* stored in the flow table. However, we don't want to
request repair mode for sockets one by one. So we need to do this in
several steps.
[dwg: Assorted cleanups]
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
contrib/selinux/passt.te | 4 +-
flow.c | 168 ++++++-
flow.h | 8 +
flow_table.h | 1 +
migrate.c | 9 +
passt.c | 6 +-
repair.c | 1 -
tcp.c | 918 +++++++++++++++++++++++++++++++++++++++
tcp_conn.h | 99 +++++
9 files changed, 1185 insertions(+), 29 deletions(-)
diff --git a/contrib/selinux/passt.te b/contrib/selinux/passt.te
index c6cea34f..3eb11e68 100644
--- a/contrib/selinux/passt.te
+++ b/contrib/selinux/passt.te
@@ -38,7 +38,7 @@ require {
type net_conf_t;
type proc_net_t;
type node_t;
- class tcp_socket { create accept listen name_bind name_connect };
+ class tcp_socket { create accept listen name_bind name_connect getattr };
class udp_socket { create accept listen };
class icmp_socket { bind create name_bind node_bind setopt read write };
class sock_file { create unlink write };
@@ -119,7 +119,7 @@ corenet_udp_sendrecv_all_ports(passt_t)
allow passt_t node_t:icmp_socket { name_bind node_bind };
allow passt_t port_t:icmp_socket name_bind;
-allow passt_t self:tcp_socket { create getopt setopt connect bind listen accept shutdown read write };
+allow passt_t self:tcp_socket { create getopt setopt connect bind listen accept shutdown read write getattr };
allow passt_t self:udp_socket { create getopt setopt connect bind read write };
allow passt_t self:icmp_socket { bind create setopt read write };
diff --git a/flow.c b/flow.c
index 197cc116..8f5fbfe1 100644
--- a/flow.c
+++ b/flow.c
@@ -19,6 +19,7 @@
#include "inany.h"
#include "flow.h"
#include "flow_table.h"
+#include "repair.h"
const char *flow_state_str[] = {
[FLOW_STATE_FREE] = "FREE",
@@ -875,6 +876,23 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now)
*last_next = FLOW_MAX;
}
+/**
+ * flow_alloc_migrate() - Allocate a new flow to be migrated in
+ *
+ * Return: pointer to an unused flow entry, or NULL if the table is full
+ */
+union flow *flow_alloc_migrate(void)
+{
+ union flow *flow = flow_alloc();
+
+ if (!flow)
+ return NULL;
+
+ flow_set_state(&flow->f, FLOW_STATE_MIGRATING);
+ flow_new_entry = NULL;
+ return flow;
+}
+
/**
* flow_freeze() - Select and prepare flows for migration
* @c: Execution context
@@ -888,19 +906,21 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now)
int flow_freeze(struct ctx *c, const struct migrate_stage *stage, int fd)
{
union flow *flow;
+ int rc;
(void)stage;
(void)fd;
- (void)c;
flow_foreach(flow) {
/* rc == 0 : not a migration candidate
* rc > 0 : migration candidate
* rc < 0 : error, fail migration
*/
- int rc;
switch (flow->f.type) {
+ case FLOW_TCP:
+ rc = tcp_freeze(c, &flow->tcp);
+ break;
default:
/* Otherwise assume it doesn't migrate */
rc = 0;
@@ -914,6 +934,13 @@ int flow_freeze(struct ctx *c, const struct migrate_stage *stage, int fd)
flow_set_state(&flow->f, FLOW_STATE_MIGRATING);
}
+ if ((rc = repair_flush(c))) {
+ debug("Can't enable repair mode: %s", strerror_(-rc));
+ if (flow_thaw(c, stage, fd))
+ die("Unable to roll back migration");
+ return rc;
+ }
+
return 0;
}
@@ -932,15 +959,13 @@ int flow_thaw(struct ctx *c, const struct migrate_stage *stage, int fd)
struct flow_free_cluster *free_head = NULL;
unsigned *last_next = &flow_first_free;
union flow *flow;
+ int rc;
(void)stage;
(void)fd;
- (void)c;
/* FIXME: Share logic with flow_defer_handler to rebuild free list */
flow_foreach_slot(flow) {
- int rc;
-
if (flow->f.state == FLOW_STATE_FREE) {
unsigned skip = flow->free.n;
@@ -970,23 +995,31 @@ int flow_thaw(struct ctx *c, const struct migrate_stage *stage, int fd)
ASSERT(flow->f.state == FLOW_STATE_MIGRATING);
- rc = 0;
+ /* rc > 0 : migration completed successfully
+ * rc == 0 : migration failed, clear flow
+ * rc < 0 : unrecoverable error, fail migration
+ */
switch (flow->f.type) {
+ case FLOW_TCP:
+ rc = tcp_thaw(c, &flow->tcp);
+ break;
default:
/* Bug. We marked a flow as migrating, but we don't
* know how to resume it */
ASSERT(0);
}
- if (rc == 0) {
+ if (rc < 0)
+ die("Unrecoverable migration error");
+
+ if (rc > 0) {
/* Successfully resumed flow */
flow_set_state(&flow->f, FLOW_STATE_ACTIVE);
free_head = NULL;
continue;
}
- flow_err(flow, "Failed to unfreeze resume flow: %s",
- strerror_(-rc));
+ flow_err(flow, "Failed to thaw flow");
flow_set_state(&flow->f, FLOW_STATE_FREE);
memset(flow, 0, sizeof(*flow));
@@ -1007,44 +1040,133 @@ int flow_thaw(struct ctx *c, const struct migrate_stage *stage, int fd)
}
*last_next = FLOW_MAX;
+
+ if ((rc = repair_flush(c))) {
+ debug("Can't disable repair mode: %s", strerror_(-rc));
+ return rc;
+ }
return 0;
}
/**
- * flow_migrate_source() - Transfer migrating flows to device state stream
- * @c: Execution context
- * @stage: Migration stage information, unused
+ * flow_migrate_source() - Dump all the remaining information and send data
+ * @c: Execution context (unused)
+ * @stage: Migration stage information (unused)
* @fd: Migration file descriptor
*
* Return: 0 on success, positive error code on failure
*/
-int flow_migrate_source(struct ctx *c, const struct migrate_stage *stage, int fd)
+int flow_migrate_source(struct ctx *c, const struct migrate_stage *stage,
+ int fd)
{
+ uint32_t count = 0;
+ union flow *flow;
+ int rc;
+
(void)c;
(void)stage;
- (void)fd;
- /* FIXME: todo */
- return ENOTSUP;
+ flow_foreach_migrating(flow)
+ count++;
+
+ count = htonl(count);
+ if (write_all_buf(fd, &count, sizeof(count))) {
+ rc = errno;
+ err_perror("Can't send flow count (%u)", ntohl(count));
+ return rc;
+ }
+
+ debug("Sending %u flows", ntohl(count));
+
+ /* Dump and send information that can be stored in the flow table */
+ flow_foreach_migrating(flow) {
+ switch (flow->f.type) {
+ case FLOW_TCP:
+ if ((rc = tcp_flow_migrate_source(fd, &flow->tcp))) {
+ flow_err(flow, "Can't send data: %s", strerror_(-rc));
+ return -rc;
+ }
+ break;
+ default:
+ /* Bug. Flow marked for migration, but we don't know how */
+ ASSERT(0);
+ }
+ }
+
+ /* And then "extended" data (including window data we saved previously):
+ * the target needs to set repair mode on sockets before it can set
+ * this stuff, but it needs sockets (and flows) for that.
+ *
+ * This also closes sockets so that the target can start connecting
+ * theirs: you can't sendmsg() to queues (using the socket) if the
+ * socket is not connected (EPIPE), not even in repair mode. And the
+ * target needs to restore queues now because we're sending the data.
+ *
+ * So, no rollback here, just try as hard as we can.
+ */
+ flow_foreach_migrating(flow) {
+ switch (flow->f.type) {
+ case FLOW_TCP:
+ if ((rc = tcp_flow_migrate_source_ext(fd, &flow->tcp))) {
+ flow_err(flow, "Extended data: %s", strerror_(-rc));
+ return rc;
+ }
+ break;
+ default:
+ /* Bug. Flow marked for migration, but we don't know how */
+ ASSERT(0);
+ }
+ }
+
+ return 0;
}
/**
- * flow_migrate_target() - Build flows from device state stream
+ * flow_migrate_target() - Receive flows and insert in flow table
* @c: Execution context
- * @stage: Migration stage information, unused
+ * @stage: Migration stage information (unused)
* @fd: Migration file descriptor
*
* Return: 0 on success, positive error code on failure
*/
-int flow_migrate_target(struct ctx *c, const struct migrate_stage *stage, int fd)
+int flow_migrate_target(struct ctx *c, const struct migrate_stage *stage,
+ int fd)
{
- (void)c;
+ uint32_t count;
+ unsigned i;
+ int rc;
+
(void)stage;
- (void)fd;
- /* FIXME: todo */
- return ENOTSUP;
+ if (read_all_buf(fd, &count, sizeof(count)))
+ return errno;
+
+ count = ntohl(count);
+ debug("Receiving %u flows", count);
+
+ /* TODO: flow header with type, instead? */
+ for (i = 0; i < count; i++) {
+ rc = tcp_flow_migrate_target(c, fd);
+ if (rc) {
+ debug("Bad target data for flow %u: %s, abort",
+ i, strerror_(-rc));
+ return -rc;
+ }
+ }
+
+ repair_flush(c);
+
+ for (i = 0; i < count; i++) {
+ rc = tcp_flow_migrate_target_ext(c, flowtab + i, fd);
+ if (rc) {
+ debug("Bad target extended data for flow %u: %s, abort",
+ i, strerror_(-rc));
+ return -rc;
+ }
+ }
+
+ return 0;
}
/**
diff --git a/flow.h b/flow.h
index deb70eb1..eb81bbc7 100644
--- a/flow.h
+++ b/flow.h
@@ -254,6 +254,14 @@ union flow;
void flow_init(void);
void flow_defer_handler(const struct ctx *c, const struct timespec *now);
+int flow_migrate_source_early(struct ctx *c, const struct migrate_stage *stage,
+ int fd);
+int flow_migrate_source_pre(struct ctx *c, const struct migrate_stage *stage,
+ int fd);
+int flow_migrate_source(struct ctx *c, const struct migrate_stage *stage,
+ int fd);
+int flow_migrate_target(struct ctx *c, const struct migrate_stage *stage,
+ int fd);
void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...)
__attribute__((format(printf, 3, 4)));
diff --git a/flow_table.h b/flow_table.h
index bf7ee2ea..32638484 100644
--- a/flow_table.h
+++ b/flow_table.h
@@ -224,6 +224,7 @@ void flow_activate(struct flow_common *f);
#define FLOW_ACTIVATE(flow_) \
(flow_activate(&(flow_)->f))
+union flow *flow_alloc_migrate(void);
int flow_freeze(struct ctx *c, const struct migrate_stage *stage, int fd);
int flow_thaw(struct ctx *c, const struct migrate_stage *stage, int fd);
int flow_migrate_source(struct ctx *c, const struct migrate_stage *stage, int fd);
diff --git a/migrate.c b/migrate.c
index ca53494c..0e420ca7 100644
--- a/migrate.c
+++ b/migrate.c
@@ -104,6 +104,15 @@ static const struct migrate_stage stages_v1[] = {
.rollback = flow_thaw,
.target = NULL,
},
+ /* FIXME: With this step, close() in tcp_flow_migrate_source_ext()
+ * *sometimes* closes the connection for real.
+ */
+/* {
+ .name = "shrink TCP windows",
+ .source = flow_migrate_source_early,
+ .target = NULL,
+ },
+*/
{
.name = "observed addresses",
.source = seen_addrs_source_v1,
diff --git a/passt.c b/passt.c
index 6f9fb4d9..68d1a283 100644
--- a/passt.c
+++ b/passt.c
@@ -223,9 +223,6 @@ int main(int argc, char **argv)
if (sigaction(SIGCHLD, &sa, NULL))
die_perror("Couldn't install signal handlers");
- if (signal(SIGPIPE, SIG_IGN) == SIG_ERR)
- die_perror("Couldn't set disposition for SIGPIPE");
-
c.mode = MODE_PASTA;
} else if (strstr(name, "passt")) {
c.mode = MODE_PASST;
@@ -233,6 +230,9 @@ int main(int argc, char **argv)
_exit(EXIT_FAILURE);
}
+ if (signal(SIGPIPE, SIG_IGN) == SIG_ERR)
+ die_perror("Couldn't set disposition for SIGPIPE");
+
madvise(pkt_buf, TAP_BUF_BYTES, MADV_HUGEPAGE);
c.epollfd = epoll_create1(EPOLL_CLOEXEC);
diff --git a/repair.c b/repair.c
index d2886173..c2e04501 100644
--- a/repair.c
+++ b/repair.c
@@ -197,7 +197,6 @@ int repair_flush(struct ctx *c)
*
* Return: 0 on success, negative error code on failure
*/
-/* cppcheck-suppress unusedFunction */
int repair_set(struct ctx *c, int s, int cmd)
{
int rc;
diff --git a/tcp.c b/tcp.c
index b978b30d..6a57e617 100644
--- a/tcp.c
+++ b/tcp.c
@@ -280,6 +280,7 @@
#include <stddef.h>
#include <string.h>
#include <sys/epoll.h>
+#include <sys/ioctl.h>
#include <sys/socket.h>
#include <sys/timerfd.h>
#include <sys/types.h>
@@ -287,6 +288,8 @@
#include <time.h>
#include <arpa/inet.h>
+#include <linux/sockios.h>
+
#include "checksum.h"
#include "util.h"
#include "iov.h"
@@ -299,6 +302,7 @@
#include "log.h"
#include "inany.h"
#include "flow.h"
+#include "repair.h"
#include "linux_dep.h"
#include "flow_table.h"
@@ -326,6 +330,19 @@
((conn)->events & (SOCK_FIN_RCVD | TAP_FIN_RCVD)))
#define CONN_HAS(conn, set) (((conn)->events & (set)) == (set))
+/* Buffers to migrate pending data from send and receive queues. No, they don't
+ * use memory if we don't use them. And we're going away after this, so splurge.
+ */
+#define TCP_MIGRATE_SND_QUEUE_MAX (64 << 20)
+#define TCP_MIGRATE_RCV_QUEUE_MAX (64 << 20)
+uint8_t tcp_migrate_snd_queue [TCP_MIGRATE_SND_QUEUE_MAX];
+uint8_t tcp_migrate_rcv_queue [TCP_MIGRATE_RCV_QUEUE_MAX];
+
+#define TCP_MIGRATE_RESTORE_CHUNK_MIN 1024 /* Try smaller when above this */
+
+/* "Extended" data (not stored in the flow table) for TCP flow migration */
+static struct tcp_tap_transfer_ext migrate_ext[FLOW_MAX];
+
static const char *tcp_event_str[] __attribute((__unused__)) = {
"SOCK_ACCEPTED", "TAP_SYN_RCVD", "ESTABLISHED", "TAP_SYN_ACK_SENT",
@@ -1468,6 +1485,7 @@ static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af,
conn->sock = s;
conn->timer = -1;
+ conn->listening_sock = -1;
conn_event(c, conn, TAP_SYN_RCVD);
conn->wnd_to_tap = WINDOW_DEFAULT;
@@ -1968,10 +1986,27 @@ int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
ack_due = 1;
if ((conn->events & TAP_FIN_RCVD) && !(conn->events & SOCK_FIN_SENT)) {
+ socklen_t sl;
+ struct tcp_info tinfo;
+
shutdown(conn->sock, SHUT_WR);
conn_event(c, conn, SOCK_FIN_SENT);
tcp_send_flag(c, conn, ACK);
ack_due = 0;
+
+ /* If we received a FIN, but the socket is in TCP_ESTABLISHED
+ * state, it must be a migrated socket. The kernel saw the FIN
+ * on the source socket, but not on the target socket.
+ *
+ * Approximate the effect of that FIN: as we're sending a FIN
+ * out ourselves, the socket is now in a state equivalent to
+ * LAST_ACK. Now that we sent the FIN out, close it with a RST.
+ */
+ sl = sizeof(tinfo);
+ getsockopt(conn->sock, SOL_TCP, TCP_INFO, &tinfo, &sl);
+ if (tinfo.tcpi_state == TCP_ESTABLISHED &&
+ conn->events & SOCK_FIN_RCVD)
+ goto reset;
}
if (ack_due)
@@ -2054,6 +2089,7 @@ static void tcp_tap_conn_from_sock(const struct ctx *c, union flow *flow,
void tcp_listen_handler(const struct ctx *c, union epoll_ref ref,
const struct timespec *now)
{
+ struct tcp_tap_conn *conn;
union sockaddr_inany sa;
socklen_t sl = sizeof(sa);
struct flowside *ini;
@@ -2069,6 +2105,9 @@ void tcp_listen_handler(const struct ctx *c, union epoll_ref ref,
if (s < 0)
goto cancel;
+ conn = (struct tcp_tap_conn *)flow;
+ conn->listening_sock = ref.fd;
+
tcp_sock_set_nodelay(s);
/* FIXME: If useful: when the listening port has a specific bound
@@ -2634,3 +2673,882 @@ void tcp_timer(struct ctx *c, const struct timespec *now)
if (c->mode == MODE_PASTA)
tcp_splice_refill(c);
}
+
+/**
+ * tcp_freeze() - Prepare TCP flow for migration
+ * @c:		Execution context
+ * @conn:	Pointer to the TCP connection structure
+ *
+ * Flows that aren't in ESTABLISHED state are reported as not migratable. For
+ * the others, SO_PEEK_OFF is disabled and repair mode is requested.
+ *
+ * Return: 1 if migratable, 0 if not migratable, negative error code on failure
+ */
+int tcp_freeze(struct ctx *c, const struct tcp_tap_conn *conn)
+{
+	int ret;
+
+	if (!(conn->events & ESTABLISHED))
+		return 0;
+
+	/* Disable SO_PEEK_OFF, we don't want it for repair mode */
+	if (tcp_set_peek_offset(conn->sock, -1))
+		return -errno;
+
+	ret = repair_set(c, conn->sock, TCP_REPAIR_ON);
+	if (ret) {
+		err("Failed to set TCP_REPAIR");
+		return ret;
+	}
+
+	return 1;
+}
+
+/**
+ * tcp_thaw() - Final resume of flow after migration
+ * @c:		Execution context
+ * @conn:	Pointer to the TCP connection structure
+ *
+ * Switch repair mode off, set SO_PEEK_OFF to the amount of data already sent to
+ * the tap side but not acknowledged yet (seq_to_tap - seq_ack_from_tap), then
+ * send an ACK and try to forward pending socket data.
+ *
+ * If the peek offset can't be set, the connection is reset and nothing else is
+ * sent on it.
+ *
+ * Return: 1 if thawed, 0 if not thawed (connection was reset), negative error
+ *	   code on unrecoverable failure
+ */
+int tcp_thaw(struct ctx *c, struct tcp_tap_conn *conn)
+{
+	uint32_t peek_offset = conn->seq_to_tap - conn->seq_ack_from_tap;
+	int rc;
+
+	/* Might already be done, but that's ok it's idempotent */
+	if ((rc = repair_set(c, conn->sock, TCP_REPAIR_OFF))) {
+		err("Failed to clear TCP_REPAIR");
+		return rc;
+	}
+
+	/* Re-enable SO_PEEK_OFF, when available */
+	if (tcp_set_peek_offset(conn->sock, peek_offset)) {
+		/* Don't ACK or forward data on a connection we just reset */
+		tcp_rst(c, conn);
+		return 0;
+	}
+
+	tcp_send_flag(c, conn, ACK);
+	tcp_data_from_sock(c, conn);
+
+	return 1;
+}
+
+/**
+ * tcp_flow_dump_tinfo() - Dump window scale, tcpi_state, tcpi_options
+ * @s:		Socket
+ * @t:		Extended migration data, updated with values from TCP_INFO
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int tcp_flow_dump_tinfo(int s, struct tcp_tap_transfer_ext *t)
+{
+	struct tcp_info info;
+	socklen_t len = sizeof(info);
+
+	if (getsockopt(s, SOL_TCP, TCP_INFO, &info, &len)) {
+		int saved = -errno;
+
+		err_perror("Querying TCP_INFO, socket %i", s);
+		return saved;
+	}
+
+	t->tcpi_options	= info.tcpi_options;
+	t->tcpi_state	= info.tcpi_state;
+	t->rcv_ws	= info.tcpi_rcv_wscale;
+	t->snd_ws	= info.tcpi_snd_wscale;
+
+	return 0;
+}
+
+/**
+ * tcp_flow_dump_mss() - Dump MSS clamp (not current MSS) via TCP_MAXSEG
+ * @s:		Socket
+ * @t:		Extended migration data, @mss is set on success
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int tcp_flow_dump_mss(int s, struct tcp_tap_transfer_ext *t)
+{
+	socklen_t len = sizeof(t->mss);
+	int ret = 0;
+
+	if (getsockopt(s, SOL_TCP, TCP_MAXSEG, &t->mss, &len)) {
+		ret = -errno;
+		err_perror("Getting MSS, socket %i", s);
+	}
+
+	return ret;
+}
+
+/**
+ * tcp_flow_dump_wnd() - Dump current tcp_repair_window parameters
+ * @s:		Socket
+ * @t:		Extended migration data, window fields are set on success
+ *
+ * This reads @t->tcpi_state, so it must be called after tcp_flow_dump_tinfo().
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int tcp_flow_dump_wnd(int s, struct tcp_tap_transfer_ext *t)
+{
+	struct tcp_repair_window w;
+	socklen_t len = sizeof(w);
+
+	if (getsockopt(s, IPPROTO_TCP, TCP_REPAIR_WINDOW, &w, &len)) {
+		int saved = -errno;
+
+		err_perror("Getting window repair data, socket %i", s);
+		return saved;
+	}
+
+	t->rcv_wup	= w.rcv_wup;
+	t->rcv_wnd	= w.rcv_wnd;
+	t->max_window	= w.max_window;
+	t->snd_wnd	= w.snd_wnd;
+	t->snd_wl1	= w.snd_wl1;
+
+	/* If we received a FIN, we also need to adjust window parameters */
+	switch (t->tcpi_state) {
+	case TCP_CLOSE_WAIT:
+	case TCP_LAST_ACK:
+		t->rcv_wup -= 1;
+		t->rcv_wnd += 1;
+		break;
+	default:
+		break;
+	}
+
+	return 0;
+}
+
+/**
+ * tcp_flow_repair_wnd() - Restore window parameters from extended data
+ * @s:		Socket
+ * @t:		Extended migration data, window fields in host order
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int tcp_flow_repair_wnd(int s, const struct tcp_tap_transfer_ext *t)
+{
+	const struct tcp_repair_window w = {
+		.snd_wl1	= t->snd_wl1,
+		.snd_wnd	= t->snd_wnd,
+		.max_window	= t->max_window,
+		.rcv_wnd	= t->rcv_wnd,
+		.rcv_wup	= t->rcv_wup,
+	};
+	int ret = 0;
+
+	if (setsockopt(s, IPPROTO_TCP, TCP_REPAIR_WINDOW, &w, sizeof(w))) {
+		ret = -errno;
+		err_perror("Setting window data, socket %i", s);
+	}
+
+	return ret;
+}
+
+/**
+ * tcp_flow_select_queue() - Select queue (receive or send) for next operation
+ * @s:		Socket
+ * @queue:	TCP_RECV_QUEUE or TCP_SEND_QUEUE
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int tcp_flow_select_queue(int s, int queue)
+{
+	if (setsockopt(s, SOL_TCP, TCP_REPAIR_QUEUE, &queue, sizeof(queue))) {
+		int rc = -errno;
+
+		/* Report the queue that was actually requested */
+		err_perror("Selecting %s, socket %i",
+			   queue == TCP_RECV_QUEUE ? "TCP_RECV_QUEUE" :
+			   queue == TCP_SEND_QUEUE ? "TCP_SEND_QUEUE" :
+						     "unknown queue",
+			   s);
+		return rc;
+	}
+
+	return 0;
+}
+
+/**
+ * tcp_flow_dump_sndqueue() - Dump send queue, length of sent and not sent data
+ * @s: Socket
+ * @t: Extended migration data
+ *
+ * Sets @t->sndq (SIOCOUTQ) and @t->notsent (SIOCOUTQNSD), both adjusted for a
+ * FIN we sent, and copies @t->sndq bytes of queue content to
+ * tcp_migrate_snd_queue using recv() with MSG_PEEK.
+ *
+ * The caller visible in this file, tcp_flow_migrate_source_ext(), selects
+ * TCP_SEND_QUEUE right before calling this, and calls tcp_flow_dump_tinfo()
+ * earlier, as @t->tcpi_state is needed here.
+ *
+ * Return: 0 on success, negative error code on failure: -EINVAL if the not sent
+ * count exceeds the queue length, -ENOBUFS if the queue doesn't fit in the
+ * buffer, -ENXIO on short read, or the error from a failed system call
+ *
+ * #syscalls:vu ioctl
+ */
+static int tcp_flow_dump_sndqueue(int s, struct tcp_tap_transfer_ext *t)
+{
+ ssize_t rc;
+
+ if (ioctl(s, SIOCOUTQ, &t->sndq) < 0) {
+ rc = -errno;
+ err_perror("Getting send queue size, socket %i", s);
+ return rc;
+ }
+
+ if (ioctl(s, SIOCOUTQNSD, &t->notsent) < 0) {
+ rc = -errno;
+ err_perror("Getting not sent count, socket %i", s);
+ return rc;
+ }
+
+ /* If we sent a FIN, SIOCOUTQ and SIOCOUTQNSD are one greater than the
+ * actual pending queue length, because they are based on the sequence
+ * numbers, not directly on the buffer contents.
+ *
+ * This must be called after tcp_flow_dump_tinfo(), for t->tcpi_state.
+ *
+ * Counters that are already zero are left alone, so they can't wrap.
+ */
+ if (t->tcpi_state == TCP_FIN_WAIT1 || t->tcpi_state == TCP_FIN_WAIT2 ||
+ t->tcpi_state == TCP_LAST_ACK || t->tcpi_state == TCP_CLOSING) {
+ if (t->sndq)
+ t->sndq--;
+ if (t->notsent)
+ t->notsent--;
+ }
+
+ if (t->notsent > t->sndq) {
+ err("Invalid notsent count socket %i, send: %u, not sent: %u",
+ s, t->sndq, t->notsent);
+ return -EINVAL;
+ }
+
+ if (t->sndq > TCP_MIGRATE_SND_QUEUE_MAX) {
+ err("Send queue too large to migrate socket %i: %u bytes",
+ s, t->sndq);
+ return -ENOBUFS;
+ }
+
+ rc = recv(s, tcp_migrate_snd_queue,
+ MIN(t->sndq, TCP_MIGRATE_SND_QUEUE_MAX), MSG_PEEK); /* bound already checked above */
+ if (rc < 0) {
+ if (errno == EAGAIN) { /* EAGAIN means empty */
+ rc = 0;
+ } else {
+ rc = -errno;
+ err_perror("Can't read send queue, socket %i", s);
+ return rc;
+ }
+ }
+
+ if (rc < t->sndq) {
+ err("Short read migrating send queue");
+ return -ENXIO;
+ }
+
+ t->notsent = MIN(t->notsent, t->sndq); /* Already guaranteed by the check above */
+
+ return 0;
+}
+
+/**
+ * tcp_flow_repair_queue() - Restore contents of a given (pre-selected) queue
+ * @s:		Socket
+ * @len:	Length of data to be restored
+ * @buf:	Buffer with content of pending data queue
+ *
+ * Data is written with send(). On ENOBUFS or ENOMEM, the chunk size is halved
+ * and the write is retried, as long as the chunk size is at least
+ * TCP_MIGRATE_RESTORE_CHUNK_MIN.
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int tcp_flow_repair_queue(int s, size_t len, uint8_t *buf)
+{
+	size_t left = len, step = len;
+	uint8_t *pos = buf;
+
+	while (left > 0) {
+		ssize_t sent = send(s, pos, MIN(left, step), 0);
+
+		if (sent >= 0) {
+			pos += sent;
+			left -= sent;
+			continue;
+		}
+
+		/* Out of buffer space or memory: try with smaller writes */
+		if ((errno == ENOBUFS || errno == ENOMEM) &&
+		    step >= TCP_MIGRATE_RESTORE_CHUNK_MIN) {
+			step /= 2;
+			continue;
+		}
+
+		sent = -errno;
+		err_perror("Can't write queue, socket %i", s);
+		return sent;
+	}
+
+	return 0;
+}
+
+/**
+ * tcp_flow_dump_seq() - Dump current sequence of pre-selected queue
+ * @s:		Socket
+ * @v:		Sequence value, set on return
+ *
+ * The value is read with the TCP_QUEUE_SEQ socket option.
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int tcp_flow_dump_seq(int s, uint32_t *v)
+{
+	socklen_t len = sizeof(*v);
+	int ret = 0;
+
+	if (getsockopt(s, SOL_TCP, TCP_QUEUE_SEQ, v, &len)) {
+		ret = -errno;
+		err_perror("Dumping sequence, socket %i", s);
+	}
+
+	return ret;
+}
+
+/**
+ * tcp_flow_repair_seq() - Restore sequence for pre-selected queue
+ * @s:		Socket
+ * @v:		Sequence value to be set
+ *
+ * The value is written with the TCP_QUEUE_SEQ socket option.
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int tcp_flow_repair_seq(int s, const uint32_t *v)
+{
+	int ret = 0;
+
+	if (setsockopt(s, SOL_TCP, TCP_QUEUE_SEQ, v, sizeof(*v))) {
+		ret = -errno;
+		err_perror("Setting sequence, socket %i", s);
+	}
+
+	return ret;
+}
+
+/**
+ * tcp_flow_dump_rcvqueue() - Dump receive queue and its length, seal/block it
+ * @s:		Socket
+ * @t:		Extended migration data
+ * @filled:	Incremented by the bytes we injected in the queue to block it
+ *
+ * Sets @t->rcvq (SIOCINQ, adjusted for a received FIN) and copies that many
+ * bytes of queue content to tcp_migrate_rcv_queue using recv() with MSG_PEEK.
+ * @filled is only ever added to, so the caller needs to initialise it.
+ *
+ * This reads @t->tcpi_state, so it must be called after tcp_flow_dump_tinfo().
+ *
+ * Return: 0 on success, negative error code on failure: -ENOBUFS if the queue
+ *	   doesn't fit in the buffer, -ENXIO on short read, or the error from a
+ *	   failed system call
+ *
+ * #syscalls:vu ioctl
+ */
+static int tcp_flow_dump_rcvqueue(int s, struct tcp_tap_transfer_ext *t,
+				  size_t *filled)
+{
+	ssize_t rc, n;
+
+	if (ioctl(s, SIOCINQ, &t->rcvq) < 0) {
+		rc = -errno;
+		err_perror("Get receive queue size, socket %i", s);
+		return rc;
+	}
+
+	/* Observed race, seemingly hard to reproduce: we dump queue content and
+	 * receive sequence, but more data comes and is acknowledged meanwhile,
+	 * so we lose it. Make sure the queue is full before we dump it, so that
+	 * nothing can be appended.
+	 *
+	 * Note that these send() calls are not atomic, so this is again
+	 * theoretically racy, but apparently not in practice. TODO: Fix this in
+	 * the kernel.
+	 */
+	do {
+		n = send(s, tcp_migrate_rcv_queue, TCP_MIGRATE_RCV_QUEUE_MAX,
+			 0);
+		if (n > 0)
+			*filled += n;
+	} while (n > 0);
+	debug("Filled up receive queue with %zu bytes", *filled);
+
+	/* If we received a FIN, SIOCINQ is one greater than the actual number
+	 * of bytes on the queue, because it's based on the sequence number
+	 * rather than directly on the buffer contents.
+	 *
+	 * This must be called after tcp_flow_dump_tinfo(), for t->tcpi_state.
+	 *
+	 * Don't decrement an empty queue: the unsigned length would wrap and
+	 * the size check below would then fail the migration.
+	 */
+	if (t->rcvq &&
+	    (t->tcpi_state == TCP_CLOSE_WAIT || t->tcpi_state == TCP_LAST_ACK))
+		t->rcvq--;
+
+	if (t->rcvq > TCP_MIGRATE_RCV_QUEUE_MAX) {
+		err("Receive queue too large to migrate socket %i: %u bytes",
+		    s, t->rcvq);
+		return -ENOBUFS;
+	}
+
+	rc = recv(s, tcp_migrate_rcv_queue, t->rcvq, MSG_PEEK);
+	if (rc < 0) {
+		if (errno == EAGAIN) {	/* EAGAIN means empty */
+			rc = 0;
+		} else {
+			rc = -errno;
+			err_perror("Can't read receive queue for socket %i", s);
+			return rc;
+		}
+	}
+
+	if (rc < t->rcvq) {
+		err("Short read migrating receive queue");
+		return -ENXIO;
+	}
+
+	return 0;
+}
+
+/**
+ * tcp_flow_repair_opt() - Set repair "options" (MSS, scale, SACK, timestamps)
+ * @s:		Socket
+ * @t:		Extended migration data
+ *
+ * Window scaling and MSS are always set. SACK-permitted and timestamps are
+ * each set only if the matching TCPI_OPT_* bit is present in @t->tcpi_options,
+ * independently of each other.
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+int tcp_flow_repair_opt(int s, const struct tcp_tap_transfer_ext *t)
+{
+	struct tcp_repair_opt opts[4];
+	unsigned int n = 0;
+
+	opts[n++] = (struct tcp_repair_opt){ TCPOPT_WINDOW,
+					     t->snd_ws + (t->rcv_ws << 16) };
+	opts[n++] = (struct tcp_repair_opt){ TCPOPT_MAXSEG, t->mss };
+
+	/* Add optional entries one by one: counting them positionally would
+	 * pass SACK-permitted instead of timestamps if only timestamps are on
+	 */
+	if (t->tcpi_options & TCPI_OPT_SACK)
+		opts[n++] = (struct tcp_repair_opt){ TCPOPT_SACK_PERMITTED, 0 };
+
+	if (t->tcpi_options & TCPI_OPT_TIMESTAMPS)
+		opts[n++] = (struct tcp_repair_opt){ TCPOPT_TIMESTAMP, 0 };
+
+	if (setsockopt(s, SOL_TCP, TCP_REPAIR_OPTIONS, opts,
+		       n * sizeof(opts[0]))) {
+		int rc = -errno;
+
+		err_perror("Setting repair options, socket %i", s);
+		return rc;
+	}
+
+	return 0;
+}
+
+#if 0
+/**
+ * tcp_flow_migrate_shrink_window() - Dump window data, decrease socket window
+ * @flow: Flow to shrink window for
+ * @conn: Pointer to the TCP connection structure
+ *
+ * Compiled out (#if 0). As written, failures of the socket options are only
+ * logged at debug level, the result of tcp_flow_dump_wnd() is ignored, and 0 is
+ * always returned.
+ *
+ * NOTE(review): only the rcv_wnd field of the local tcp_repair_window is
+ * assigned before it's passed to TCP_REPAIR_WINDOW, the other fields are
+ * uninitialised. This needs fixing before the function is enabled.
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+int tcp_flow_migrate_shrink_window(const union flow *flow,
+ const struct tcp_tap_conn *conn)
+{
+ struct tcp_tap_transfer_ext *t = &migrate_ext[FLOW_IDX(flow)];
+ struct tcp_repair_window wnd;
+ socklen_t sl = sizeof(wnd);
+ int s = conn->sock;
+
+ if (setsockopt(s, SOL_SOCKET, SO_RCVBUF, &((int){ 0 }), sizeof(int)))
+ debug("TCP: failed to set SO_RCVBUF to minimum value");
+
+ /* Dump window data as it is for the target, before touching stuff */
+ tcp_flow_dump_wnd(s, t);
+
+ wnd.rcv_wnd = 0;
+
+ if (setsockopt(s, IPPROTO_TCP, TCP_REPAIR_WINDOW, &wnd, sl))
+ debug_perror("Setting window repair data, socket %i", s);
+
+ return 0;
+}
+#endif
+
+/**
+ * tcp_flow_migrate_source() - Send data (flow table) for flow, close listening
+ * @fd:		Descriptor for state migration
+ * @conn:	Pointer to the TCP connection structure
+ *
+ * Build a struct tcp_tap_transfer from @conn, with multi-byte integer fields in
+ * network order and interface/address data copied as it is, close the
+ * listening socket the connection was accepted from, if any and if still open,
+ * then write the structure to @fd.
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+int tcp_flow_migrate_source(int fd, struct tcp_tap_conn *conn)
+{
+	struct tcp_tap_transfer t = {
+		.retrans		= conn->retrans,
+		.ws_from_tap		= conn->ws_from_tap,
+		.ws_to_tap		= conn->ws_to_tap,
+		.events			= conn->events,
+
+		.tap_mss		= htonl(MSS_GET(conn)),
+
+		.sndbuf			= htonl(conn->sndbuf),
+
+		.flags			= conn->flags,
+		.seq_dup_ack_approx	= conn->seq_dup_ack_approx,
+
+		.wnd_from_tap		= htons(conn->wnd_from_tap),
+		.wnd_to_tap		= htons(conn->wnd_to_tap),
+
+		.seq_to_tap		= htonl(conn->seq_to_tap),
+		.seq_ack_from_tap	= htonl(conn->seq_ack_from_tap),
+		.seq_from_tap		= htonl(conn->seq_from_tap),
+		.seq_ack_to_tap		= htonl(conn->seq_ack_to_tap),
+		.seq_init_from_tap	= htonl(conn->seq_init_from_tap),
+	};
+
+	memcpy(&t.pif, conn->f.pif, sizeof(t.pif));
+	memcpy(&t.side, conn->f.side, sizeof(t.side));
+
+	/* F_GETFD returns the descriptor flags, which can be non-zero (for
+	 * example FD_CLOEXEC) for a perfectly valid descriptor, and -1 if the
+	 * descriptor isn't open: check for failure, not for zero.
+	 *
+	 * NOTE(review): presumably this check exists because several flows can
+	 * refer to the same listening socket, already closed while handling a
+	 * previous flow -- confirm.
+	 */
+	if (conn->listening_sock != -1 &&
+	    fcntl(conn->listening_sock, F_GETFD) != -1)
+		close(conn->listening_sock);
+
+	if (write_all_buf(fd, &t, sizeof(t))) {
+		int rc = -errno;
+
+		err_perror("Can't write migration data, socket %i", conn->sock);
+		return rc;
+	}
+
+	return 0;
+}
+
+/**
+ * tcp_flow_migrate_source_ext() - Dump queues, close sockets, send final data
+ * @fd: Descriptor for state migration
+ * @conn: Pointer to the TCP connection structure
+ *
+ * Collects, in this order: TCP_INFO data, MSS, window parameters, send queue
+ * and send sequence, receive queue and receive sequence. The order matters, as
+ * the window and queue helpers read the tcpi_state stored by
+ * tcp_flow_dump_tinfo(), and queue/sequence helpers act on the queue selected
+ * right before them.
+ *
+ * Once everything is dumped, the socket is closed, sequences are rewound to the
+ * start of the queues, and the extended data followed by send and receive
+ * queue contents is written to @fd. If any dump step fails, this returns before
+ * closing the socket.
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+int tcp_flow_migrate_source_ext(int fd, const struct tcp_tap_conn *conn)
+{
+ struct tcp_tap_transfer_ext *t = &migrate_ext[FLOW_IDX(conn)];
+ size_t seq_rcv_rewind = 0; /* Bytes injected to fill up the receive queue */
+ int s = conn->sock;
+ int rc;
+
+ if ((rc = tcp_flow_dump_tinfo(s, t)))
+ return rc;
+
+ if ((rc = tcp_flow_dump_mss(s, t)))
+ return rc;
+
+ if ((rc = tcp_flow_dump_wnd(s, t)))
+ return rc;
+
+ if ((rc = tcp_flow_select_queue(s, TCP_SEND_QUEUE)))
+ return rc;
+
+ if ((rc = tcp_flow_dump_sndqueue(s, t)))
+ return rc;
+
+ if ((rc = tcp_flow_dump_seq(s, &t->seq_snd)))
+ return rc;
+
+ if ((rc = tcp_flow_select_queue(s, TCP_RECV_QUEUE)))
+ return rc;
+
+ if ((rc = tcp_flow_dump_rcvqueue(s, t, &seq_rcv_rewind)))
+ return rc;
+
+ if ((rc = tcp_flow_dump_seq(s, &t->seq_rcv)))
+ return rc;
+
+ close(s);
+
+ /* Adjustments unrelated to FIN segments: sequence numbers we dumped are
+ * based on the end of the queues. For the receive side, this includes
+ * the filler data we injected to block the queue.
+ */
+ t->seq_rcv -= t->rcvq + seq_rcv_rewind;
+ t->seq_snd -= t->sndq;
+
+ debug("Extended migration data, socket %i sequences send %u receive %u",
+ s, t->seq_snd, t->seq_rcv);
+ debug(" pending queues: send %u not sent %u receive %u",
+ t->sndq, t->notsent, t->rcvq);
+ debug(" window: snd_wl1 %u snd_wnd %u max %u rcv_wnd %u rcv_wup %u",
+ t->snd_wl1, t->snd_wnd, t->max_window, t->rcv_wnd, t->rcv_wup);
+
+ /* Endianness fix-ups. NOTE(review): t->mss is not converted here, while
+ * the structure is documented as network order -- confirm this matches
+ * what the target expects.
+ */
+ t->seq_snd = htonl(t->seq_snd);
+ t->seq_rcv = htonl(t->seq_rcv);
+ t->sndq = htonl(t->sndq);
+ t->notsent = htonl(t->notsent);
+ t->rcvq = htonl(t->rcvq);
+
+ t->snd_wl1 = htonl(t->snd_wl1);
+ t->snd_wnd = htonl(t->snd_wnd);
+ t->max_window = htonl(t->max_window);
+ t->rcv_wnd = htonl(t->rcv_wnd);
+ t->rcv_wup = htonl(t->rcv_wup);
+
+ if (write_all_buf(fd, t, sizeof(*t))) {
+ rc = -errno;
+ err_perror("Failed to write extended data, socket %i", s);
+ return rc;
+ }
+
+ if (write_all_buf(fd, tcp_migrate_snd_queue, ntohl(t->sndq))) { /* length was byte-swapped above */
+ rc = -errno;
+ err_perror("Failed to write send queue data, socket %i", s);
+ return rc;
+ }
+
+ if (write_all_buf(fd, tcp_migrate_rcv_queue, ntohl(t->rcvq))) { /* length was byte-swapped above */
+ rc = -errno;
+ err_perror("Failed to write receive queue data, socket %i", s);
+ return rc;
+ }
+
+ return 0;
+}
+
+/**
+ * tcp_flow_repair_socket() - Open and bind socket, request repair mode
+ * @c:		Execution context
+ * @conn:	Pointer to the TCP connection structure
+ *
+ * On success, @conn->sock is the new socket, bound to the host-side address and
+ * port of the flow, with repair mode requested. If binding or requesting repair
+ * mode fails, the socket is closed and @conn->sock is set to -1.
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+int tcp_flow_repair_socket(struct ctx *c, struct tcp_tap_conn *conn)
+{
+	sa_family_t af = CONN_V4(conn) ? AF_INET : AF_INET6;
+	const struct flowside *sockside = HOSTFLOW(conn);
+	union sockaddr_inany a;
+	socklen_t sl;
+	int s, rc;
+
+	pif_sockaddr(c, &a, &sl, PIF_HOST, &sockside->oaddr, sockside->oport);
+
+	if ((conn->sock = socket(af, SOCK_STREAM | SOCK_NONBLOCK | SOCK_CLOEXEC,
+				 IPPROTO_TCP)) < 0) {
+		rc = -errno;
+		err_perror("Failed to create socket for migrated flow");
+		return rc;
+	}
+	s = conn->sock;
+
+	if (setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &(int){ 1 }, sizeof(int)))
+		debug_perror("Setting SO_REUSEADDR on socket %i", s);
+
+	tcp_sock_set_nodelay(s);
+
+	if (bind(s, &a.sa, sizeof(a))) {
+		/* bind() returns -1, the actual error is in errno: save it
+		 * before logging, which might change it
+		 */
+		rc = -errno;
+		err_perror("Failed to bind socket for migrated flow");
+		goto err;
+	}
+
+	if ((rc = repair_set(c, conn->sock, TCP_REPAIR_ON))) {
+		err("Failed to set TCP_REPAIR on new socket");
+		goto err;
+	}
+
+	return 0;
+
+err:
+	close(s);
+	conn->sock = -1;
+	return rc;
+}
+
+/**
+ * tcp_flow_repair_connect() - Connect socket of migrated flow to its peer
+ * @c:		Execution context
+ * @conn:	Pointer to the TCP connection structure
+ *
+ * Connect @conn->sock to the host-side peer of the flow. The caller in this
+ * file does this while repair mode is still on for the socket. On success, the
+ * connection is marked as not in epoll and without a timer. Repair mode is not
+ * changed here.
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int tcp_flow_repair_connect(const struct ctx *c,
+				   struct tcp_tap_conn *conn)
+{
+	if (flowside_connect(c, conn->sock, PIF_HOST, HOSTFLOW(conn))) {
+		int saved = -errno;
+
+		err_perror("Failed to connect migrated socket %i", conn->sock);
+		return saved;
+	}
+
+	conn->timer = -1;
+	conn->in_epoll = 0;
+
+	return 0;
+}
+
+/**
+ * tcp_flow_migrate_target() - Receive data (flow table part) for flow, insert
+ * @c:		Execution context
+ * @fd:		Descriptor for state migration
+ *
+ * Allocate a flow, read a struct tcp_tap_transfer from @fd, convert its
+ * multi-byte integer fields back to host order, open and bind the socket in
+ * repair mode, then insert the flow in the hash table.
+ *
+ * NOTE(review): on read or socket failure, the flow obtained from
+ * flow_alloc_migrate() is not released here -- confirm the caller deals with
+ * that.
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+int tcp_flow_migrate_target(struct ctx *c, int fd)
+{
+	struct tcp_tap_transfer t;
+	struct tcp_tap_conn *conn;
+	union flow *flow;
+	int rc;
+
+	if (!(flow = flow_alloc_migrate())) {
+		err("Flow table full on migration target");
+		return -ENOMEM;
+	}
+
+	if (read_all_buf(fd, &t, sizeof(t))) {
+		/* Save errno before logging, which might change it */
+		rc = -errno;
+		err_perror("Failed to receive migration data");
+		return rc;
+	}
+
+	memcpy(&flow->f.pif, &t.pif, sizeof(flow->f.pif));
+	memcpy(&flow->f.side, &t.side, sizeof(flow->f.side));
+
+	flow->f.type = FLOW_TCP;
+	conn = &flow->tcp;
+
+	conn->retrans			= t.retrans;
+	conn->ws_from_tap		= t.ws_from_tap;
+	conn->ws_to_tap			= t.ws_to_tap;
+	conn->events			= t.events;
+
+	conn->sndbuf			= ntohl(t.sndbuf);
+
+	conn->flags			= t.flags;
+	conn->seq_dup_ack_approx	= t.seq_dup_ack_approx;
+
+	MSS_SET(conn, ntohl(t.tap_mss));
+
+	conn->wnd_from_tap		= ntohs(t.wnd_from_tap);
+	conn->wnd_to_tap		= ntohs(t.wnd_to_tap);
+
+	conn->seq_to_tap		= ntohl(t.seq_to_tap);
+	conn->seq_ack_from_tap		= ntohl(t.seq_ack_from_tap);
+	conn->seq_from_tap		= ntohl(t.seq_from_tap);
+	conn->seq_ack_to_tap		= ntohl(t.seq_ack_to_tap);
+	conn->seq_init_from_tap		= ntohl(t.seq_init_from_tap);
+
+	if ((rc = tcp_flow_repair_socket(c, conn)))
+		return rc;
+
+	flow_hash_insert(c, TAP_SIDX(conn));
+
+	return 0;
+}
+
+/**
+ * tcp_flow_migrate_target_ext() - Receive extended data for flow, set, connect
+ * @c: Execution context
+ * @flow: Existing flow for this connection data
+ * @fd: Descriptor for state migration
+ *
+ * Reads a struct tcp_tap_transfer_ext and the send and receive queue contents
+ * from @fd, then, with the socket in repair mode: sets send and receive
+ * sequences, connects, restores the receive queue and the already sent part of
+ * the send queue, sets options and window parameters. After that, repair mode
+ * is switched off, and the not yet sent part of the send queue is written as
+ * regular data.
+ *
+ * Queue lengths received from the source are validated against the buffer
+ * sizes before any queue data is read.
+ *
+ * Return: 0 on success, negative code on failure, but 0 on connection reset
+ */
+int tcp_flow_migrate_target_ext(struct ctx *c, union flow *flow, int fd)
+{
+ struct tcp_tap_conn *conn = &flow->tcp;
+ uint32_t peek_offset = conn->seq_to_tap - conn->seq_ack_from_tap; /* only used for the debug message below */
+ struct tcp_tap_transfer_ext t;
+ int s = conn->sock, rc;
+
+ if (read_all_buf(fd, &t, sizeof(t))) {
+ rc = -errno;
+ err_perror("Failed to read extended data for socket %i", s);
+ return rc;
+ }
+
+ /* Endianness fix-ups. NOTE(review): t.mss is used as received, it's not
+ * converted here -- confirm this matches what the source sends.
+ */
+ t.seq_snd = ntohl(t.seq_snd);
+ t.seq_rcv = ntohl(t.seq_rcv);
+ t.sndq = ntohl(t.sndq);
+ t.notsent = ntohl(t.notsent);
+ t.rcvq = ntohl(t.rcvq);
+
+ t.snd_wl1 = ntohl(t.snd_wl1);
+ t.snd_wnd = ntohl(t.snd_wnd);
+ t.max_window = ntohl(t.max_window);
+ t.rcv_wnd = ntohl(t.rcv_wnd);
+ t.rcv_wup = ntohl(t.rcv_wup);
+
+ debug("Extended migration data, socket %i sequences send %u receive %u",
+ s, t.seq_snd, t.seq_rcv);
+ debug(" pending queues: send %u not sent %u receive %u",
+ t.sndq, t.notsent, t.rcvq);
+ debug(" window: snd_wl1 %u snd_wnd %u max %u rcv_wnd %u rcv_wup %u",
+ t.snd_wl1, t.snd_wnd, t.max_window, t.rcv_wnd, t.rcv_wup);
+ debug(" SO_PEEK_OFF %s offset=%"PRIu32,
+ peek_offset_cap ? "enabled" : "disabled", peek_offset);
+
+ if (t.sndq > TCP_MIGRATE_SND_QUEUE_MAX || t.notsent > t.sndq ||
+ t.rcvq > TCP_MIGRATE_RCV_QUEUE_MAX) {
+ err("Bad queues socket %i, send: %u, not sent: %u, receive: %u",
+ s, t.sndq, t.notsent, t.rcvq);
+ return -EINVAL;
+ }
+
+ if (read_all_buf(fd, tcp_migrate_snd_queue, t.sndq)) {
+ rc = -errno;
+ err_perror("Failed to read send queue data, socket %i", s);
+ return rc;
+ }
+
+ if (read_all_buf(fd, tcp_migrate_rcv_queue, t.rcvq)) {
+ rc = -errno;
+ err_perror("Failed to read receive queue data, socket %i", s);
+ return rc;
+ }
+
+ if ((rc = tcp_flow_select_queue(s, TCP_SEND_QUEUE)))
+ return rc;
+
+ if ((rc = tcp_flow_repair_seq(s, &t.seq_snd)))
+ return rc;
+
+ if ((rc = tcp_flow_select_queue(s, TCP_RECV_QUEUE)))
+ return rc;
+
+ if ((rc = tcp_flow_repair_seq(s, &t.seq_rcv)))
+ return rc;
+
+ if ((rc = tcp_flow_repair_connect(c, conn)))
+ return rc;
+
+ if ((rc = tcp_flow_repair_queue(s, t.rcvq, tcp_migrate_rcv_queue))) /* receive queue is still selected */
+ return rc;
+
+ if ((rc = tcp_flow_select_queue(s, TCP_SEND_QUEUE)))
+ return rc;
+
+ if ((rc = tcp_flow_repair_queue(s, t.sndq - t.notsent,
+ tcp_migrate_snd_queue))) /* already sent part only */
+ return rc;
+
+ if ((rc = tcp_flow_repair_opt(s, &t)))
+ return rc;
+
+ /* If we sent a FIN and it was acknowledged (TCP_FIN_WAIT2), don't
+ * send it out, because we already sent it for sure.
+ *
+ * Call shutdown(x, SHUT_WR) in repair mode, so that we move to
+ * FIN_WAIT_1 (tcp_shutdown()) without sending anything
+ * (goto in tcp_write_xmit()).
+ */
+ if (t.tcpi_state == TCP_FIN_WAIT2) {
+ int v;
+
+ v = TCP_SEND_QUEUE;
+ if (setsockopt(s, SOL_TCP, TCP_REPAIR_QUEUE, &v, sizeof(v)))
+ debug_perror("Selecting repair queue, socket %i", s);
+ else
+ shutdown(s, SHUT_WR);
+ }
+
+ if ((rc = tcp_flow_repair_wnd(s, &t)))
+ return rc;
+
+ if ((rc = repair_set(c, conn->sock, TCP_REPAIR_OFF)))
+ return rc;
+ if ((rc = repair_flush(c)))
+ return rc;
+
+ if (t.notsent) {
+ err("socket %i, t.sndq=%u t.notsent=%u", /* NOTE(review): error level, but not a failure */
+ s, t.sndq, t.notsent);
+
+ if (tcp_flow_repair_queue(s, t.notsent,
+ tcp_migrate_snd_queue +
+ (t.sndq - t.notsent))) {
+ /* This sometimes seems to fail for unclear reasons.
+ * Don't fail the whole migration, just reset the flow
+ * and carry on to the next one.
+ */
+ tcp_rst(c, conn);
+ return 0;
+ }
+ }
+
+ /* If we sent a FIN but it wasn't acknowledged yet (TCP_FIN_WAIT1), send
+ * it out, because we don't know if we already sent it.
+ *
+ * Call shutdown(x, SHUT_WR) *not* in repair mode, which moves us to
+ * TCP_FIN_WAIT1.
+ */
+ if (t.tcpi_state == TCP_FIN_WAIT1)
+ shutdown(s, SHUT_WR);
+
+ if ((rc = tcp_epoll_ctl(c, conn))) { /* logged only, not treated as failure */
+ debug("Failed to subscribe to epoll for migrated socket %i: %s",
+ conn->sock, strerror_(-rc));
+ }
+
+ return 0;
+}
diff --git a/tcp_conn.h b/tcp_conn.h
index 8c20805e..1b203f27 100644
--- a/tcp_conn.h
+++ b/tcp_conn.h
@@ -19,6 +19,7 @@
* @tap_mss: MSS advertised by tap/guest, rounded to 2 ^ TCP_MSS_BITS
* @sock: Socket descriptor number
* @events: Connection events, implying connection states
+ * @listening_sock: Listening socket this socket was accept()ed from, or -1
* @timer: timerfd descriptor for timeout events
* @flags: Connection flags representing internal attributes
* @sndbuf: Sending buffer in kernel, rounded to 2 ^ SNDBUF_BITS
@@ -68,6 +69,7 @@ struct tcp_tap_conn {
#define CONN_STATE_BITS /* Setting these clears other flags */ \
(SOCK_ACCEPTED | TAP_SYN_RCVD | ESTABLISHED)
+ int listening_sock;
int timer :FD_REF_BITS;
@@ -96,6 +98,93 @@ struct tcp_tap_conn {
uint32_t seq_init_from_tap;
};
+/**
+ * struct tcp_tap_transfer - Migrated TCP data, flow table part, network order
+ * @pif: Interfaces for each side of the flow
+ * @side: Addresses and ports for each side of the flow
+ * @retrans: Number of retransmissions occurred due to ACK_TIMEOUT
+ * @ws_from_tap: Window scaling factor advertised from tap/guest
+ * @ws_to_tap: Window scaling factor advertised to tap/guest
+ * @events: Connection events, implying connection states
+ * @tap_mss: MSS advertised by tap/guest, rounded to 2 ^ TCP_MSS_BITS
+ * @sndbuf: Sending buffer in kernel, rounded to 2 ^ SNDBUF_BITS
+ * @flags: Connection flags representing internal attributes
+ * @seq_dup_ack_approx: Last duplicate ACK number sent to tap
+ * @wnd_from_tap: Last window size from tap, unscaled (as received)
+ * @wnd_to_tap: Sending window advertised to tap, unscaled (as sent)
+ * @seq_to_tap: Next sequence for packets to tap
+ * @seq_ack_from_tap: Last ACK number received from tap
+ * @seq_from_tap: Next sequence for packets from tap (not actually sent)
+ * @seq_ack_to_tap: Last ACK number sent to tap
+ * @seq_init_from_tap: Initial sequence number from tap
+ *
+ * 16-bit and 32-bit integer fields are converted with htons() / htonl() by
+ * tcp_flow_migrate_source(). @pif and @side are copied from the flow as they
+ * are, with memcpy().
+ */
+struct tcp_tap_transfer {
+ uint8_t pif[SIDES];
+ struct flowside side[SIDES];
+
+ uint8_t retrans;
+ uint8_t ws_from_tap;
+ uint8_t ws_to_tap;
+ uint8_t events;
+
+ uint32_t tap_mss;
+
+ uint32_t sndbuf;
+
+ uint8_t flags;
+ uint8_t seq_dup_ack_approx;
+
+ uint16_t wnd_from_tap;
+ uint16_t wnd_to_tap;
+
+ uint32_t seq_to_tap;
+ uint32_t seq_ack_from_tap;
+ uint32_t seq_from_tap;
+ uint32_t seq_ack_to_tap;
+ uint32_t seq_init_from_tap;
+} __attribute__((packed, aligned(__alignof__(uint32_t))));
+
+/**
+ * struct tcp_tap_transfer_ext - Migrated TCP data, outside flow, network order
+ * @seq_snd: Socket-side send sequence
+ * @seq_rcv: Socket-side receive sequence
+ * @sndq: Length of pending send queue (unacknowledged / not sent)
+ * @notsent: Part of pending send queue that wasn't sent out yet
+ * @rcvq: Length of pending receive queue
+ * @mss: Socket-side MSS clamp
+ * @snd_wl1: Next sequence used in window probe (next sequence - 1)
+ * @snd_wnd: Socket-side sending window
+ * @max_window: Window clamp
+ * @rcv_wnd: Socket-side receive window
+ * @rcv_wup: rcv_nxt on last window update sent
+ * @snd_ws: Window scaling factor, send
+ * @rcv_ws: Window scaling factor, receive
+ * @tcpi_state: Connection state in TCP_INFO style (enum, tcp_states.h)
+ * @tcpi_options: TCPI_OPT_* constants (timestamps, selective ACK)
+ *
+ * On the wire, this structure is followed by @sndq bytes of send queue data,
+ * then @rcvq bytes of receive queue data.
+ *
+ * NOTE(review): unlike the other 32-bit fields, @mss is not byte-swapped by
+ * tcp_flow_migrate_source_ext() and tcp_flow_migrate_target_ext() -- confirm
+ * whether that's intended.
+ */
+struct tcp_tap_transfer_ext {
+ uint32_t seq_snd;
+ uint32_t seq_rcv;
+
+ uint32_t sndq;
+ uint32_t notsent;
+ uint32_t rcvq;
+
+ uint32_t mss;
+
+ /* We can't just use struct tcp_repair_window: we need network order */
+ uint32_t snd_wl1;
+ uint32_t snd_wnd;
+ uint32_t max_window;
+ uint32_t rcv_wnd;
+ uint32_t rcv_wup;
+
+ uint8_t snd_ws;
+ uint8_t rcv_ws;
+ uint8_t tcpi_state;
+ uint8_t tcpi_options;
+} __attribute__((packed, aligned(__alignof__(uint32_t))));
+
/**
* struct tcp_splice_conn - Descriptor for a spliced TCP connection
* @f: Generic flow information
@@ -140,6 +229,16 @@ extern int init_sock_pool4 [TCP_SOCK_POOL_SIZE];
extern int init_sock_pool6 [TCP_SOCK_POOL_SIZE];
bool tcp_flow_defer(const struct tcp_tap_conn *conn);
+
+int tcp_freeze(struct ctx *c, const struct tcp_tap_conn *conn);
+int tcp_thaw(struct ctx *c, struct tcp_tap_conn *conn);
+
+int tcp_flow_migrate_source(int fd, struct tcp_tap_conn *conn);
+int tcp_flow_migrate_source_ext(int fd, const struct tcp_tap_conn *conn);
+
+int tcp_flow_migrate_target(struct ctx *c, int fd);
+int tcp_flow_migrate_target_ext(struct ctx *c, union flow *flow, int fd);
+
bool tcp_splice_flow_defer(struct tcp_splice_conn *conn);
void tcp_splice_timer(const struct ctx *c, struct tcp_splice_conn *conn);
int tcp_conn_pool_sock(int pool[]);
--
@@ -19,6 +19,7 @@
* @tap_mss: MSS advertised by tap/guest, rounded to 2 ^ TCP_MSS_BITS
* @sock: Socket descriptor number
* @events: Connection events, implying connection states
+ * @listening_sock: Listening socket this socket was accept()ed from, or -1
* @timer: timerfd descriptor for timeout events
* @flags: Connection flags representing internal attributes
* @sndbuf: Sending buffer in kernel, rounded to 2 ^ SNDBUF_BITS
@@ -68,6 +69,7 @@ struct tcp_tap_conn {
#define CONN_STATE_BITS /* Setting these clears other flags */ \
(SOCK_ACCEPTED | TAP_SYN_RCVD | ESTABLISHED)
+ int listening_sock;
int timer :FD_REF_BITS;
@@ -96,6 +98,93 @@ struct tcp_tap_conn {
uint32_t seq_init_from_tap;
};
+/**
+ * struct tcp_tap_transfer - Migrated TCP data, flow table part, network order
+ * @pif: Interfaces for each side of the flow
+ * @side: Addresses and ports for each side of the flow
+ * @retrans: Number of retransmissions occurred due to ACK_TIMEOUT
+ * @ws_from_tap: Window scaling factor advertised from tap/guest
+ * @ws_to_tap: Window scaling factor advertised to tap/guest
+ * @events: Connection events, implying connection states
+ * @tap_mss: MSS advertised by tap/guest, rounded to 2 ^ TCP_MSS_BITS
+ * @sndbuf: Sending buffer in kernel, rounded to 2 ^ SNDBUF_BITS
+ * @flags: Connection flags representing internal attributes
+ * @seq_dup_ack_approx: Last duplicate ACK number sent to tap
+ * @wnd_from_tap: Last window size from tap, unscaled (as received)
+ * @wnd_to_tap: Sending window advertised to tap, unscaled (as sent)
+ * @seq_to_tap: Next sequence for packets to tap
+ * @seq_ack_from_tap: Last ACK number received from tap
+ * @seq_from_tap: Next sequence for packets from tap (not actually sent)
+ * @seq_ack_to_tap: Last ACK number sent to tap
+ * @seq_init_from_tap: Initial sequence number from tap
+ *
+ * 16-bit and 32-bit integer fields are converted with htons() / htonl() by
+ * tcp_flow_migrate_source(). @pif and @side are copied from the flow as they
+ * are, with memcpy().
+ */
+struct tcp_tap_transfer {
+ uint8_t pif[SIDES];
+ struct flowside side[SIDES];
+
+ uint8_t retrans;
+ uint8_t ws_from_tap;
+ uint8_t ws_to_tap;
+ uint8_t events;
+
+ uint32_t tap_mss;
+
+ uint32_t sndbuf;
+
+ uint8_t flags;
+ uint8_t seq_dup_ack_approx;
+
+ uint16_t wnd_from_tap;
+ uint16_t wnd_to_tap;
+
+ uint32_t seq_to_tap;
+ uint32_t seq_ack_from_tap;
+ uint32_t seq_from_tap;
+ uint32_t seq_ack_to_tap;
+ uint32_t seq_init_from_tap;
+} __attribute__((packed, aligned(__alignof__(uint32_t))));
+
+/**
+ * struct tcp_tap_transfer_ext - Migrated TCP data, outside flow, network order
+ * @seq_snd: Socket-side send sequence
+ * @seq_rcv: Socket-side receive sequence
+ * @sndq: Length of pending send queue (unacknowledged / not sent)
+ * @notsent: Part of pending send queue that wasn't sent out yet
+ * @rcvq: Length of pending receive queue
+ * @mss: Socket-side MSS clamp
+ * @snd_wl1: Next sequence used in window probe (next sequence - 1)
+ * @snd_wnd: Socket-side sending window
+ * @max_window: Window clamp
+ * @rcv_wnd: Socket-side receive window
+ * @rcv_wup: rcv_nxt on last window update sent
+ * @snd_ws: Window scaling factor, send
+ * @rcv_ws: Window scaling factor, receive
+ * @tcpi_state: Connection state in TCP_INFO style (enum, tcp_states.h)
+ * @tcpi_options: TCPI_OPT_* constants (timestamps, selective ACK)
+ *
+ * On the wire, this structure is followed by @sndq bytes of send queue data,
+ * then @rcvq bytes of receive queue data.
+ *
+ * NOTE(review): unlike the other 32-bit fields, @mss is not byte-swapped by
+ * tcp_flow_migrate_source_ext() and tcp_flow_migrate_target_ext() -- confirm
+ * whether that's intended.
+ */
+struct tcp_tap_transfer_ext {
+ uint32_t seq_snd;
+ uint32_t seq_rcv;
+
+ uint32_t sndq;
+ uint32_t notsent;
+ uint32_t rcvq;
+
+ uint32_t mss;
+
+ /* We can't just use struct tcp_repair_window: we need network order */
+ uint32_t snd_wl1;
+ uint32_t snd_wnd;
+ uint32_t max_window;
+ uint32_t rcv_wnd;
+ uint32_t rcv_wup;
+
+ uint8_t snd_ws;
+ uint8_t rcv_ws;
+ uint8_t tcpi_state;
+ uint8_t tcpi_options;
+} __attribute__((packed, aligned(__alignof__(uint32_t))));
+
/**
* struct tcp_splice_conn - Descriptor for a spliced TCP connection
* @f: Generic flow information
@@ -140,6 +229,16 @@ extern int init_sock_pool4 [TCP_SOCK_POOL_SIZE];
extern int init_sock_pool6 [TCP_SOCK_POOL_SIZE];
bool tcp_flow_defer(const struct tcp_tap_conn *conn);
+
+int tcp_freeze(struct ctx *c, const struct tcp_tap_conn *conn);
+int tcp_thaw(struct ctx *c, struct tcp_tap_conn *conn);
+
+int tcp_flow_migrate_source(int fd, struct tcp_tap_conn *conn);
+int tcp_flow_migrate_source_ext(int fd, const struct tcp_tap_conn *conn);
+
+int tcp_flow_migrate_target(struct ctx *c, int fd);
+int tcp_flow_migrate_target_ext(struct ctx *c, union flow *flow, int fd);
+
bool tcp_splice_flow_defer(struct tcp_splice_conn *conn);
void tcp_splice_timer(const struct ctx *c, struct tcp_splice_conn *conn);
int tcp_conn_pool_sock(int pool[]);
--
2.48.1
^ permalink raw reply related [flat|nested] 5+ messages in thread
* [PATCH v23 4/4] test: Add migration tests
2025-02-14 9:08 [PATCH v23 0/4] State migration David Gibson
` (2 preceding siblings ...)
2025-02-14 9:08 ` [PATCH v23 3/4] migrate: Migrate TCP flows David Gibson
@ 2025-02-14 9:08 ` David Gibson
3 siblings, 0 replies; 5+ messages in thread
From: David Gibson @ 2025-02-14 9:08 UTC (permalink / raw)
To: Stefano Brivio, passt-dev; +Cc: David Gibson
From: Stefano Brivio <sbrivio@redhat.com>
PCAP=1 ./run migrate/basic is oddly satisfying.
PCAP=1 ./run migrate/bidirectional and migrate/iperf3_out4 are even
better.
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
test/lib/layout | 55 +++++++++++++-
test/lib/setup | 138 +++++++++++++++++++++++++++++++++++-
test/lib/test | 48 +++++++++++++
test/migrate/basic | 59 +++++++++++++++
test/migrate/bidirectional | 64 +++++++++++++++++
test/migrate/iperf3_bidir6 | 58 +++++++++++++++
test/migrate/iperf3_in4 | 50 +++++++++++++
test/migrate/iperf3_in6 | 58 +++++++++++++++
test/migrate/iperf3_out4 | 50 +++++++++++++
test/migrate/iperf3_out6 | 58 +++++++++++++++
test/migrate/rampstream_in | 10 +--
test/migrate/rampstream_out | 6 +-
test/run | 34 ++++++++-
13 files changed, 677 insertions(+), 11 deletions(-)
create mode 100644 test/migrate/basic
create mode 100644 test/migrate/bidirectional
create mode 100644 test/migrate/iperf3_bidir6
create mode 100644 test/migrate/iperf3_in4
create mode 100644 test/migrate/iperf3_in6
create mode 100644 test/migrate/iperf3_out4
create mode 100644 test/migrate/iperf3_out6
diff --git a/test/lib/layout b/test/lib/layout
index 4d035728..fddcdc4a 100644
--- a/test/lib/layout
+++ b/test/lib/layout
@@ -134,6 +134,54 @@ layout_two_guests() {
get_info_cols
+ pane_watch_contexts ${PANE_GUEST_1} "guest #1 in namespace #1" qemu_1 guest_1
+ pane_watch_contexts ${PANE_GUEST_2} "guest #2 in namespace #1" qemu_2 guest_2
+
+ tmux send-keys -l -t ${PANE_INFO} 'while cat '"$STATEBASE/log_pipe"'; do :; done'
+ tmux send-keys -t ${PANE_INFO} -N 100 C-m
+ tmux select-pane -t ${PANE_INFO} -T "test log"
+
+ pane_watch_contexts ${PANE_HOST} host host
+ pane_watch_contexts ${PANE_PASST_1} "passt #1 in namespace #1" pasta_1 passt_1
+ pane_watch_contexts ${PANE_PASST_2} "passt #2 in namespace #1" pasta_1 passt_2
+
+ info_layout "two guests, two passt instances, in namespaces"
+
+ sleep 1
+}
+
+# layout_migrate() - Two guest panes, two passt panes, two passt-repair panes,
+# plus QEMU monitor, host and log
+layout_migrate() {
+ sleep 1
+
+ tmux kill-pane -a -t 0
+ cmd_write 0 clear
+
+ tmux split-window -v -t passt_test
+ tmux split-window -h -l '33%'
+ tmux split-window -h -t passt_test:1.1
+
+ tmux split-window -h -l '35%' -t passt_test:1.0
+ tmux split-window -v -t passt_test:1.0
+
+ tmux split-window -v -t passt_test:1.4
+ tmux split-window -v -t passt_test:1.6
+
+ tmux split-window -v -t passt_test:1.3
+
+ PANE_GUEST_1=0
+ PANE_GUEST_2=1
+ PANE_INFO=2
+ PANE_MON=3
+ PANE_HOST=4
+ PANE_PASST_REPAIR_1=5
+ PANE_PASST_1=6
+ PANE_PASST_REPAIR_2=7
+ PANE_PASST_2=8
+
+ get_info_cols
+
pane_watch_contexts ${PANE_GUEST_1} "guest #1 in namespace #1" qemu_1 guest_1
pane_watch_contexts ${PANE_GUEST_2} "guest #2 in namespace #2" qemu_2 guest_2
@@ -141,11 +189,16 @@ layout_two_guests() {
tmux send-keys -t ${PANE_INFO} -N 100 C-m
tmux select-pane -t ${PANE_INFO} -T "test log"
+ pane_watch_contexts ${PANE_MON} "QEMU monitor" mon mon
+
pane_watch_contexts ${PANE_HOST} host host
+ pane_watch_contexts ${PANE_PASST_REPAIR_1} "passt-repair #1 in namespace #1" repair_1 passt_repair_1
pane_watch_contexts ${PANE_PASST_1} "passt #1 in namespace #1" pasta_1 passt_1
+
+ pane_watch_contexts ${PANE_PASST_REPAIR_2} "passt-repair #2 in namespace #2" repair_2 passt_repair_2
pane_watch_contexts ${PANE_PASST_2} "passt #2 in namespace #2" pasta_2 passt_2
- info_layout "two guests, two passt instances, in namespaces"
+ info_layout "two guests, two passt + passt-repair instances, in namespaces"
sleep 1
}
diff --git a/test/lib/setup b/test/lib/setup
index ee671520..575bc215 100755
--- a/test/lib/setup
+++ b/test/lib/setup
@@ -305,6 +305,117 @@ setup_two_guests() {
context_setup_guest guest_2 ${GUEST_2_CID}
}
+# setup_migrate() - Set up one namespace, run two qemu, passt, passt-repair in it
+setup_migrate() {
+ context_setup_host host
+ context_setup_host mon
+ context_setup_host pasta_1
+ context_setup_host pasta_2
+
+ layout_migrate
+
+ # Ports:
+ #
+ # guest #1 | guest #2 | ns #1 | host
+ # --------- |-----------|-----------|------------
+ # 10001 as server | | to guest | to ns #1
+ # 10002 | | as server | to ns #1
+ # 10003 | | to init | as server
+ # 10004 | as server | to guest | to ns #1
+
+ __opts=
+ [ ${PCAP} -eq 1 ] && __opts="${__opts} -p ${LOGDIR}/pasta_1.pcap"
+ [ ${DEBUG} -eq 1 ] && __opts="${__opts} -d"
+ [ ${TRACE} -eq 1 ] && __opts="${__opts} --trace"
+
+ __map_host4=192.0.2.1
+ __map_host6=2001:db8:9a55::1
+ __map_ns4=192.0.2.2
+ __map_ns6=2001:db8:9a55::2
+
+ # Option 1: send stuff via spliced path in pasta
+ # context_run_bg pasta_1 "./pasta ${__opts} -P ${STATESETUP}/pasta_1.pid -t 10001,10002 -T 10003 -u 10001,10002 -U 10003 --config-net ${NSTOOL} hold ${STATESETUP}/ns1.hold"
+ # Option 2: send stuff via tap (--map-guest-addr) instead (useful to see capture of full migration)
+ context_run_bg pasta_1 "./pasta ${__opts} -P ${STATESETUP}/pasta_1.pid -t 10001,10002,10004 -T 10003 -u 10001,10002,10004 -U 10003 --map-guest-addr ${__map_host4} --map-guest-addr ${__map_host6} --config-net ${NSTOOL} hold ${STATESETUP}/ns1.hold"
+ context_setup_nstool passt_1 ${STATESETUP}/ns1.hold
+ context_setup_nstool passt_repair_1 ${STATESETUP}/ns1.hold
+
+ context_setup_nstool passt_2 ${STATESETUP}/ns1.hold
+ context_setup_nstool passt_repair_2 ${STATESETUP}/ns1.hold
+
+ context_setup_nstool qemu_1 ${STATESETUP}/ns1.hold
+ context_setup_nstool qemu_2 ${STATESETUP}/ns1.hold
+
+ __ifname="$(context_run qemu_1 "ip -j link show | jq -rM '.[] | select(.link_type == \"ether\").ifname'")"
+
+ sleep 1
+
+ __opts="--vhost-user"
+ [ ${PCAP} -eq 1 ] && __opts="${__opts} -p ${LOGDIR}/passt_1.pcap"
+ [ ${DEBUG} -eq 1 ] && __opts="${__opts} -d"
+ [ ${TRACE} -eq 1 ] && __opts="${__opts} --trace"
+
+ context_run_bg passt_1 "./passt -s ${STATESETUP}/passt_1.socket -P ${STATESETUP}/passt_1.pid -f ${__opts} -t 10001 -u 10001"
+ wait_for [ -f "${STATESETUP}/passt_1.pid" ]
+
+ context_run_bg passt_repair_1 "./passt-repair ${STATESETUP}/passt_1.socket.repair"
+
+ __opts="--vhost-user"
+ [ ${PCAP} -eq 1 ] && __opts="${__opts} -p ${LOGDIR}/passt_2.pcap"
+ [ ${DEBUG} -eq 1 ] && __opts="${__opts} -d"
+ [ ${TRACE} -eq 1 ] && __opts="${__opts} --trace"
+
+ context_run_bg passt_2 "./passt -s ${STATESETUP}/passt_2.socket -P ${STATESETUP}/passt_2.pid -f ${__opts} -t 10004 -u 10004"
+ wait_for [ -f "${STATESETUP}/passt_2.pid" ]
+
+ context_run_bg passt_repair_2 "./passt-repair ${STATESETUP}/passt_2.socket.repair"
+
+ __vmem="512M" # Keep migration fast
+ __qemu_netdev1=" \
+ -chardev socket,id=c,path=${STATESETUP}/passt_1.socket \
+ -netdev vhost-user,id=v,chardev=c \
+ -device virtio-net,netdev=v \
+ -object memory-backend-memfd,id=m,share=on,size=${__vmem} \
+ -numa node,memdev=m"
+ __qemu_netdev2=" \
+ -chardev socket,id=c,path=${STATESETUP}/passt_2.socket \
+ -netdev vhost-user,id=v,chardev=c \
+ -device virtio-net,netdev=v \
+ -object memory-backend-memfd,id=m,share=on,size=${__vmem} \
+ -numa node,memdev=m"
+
+ GUEST_1_CID=94557
+ context_run_bg qemu_1 'qemu-system-'"${QEMU_ARCH}" \
+ ' -M accel=kvm:tcg' \
+ ' -m '${__vmem}' -cpu host -smp '${VCPUS} \
+ ' -kernel '"${KERNEL}" \
+ ' -initrd '${INITRAMFS}' -nographic -serial stdio' \
+ ' -nodefaults' \
+ ' -append "console=ttyS0 mitigations=off apparmor=0" ' \
+ " ${__qemu_netdev1}" \
+ " -pidfile ${STATESETUP}/qemu_1.pid" \
+ " -device vhost-vsock-pci,guest-cid=$GUEST_1_CID" \
+ " -monitor unix:${STATESETUP}/qemu_1_mon.sock,server,nowait"
+
+ GUEST_2_CID=94558
+ context_run_bg qemu_2 'qemu-system-'"${QEMU_ARCH}" \
+ ' -M accel=kvm:tcg' \
+ ' -m '${__vmem}' -cpu host -smp '${VCPUS} \
+ ' -kernel '"${KERNEL}" \
+ ' -initrd '${INITRAMFS}' -nographic -serial stdio' \
+ ' -nodefaults' \
+ ' -append "console=ttyS0 mitigations=off apparmor=0" ' \
+ " ${__qemu_netdev2}" \
+ " -pidfile ${STATESETUP}/qemu_2.pid" \
+ " -device vhost-vsock-pci,guest-cid=$GUEST_2_CID" \
+ " -monitor unix:${STATESETUP}/qemu_2_mon.sock,server,nowait" \
+ " -incoming tcp:0:20005"
+
+ context_setup_guest guest_1 ${GUEST_1_CID}
+ # Only available after migration:
+ ( context_setup_guest guest_2 ${GUEST_2_CID} & )
+}
+
# teardown_context_watch() - Remove contexts and stop panes watching them
# $1: Pane number watching
# $@: Context names
@@ -375,7 +486,8 @@ teardown_two_guests() {
context_wait pasta_1
context_wait pasta_2
- rm -f "${STATESETUP}/passt__[12].pid" "${STATESETUP}/pasta_[12].pid"
+ rm "${STATESETUP}/passt_1.pid" "${STATESETUP}/passt_2.pid"
+ rm "${STATESETUP}/pasta_1.pid" "${STATESETUP}/pasta_2.pid"
teardown_context_watch ${PANE_HOST} host
teardown_context_watch ${PANE_GUEST_1} qemu_1 guest_1
@@ -384,6 +496,30 @@ teardown_two_guests() {
teardown_context_watch ${PANE_PASST_2} pasta_2 passt_2
}
+# teardown_migrate() - Exit namespaces, kill qemu processes, passt and pasta
+teardown_migrate() {
+ ${NSTOOL} exec ${STATESETUP}/ns1.hold -- kill $(cat "${STATESETUP}/qemu_1.pid")
+ ${NSTOOL} exec ${STATESETUP}/ns1.hold -- kill $(cat "${STATESETUP}/qemu_2.pid")
+ context_wait qemu_1
+ context_wait qemu_2
+
+ ${NSTOOL} exec ${STATESETUP}/ns1.hold -- kill $(cat "${STATESETUP}/passt_2.pid")
+ context_wait passt_1
+ context_wait passt_2
+ ${NSTOOL} stop "${STATESETUP}/ns1.hold"
+ context_wait pasta_1
+
+ rm -f "${STATESETUP}/passt_1.pid" "${STATESETUP}/passt_2.pid"
+ rm -f "${STATESETUP}/pasta_1.pid" "${STATESETUP}/pasta_2.pid"
+
+ teardown_context_watch ${PANE_HOST} host
+
+ teardown_context_watch ${PANE_GUEST_1} qemu_1 guest_1
+ teardown_context_watch ${PANE_GUEST_2} qemu_2 guest_2
+ teardown_context_watch ${PANE_PASST_1} pasta_1 passt_1
+ teardown_context_watch ${PANE_PASST_2} pasta_1 passt_2
+}
+
# teardown_demo_passt() - Exit namespace, kill qemu, passt and pasta
teardown_demo_passt() {
tmux send-keys -t ${PANE_GUEST} "C-c"
diff --git a/test/lib/test b/test/lib/test
index 91729af7..8f88567d 100755
--- a/test/lib/test
+++ b/test/lib/test
@@ -68,6 +68,45 @@ test_iperf3() {
TEST_ONE_subs="$(list_add_pair "${TEST_ONE_subs}" "__${__var}__" "${__bw}" )"
}
+# test_iperf3m() - Ugly helper for iperf3 directive, guest migration variant
+# $1: Variable name: to put the measured bandwidth into
+# $2: Initial source/client context
+# $3: Second source/client context the guest is moving to
+# $4: Destination name or address for client
+# $5: Port number, ${i} is translated to process index
+# $6: Run time, in seconds
+# $7: Client options
+test_iperf3m() {
+ __var="${1}"; shift
+ __cctx="${1}"; shift
+ __cctx2="${1}"; shift
+ __dest="${1}"; shift
+ __port="${1}"; shift
+ __time="${1}"; shift
+
+ pane_or_context_run "${__cctx}" 'rm -f c.json'
+
+ # A 1s wait for connection on what's basically a local link
+ # indicates something is pretty wrong
+ __timeout=1000
+ pane_or_context_run_bg "${__cctx}" \
+ 'iperf3 -J -c '${__dest}' -p '${__port} \
+ ' --connect-timeout '${__timeout} \
+ ' -t'${__time}' -i0 '"${@}"' > c.json' \
+
+ __jval=".end.sum_received.bits_per_second"
+
+ sleep $((${__time} + 3))
+
+ pane_or_context_output "${__cctx2}" \
+ 'cat c.json'
+
+ __bw=$(pane_or_context_output "${__cctx2}" \
+ 'cat c.json | jq -rMs "map('${__jval}') | add"')
+
+ TEST_ONE_subs="$(list_add_pair "${TEST_ONE_subs}" "__${__var}__" "${__bw}" )"
+}
+
test_one_line() {
__line="${1}"
@@ -177,6 +216,12 @@ test_one_line() {
"guest2w")
pane_or_context_wait guest_2 || TEST_ONE_nok=1
;;
+ "mon")
+ pane_or_context_run mon "${__arg}" || TEST_ONE_nok=1
+ ;;
+ "monb")
+ pane_or_context_run_bg mon "${__arg}"
+ ;;
"ns")
pane_or_context_run ns "${__arg}" || TEST_ONE_nok=1
;;
@@ -292,6 +337,9 @@ test_one_line() {
"iperf3")
test_iperf3 ${__arg}
;;
+ "iperf3m")
+ test_iperf3m ${__arg}
+ ;;
"set")
TEST_ONE_subs="$(list_add_pair "${TEST_ONE_subs}" "__${__arg%% *}__" "${__arg#* }")"
;;
diff --git a/test/migrate/basic b/test/migrate/basic
new file mode 100644
index 00000000..3f11f7d8
--- /dev/null
+++ b/test/migrate/basic
@@ -0,0 +1,59 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+#
+# PASST - Plug A Simple Socket Transport
+# for qemu/UNIX domain socket mode
+#
+# PASTA - Pack A Subtle Tap Abstraction
+# for network namespace/tap device mode
+#
+# test/migrate/basic - Check basic migration functionality
+#
+# Copyright (c) 2025 Red Hat GmbH
+# Author: Stefano Brivio <sbrivio@redhat.com>
+
+g1tools ip jq dhclient socat cat
+htools ip jq
+
+set MAP_HOST4 192.0.2.1
+set MAP_HOST6 2001:db8:9a55::1
+set MAP_NS4 192.0.2.2
+set MAP_NS6 2001:db8:9a55::2
+
+test Interface name
+g1out IFNAME1 ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
+hout HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
+hout HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
+check [ -n "__IFNAME1__" ]
+
+test DHCP: address
+guest1 ip link set dev __IFNAME1__ up
+guest1 /sbin/dhclient -4 __IFNAME1__
+g1out ADDR1 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME1__").addr_info[0].local'
+hout HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local'
+check [ "__ADDR1__" = "__HOST_ADDR__" ]
+
+test DHCPv6: address
+# Link is up now, wait for DAD to complete
+guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
+guest1 /sbin/dhclient -6 __IFNAME1__
+# Wait for DAD to complete on the DHCP address
+guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
+g1out ADDR1_6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME1__").addr_info[] | select(.prefixlen == 128).local] | .[0]'
+hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
+check [ "__ADDR1_6__" = "__HOST_ADDR6__" ]
+
+test TCP/IPv4: guest1/guest2 > host
+g1out GW1 ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway'
+hostb socat -u TCP4-LISTEN:10006 OPEN:__STATESETUP__/msg,create,trunc
+sleep 1
+# Option 1: via spliced path in pasta, namespace to host
+# guest1b { printf "Hello from guest 1"; sleep 10; printf " and from guest 2\n"; } | socat -u STDIN TCP4:__GW1__:10003
+# Option 2: via --map-guest-addr (tap) in pasta, namespace to host
+guest1b { printf "Hello from guest 1"; sleep 3; printf " and from guest 2\n"; } | socat -u STDIN TCP4:__MAP_HOST4__:10006
+sleep 1
+
+mon echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock
+
+hostw
+hout MSG cat __STATESETUP__/msg
+check [ "__MSG__" = "Hello from guest 1 and from guest 2" ]
diff --git a/test/migrate/bidirectional b/test/migrate/bidirectional
new file mode 100644
index 00000000..4c040818
--- /dev/null
+++ b/test/migrate/bidirectional
@@ -0,0 +1,64 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+#
+# PASST - Plug A Simple Socket Transport
+# for qemu/UNIX domain socket mode
+#
+# PASTA - Pack A Subtle Tap Abstraction
+# for network namespace/tap device mode
+#
+# test/migrate/bidirectional - Check migration with messages in both directions
+#
+# Copyright (c) 2025 Red Hat GmbH
+# Author: Stefano Brivio <sbrivio@redhat.com>
+
+g1tools ip jq dhclient socat cat
+htools ip jq
+
+set MAP_HOST4 192.0.2.1
+set MAP_HOST6 2001:db8:9a55::1
+set MAP_NS4 192.0.2.2
+set MAP_NS6 2001:db8:9a55::2
+
+test Interface name
+g1out IFNAME1 ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
+hout HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
+hout HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
+check [ -n "__IFNAME1__" ]
+
+test DHCP: address
+guest1 ip link set dev __IFNAME1__ up
+guest1 /sbin/dhclient -4 __IFNAME1__
+g1out ADDR1 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME1__").addr_info[0].local'
+hout HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local'
+check [ "__ADDR1__" = "__HOST_ADDR__" ]
+
+test TCP/IPv4: guest1/guest2 > host, host > guest1/guest2
+g1out GW1 ip -j -4 route show|jq -rM '.[] | select(.dst == "default").gateway'
+
+hostb socat -u TCP4-LISTEN:10006 OPEN:__STATESETUP__/msg,create,trunc
+guest1b socat -u TCP4-LISTEN:10001 OPEN:msg,create,trunc
+sleep 1
+
+guest1b socat -u UNIX-RECV:proxy.sock,null-eof TCP4:__MAP_HOST4__:10006
+hostb socat -u UNIX-RECV:__STATESETUP__/proxy.sock,null-eof TCP4:__ADDR1__:10001
+sleep 1
+guest1 printf "Hello from guest 1" | socat -u STDIN UNIX:proxy.sock
+host printf "Dear guest 1," | socat -u STDIN UNIX:__STATESETUP__/proxy.sock
+sleep 1
+
+mon echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock
+
+sleep 2
+guest2 printf " and from guest 2" | socat -u STDIN UNIX:proxy.sock,shut-null
+host printf " you are now guest 2" | socat -u STDIN UNIX:__STATESETUP__/proxy.sock,shut-null
+
+hostw
+# FIXME: guest2w doesn't work here because shell jobs are (also) from guest #1,
+# use sleep 1 for the moment
+sleep 1
+
+hout MSG cat __STATESETUP__/msg
+check [ "__MSG__" = "Hello from guest 1 and from guest 2" ]
+
+g2out MSG cat msg
+check [ "__MSG__" = "Dear guest 1, you are now guest 2" ]
diff --git a/test/migrate/iperf3_bidir6 b/test/migrate/iperf3_bidir6
new file mode 100644
index 00000000..9cb198c8
--- /dev/null
+++ b/test/migrate/iperf3_bidir6
@@ -0,0 +1,58 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+#
+# PASST - Plug A Simple Socket Transport
+# for qemu/UNIX domain socket mode
+#
+# PASTA - Pack A Subtle Tap Abstraction
+# for network namespace/tap device mode
+#
+# test/migrate/iperf3_bidir6 - Migration behaviour with many flows
+#
+# Copyright (c) 2025 Red Hat GmbH
+# Author: Stefano Brivio <sbrivio@redhat.com>
+
+g1tools ip jq dhclient socat cat
+htools ip jq
+
+set MAP_HOST4 192.0.2.1
+set MAP_HOST6 2001:db8:9a55::1
+set MAP_NS4 192.0.2.2
+set MAP_NS6 2001:db8:9a55::2
+
+set THREADS 128
+set TIME 3
+set OMIT 0.1
+set OPTS -Z -P __THREADS__ -O__OMIT__ -N --bidir
+
+test Interface name
+g1out IFNAME1 ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
+hout HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
+hout HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
+check [ -n "__IFNAME1__" ]
+
+test DHCP: address
+guest1 ip link set dev __IFNAME1__ up
+guest1 /sbin/dhclient -4 __IFNAME1__
+g1out ADDR1 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME1__").addr_info[0].local'
+hout HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local'
+check [ "__ADDR1__" = "__HOST_ADDR__" ]
+
+test DHCPv6: address
+# Link is up now, wait for DAD to complete
+guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
+guest1 /sbin/dhclient -6 __IFNAME1__
+# Wait for DAD to complete on the DHCP address
+guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
+g1out ADDR1_6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME1__").addr_info[] | select(.prefixlen == 128).local] | .[0]'
+hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
+check [ "__ADDR1_6__" = "__HOST_ADDR6__" ]
+
+test TCP/IPv6 host to guest throughput during migration
+
+monb sleep 1; echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock
+
+iperf3s host 10006
+iperf3m BW guest_1 guest_2 __MAP_HOST6__ 10006 __TIME__ __OPTS__
+bw __BW__ 1 2
+
+iperf3k host
diff --git a/test/migrate/iperf3_in4 b/test/migrate/iperf3_in4
new file mode 100644
index 00000000..1fd854d9
--- /dev/null
+++ b/test/migrate/iperf3_in4
@@ -0,0 +1,50 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+#
+# PASST - Plug A Simple Socket Transport
+# for qemu/UNIX domain socket mode
+#
+# PASTA - Pack A Subtle Tap Abstraction
+# for network namespace/tap device mode
+#
+# test/migrate/iperf3_in4 - Migration behaviour under inbound IPv4 flood
+#
+# Copyright (c) 2025 Red Hat GmbH
+# Author: Stefano Brivio <sbrivio@redhat.com>
+
+g1tools ip jq dhclient socat cat
+htools ip jq
+
+set MAP_HOST4 192.0.2.1
+set MAP_HOST6 2001:db8:9a55::1
+set MAP_NS4 192.0.2.2
+set MAP_NS6 2001:db8:9a55::2
+
+guest1 /sbin/sysctl -w net.core.rmem_max=33554432
+guest1 /sbin/sysctl -w net.core.wmem_max=33554432
+
+set THREADS 1
+set TIME 4
+set OMIT 0.1
+set OPTS -Z -P __THREADS__ -O__OMIT__ -N -R
+
+test Interface name
+g1out IFNAME1 ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
+hout HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
+check [ -n "__IFNAME1__" ]
+
+test DHCP: address
+guest1 ip link set dev __IFNAME1__ up
+guest1 /sbin/dhclient -4 __IFNAME1__
+g1out ADDR1 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME1__").addr_info[0].local'
+hout HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local'
+check [ "__ADDR1__" = "__HOST_ADDR__" ]
+
+test TCP/IPv4 host to guest throughput during migration
+
+monb sleep 1; echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock
+
+iperf3s host 10006
+iperf3m BW guest_1 guest_2 __MAP_HOST4__ 10006 __TIME__ __OPTS__
+bw __BW__ 1 2
+
+iperf3k host
diff --git a/test/migrate/iperf3_in6 b/test/migrate/iperf3_in6
new file mode 100644
index 00000000..b727929c
--- /dev/null
+++ b/test/migrate/iperf3_in6
@@ -0,0 +1,58 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+#
+# PASST - Plug A Simple Socket Transport
+# for qemu/UNIX domain socket mode
+#
+# PASTA - Pack A Subtle Tap Abstraction
+# for network namespace/tap device mode
+#
+# test/migrate/iperf3_in6 - Migration behaviour under inbound IPv6 flood
+#
+# Copyright (c) 2025 Red Hat GmbH
+# Author: Stefano Brivio <sbrivio@redhat.com>
+
+g1tools ip jq dhclient socat cat
+htools ip jq
+
+set MAP_HOST4 192.0.2.1
+set MAP_HOST6 2001:db8:9a55::1
+set MAP_NS4 192.0.2.2
+set MAP_NS6 2001:db8:9a55::2
+
+set THREADS 4
+set TIME 3
+set OMIT 0.1
+set OPTS -Z -P __THREADS__ -O__OMIT__ -N -R
+
+test Interface name
+g1out IFNAME1 ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
+hout HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
+hout HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
+check [ -n "__IFNAME1__" ]
+
+test DHCP: address
+guest1 ip link set dev __IFNAME1__ up
+guest1 /sbin/dhclient -4 __IFNAME1__
+g1out ADDR1 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME1__").addr_info[0].local'
+hout HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local'
+check [ "__ADDR1__" = "__HOST_ADDR__" ]
+
+test DHCPv6: address
+# Link is up now, wait for DAD to complete
+guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
+guest1 /sbin/dhclient -6 __IFNAME1__
+# Wait for DAD to complete on the DHCP address
+guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
+g1out ADDR1_6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME1__").addr_info[] | select(.prefixlen == 128).local] | .[0]'
+hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
+check [ "__ADDR1_6__" = "__HOST_ADDR6__" ]
+
+test TCP/IPv6 host to guest throughput during migration
+
+monb sleep 1; echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock
+
+iperf3s host 10006
+iperf3m BW guest_1 guest_2 __MAP_HOST6__ 10006 __TIME__ __OPTS__
+bw __BW__ 1 2
+
+iperf3k host
diff --git a/test/migrate/iperf3_out4 b/test/migrate/iperf3_out4
new file mode 100644
index 00000000..0122684c
--- /dev/null
+++ b/test/migrate/iperf3_out4
@@ -0,0 +1,50 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+#
+# PASST - Plug A Simple Socket Transport
+# for qemu/UNIX domain socket mode
+#
+# PASTA - Pack A Subtle Tap Abstraction
+# for network namespace/tap device mode
+#
+# test/migrate/iperf3_out4 - Migration behaviour under outbound IPv4 flood
+#
+# Copyright (c) 2025 Red Hat GmbH
+# Author: Stefano Brivio <sbrivio@redhat.com>
+
+g1tools ip jq dhclient socat cat
+htools ip jq
+
+set MAP_HOST4 192.0.2.1
+set MAP_HOST6 2001:db8:9a55::1
+set MAP_NS4 192.0.2.2
+set MAP_NS6 2001:db8:9a55::2
+
+guest1 /sbin/sysctl -w net.core.rmem_max=33554432
+guest1 /sbin/sysctl -w net.core.wmem_max=33554432
+
+set THREADS 2
+set TIME 3
+set OMIT 0.1
+set OPTS -Z -P __THREADS__ -O__OMIT__ -N
+
+test Interface name
+g1out IFNAME1 ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
+hout HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
+check [ -n "__IFNAME1__" ]
+
+test DHCP: address
+guest1 ip link set dev __IFNAME1__ up
+guest1 /sbin/dhclient -4 __IFNAME1__
+g1out ADDR1 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME1__").addr_info[0].local'
+hout HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local'
+check [ "__ADDR1__" = "__HOST_ADDR__" ]
+
+test TCP/IPv4 guest to host throughput during migration
+
+monb sleep 1; echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock
+
+iperf3s host 10006
+iperf3m BW guest_1 guest_2 __MAP_HOST4__ 10006 __TIME__ __OPTS__ -l 1M
+bw __BW__ 1 2
+
+iperf3k host
diff --git a/test/migrate/iperf3_out6 b/test/migrate/iperf3_out6
new file mode 100644
index 00000000..9cb13b0c
--- /dev/null
+++ b/test/migrate/iperf3_out6
@@ -0,0 +1,58 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+#
+# PASST - Plug A Simple Socket Transport
+# for qemu/UNIX domain socket mode
+#
+# PASTA - Pack A Subtle Tap Abstraction
+# for network namespace/tap device mode
+#
+# test/migrate/iperf3_out6 - Migration behaviour under outbound IPv6 flood
+#
+# Copyright (c) 2025 Red Hat GmbH
+# Author: Stefano Brivio <sbrivio@redhat.com>
+
+g1tools ip jq dhclient socat cat
+htools ip jq
+
+set MAP_HOST4 192.0.2.1
+set MAP_HOST6 2001:db8:9a55::1
+set MAP_NS4 192.0.2.2
+set MAP_NS6 2001:db8:9a55::2
+
+set THREADS 1
+set TIME 3
+set OMIT 0.1
+set OPTS -Z -P __THREADS__ -O__OMIT__ -N
+
+test Interface name
+g1out IFNAME1 ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
+hout HOST_IFNAME ip -j -4 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
+hout HOST_IFNAME6 ip -j -6 route show|jq -rM '[.[] | select(.dst == "default").dev] | .[0]'
+check [ -n "__IFNAME1__" ]
+
+test DHCP: address
+guest1 ip link set dev __IFNAME1__ up
+guest1 /sbin/dhclient -4 __IFNAME1__
+g1out ADDR1 ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__IFNAME1__").addr_info[0].local'
+hout HOST_ADDR ip -j -4 addr show|jq -rM '.[] | select(.ifname == "__HOST_IFNAME__").addr_info[0].local'
+check [ "__ADDR1__" = "__HOST_ADDR__" ]
+
+test DHCPv6: address
+# Link is up now, wait for DAD to complete
+guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
+guest1 /sbin/dhclient -6 __IFNAME1__
+# Wait for DAD to complete on the DHCP address
+guest1 while ip -j -6 addr show tentative | jq -e '.[].addr_info'; do sleep 0.1; done
+g1out ADDR1_6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__IFNAME1__").addr_info[] | select(.prefixlen == 128).local] | .[0]'
+hout HOST_ADDR6 ip -j -6 addr show|jq -rM '[.[] | select(.ifname == "__HOST_IFNAME6__").addr_info[] | select(.scope == "global" and .deprecated != true).local] | .[0]'
+check [ "__ADDR1_6__" = "__HOST_ADDR6__" ]
+
+test TCP/IPv6 guest to host throughput during migration
+
+monb sleep 1; echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock
+
+iperf3s host 10006
+iperf3m BW guest_1 guest_2 __MAP_HOST6__ 10006 __TIME__ __OPTS__ -l 1M
+bw __BW__ 1 2
+
+iperf3k host
diff --git a/test/migrate/rampstream_in b/test/migrate/rampstream_in
index 46f41431..74223da0 100644
--- a/test/migrate/rampstream_in
+++ b/test/migrate/rampstream_in
@@ -6,10 +6,10 @@
# PASTA - Pack A Subtle Tap Abstraction
# for network namespace/tap device mode
#
-# test/migrate/basic - Check basic migration functionality
+# test/migrate/rampstream_in - Check sequence correctness with inbound ramp
#
-# Copyright (c) 2025 Red Hat GmbH
-# Author: Stefano Brivio <sbrivio@redhat.com>
+# Copyright (c) 2025 Red Hat
+# Author: David Gibson <david@gibson.dropbear.id.au>
g1tools ip jq dhclient socat cat
htools ip jq
@@ -49,9 +49,9 @@ guest1b socat -u TCP4-LISTEN:10001 EXEC:"rampstream-check.sh __RAMPS__"
sleep 1
hostb socat -u EXEC:"test/rampstream send __RAMPS__" TCP4:__ADDR1__:10001
-sleep 1
+sleep 1
-#mon echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock
+monb echo "migrate tcp:0:20005" | socat -u STDIN UNIX:__STATESETUP__/qemu_1_mon.sock
hostw
diff --git a/test/migrate/rampstream_out b/test/migrate/rampstream_out
index 91b9c631..85dd5482 100644
--- a/test/migrate/rampstream_out
+++ b/test/migrate/rampstream_out
@@ -6,10 +6,10 @@
# PASTA - Pack A Subtle Tap Abstraction
# for network namespace/tap device mode
#
-# test/migrate/basic - Check basic migration functionality
+# test/migrate/rampstream_out - Check sequence correctness with outbound ramp
#
-# Copyright (c) 2025 Red Hat GmbH
-# Author: Stefano Brivio <sbrivio@redhat.com>
+# Copyright (c) 2025 Red Hat
+# Author: David Gibson <david@gibson.dropbear.id.au>
g1tools ip jq dhclient socat cat
htools ip jq
diff --git a/test/run b/test/run
index fc710475..080cbd6b 100755
--- a/test/run
+++ b/test/run
@@ -73,6 +73,7 @@ run() {
test build/clang_tidy
teardown build
+
setup pasta
test pasta/ndp
test pasta/dhcp
@@ -130,6 +131,34 @@ run() {
test two_guests_vu/basic
teardown two_guests
+ setup migrate
+ test migrate/basic
+ teardown migrate
+ setup migrate
+ test migrate/bidirectional
+ teardown migrate
+ setup migrate
+ test migrate/iperf3_out4
+ teardown migrate
+ setup migrate
+ test migrate/iperf3_out6
+ teardown migrate
+ setup migrate
+ test migrate/iperf3_in4
+ teardown migrate
+ setup migrate
+ test migrate/iperf3_in6
+ teardown migrate
+ setup migrate
+ test migrate/iperf3_bidir6
+ teardown migrate
+ setup migrate
+ test migrate/rampstream_in
+ teardown migrate
+ setup migrate
+ test migrate/rampstream_out
+ teardown migrate
+
VALGRIND=0
VHOST_USER=0
#setup passt_in_ns
@@ -186,7 +215,10 @@ run_selected() {
__setup=
for __test; do
- if [ "${__test%%/*}" != "${__setup}" ]; then
+ # HACK: the migrate tests need the setup repeated for
+ # each test
+ if [ "${__test%%/*}" != "${__setup}" -o \
+ "${__test%%/*}" = "migrate" ]; then
[ -n "${__setup}" ] && teardown "${__setup}"
__setup="${__test%%/*}"
setup "${__setup}"
--
@@ -73,6 +73,7 @@ run() {
test build/clang_tidy
teardown build
+
setup pasta
test pasta/ndp
test pasta/dhcp
@@ -130,6 +131,34 @@ run() {
test two_guests_vu/basic
teardown two_guests
+ setup migrate
+ test migrate/basic
+ teardown migrate
+ setup migrate
+ test migrate/bidirectional
+ teardown migrate
+ setup migrate
+ test migrate/iperf3_out4
+ teardown migrate
+ setup migrate
+ test migrate/iperf3_out6
+ teardown migrate
+ setup migrate
+ test migrate/iperf3_in4
+ teardown migrate
+ setup migrate
+ test migrate/iperf3_in6
+ teardown migrate
+ setup migrate
+ test migrate/iperf3_bidir6
+ teardown migrate
+ setup migrate
+ test migrate/rampstream_in
+ teardown migrate
+ setup migrate
+ test migrate/rampstream_out
+ teardown migrate
+
VALGRIND=0
VHOST_USER=0
#setup passt_in_ns
@@ -186,7 +215,10 @@ run_selected() {
__setup=
for __test; do
- if [ "${__test%%/*}" != "${__setup}" ]; then
+ # HACK: the migrate tests need the setup repeated for
+ # each test
+ if [ "${__test%%/*}" != "${__setup}" -o \
+ "${__test%%/*}" = "migrate" ]; then
[ -n "${__setup}" ] && teardown "${__setup}"
__setup="${__test%%/*}"
setup "${__setup}"
--
2.48.1
^ permalink raw reply related [flat|nested] 5+ messages in thread
end of thread, other threads:[~2025-02-14 9:08 UTC | newest]
Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-02-14 9:08 [PATCH v23 0/4] State migration David Gibson
2025-02-14 9:08 ` [PATCH v23 1/4] flow: Flow table traversing macros David Gibson
2025-02-14 9:08 ` [PATCH v23 2/4] flow, migrate: Flow migration skeleton David Gibson
2025-02-14 9:08 ` [PATCH v23 3/4] migrate: Migrate TCP flows David Gibson
2025-02-14 9:08 ` [PATCH v23 4/4] test: Add migration tests David Gibson
Code repositories for project(s) associated with this public inbox
https://passt.top/passt
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for IMAP folder(s).