From: David Gibson <david@gibson.dropbear.id.au>
To: Stefano Brivio <sbrivio@redhat.com>, passt-dev@passt.top
Cc: David Gibson <david@gibson.dropbear.id.au>
Subject: [PATCH v5 01/19] flow: Clarify and enforce flow state transitions
Date: Tue, 14 May 2024 11:03:19 +1000 [thread overview]
Message-ID: <20240514010337.1104606-2-david@gibson.dropbear.id.au> (raw)
In-Reply-To: <20240514010337.1104606-1-david@gibson.dropbear.id.au>
Flows move over several different states in their lifetime. The rules for
these are documented in comments, but they're pretty complex and a number
of the transitions are implicit, which makes this pretty fragile and
error prone.
Change the code to explicitly track the states in a field. Make all
transitions explicit and logged. To the extent that it's practical in C,
enforce what can and can't be done in various states with ASSERT()s.
While we're at it, tweak the docs to clarify the restrictions on each state
a bit.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
flow.c | 144 ++++++++++++++++++++++++++++++---------------------
flow.h | 67 ++++++++++++++++++++++--
flow_table.h | 10 ++++
icmp.c | 4 +-
tcp.c | 8 ++-
tcp_splice.c | 4 +-
6 files changed, 168 insertions(+), 69 deletions(-)
diff --git a/flow.c b/flow.c
index 80dd269..768e0f6 100644
--- a/flow.c
+++ b/flow.c
@@ -18,6 +18,15 @@
#include "flow.h"
#include "flow_table.h"
+const char *flow_state_str[] = {
+ [FLOW_STATE_FREE] = "FREE",
+ [FLOW_STATE_NEW] = "NEW",
+ [FLOW_STATE_TYPED] = "TYPED",
+ [FLOW_STATE_ACTIVE] = "ACTIVE",
+};
+static_assert(ARRAY_SIZE(flow_state_str) == FLOW_NUM_STATES,
+ "flow_state_str[] doesn't match enum flow_state");
+
const char *flow_type_str[] = {
[FLOW_TYPE_NONE] = "<none>",
[FLOW_TCP] = "TCP connection",
@@ -39,46 +48,6 @@ static_assert(ARRAY_SIZE(flow_proto) == FLOW_NUM_TYPES,
/* Global Flow Table */
-/**
- * DOC: Theory of Operation - flow entry life cycle
- *
- * An individual flow table entry moves through these logical states, usually in
- * this order.
- *
- * FREE - Part of the general pool of free flow table entries
- * Operations:
- * - flow_alloc() finds an entry and moves it to ALLOC state
- *
- * ALLOC - A tentatively allocated entry
- * Operations:
- * - flow_alloc_cancel() returns the entry to FREE state
- * - FLOW_START() set the entry's type and moves to START state
- * Caveats:
- * - It's not safe to write fields in the flow entry
- * - It's not safe to allocate further entries with flow_alloc()
- * - It's not safe to return to the main epoll loop (use FLOW_START()
- * to move to START state before doing so)
- * - It's not safe to use flow_*() logging functions
- *
- * START - An entry being prepared by flow type specific code
- * Operations:
- * - Flow type specific fields may be accessed
- * - flow_*() logging functions
- * - flow_alloc_cancel() returns the entry to FREE state
- * Caveats:
- * - Returning to the main epoll loop or allocating another entry
- * with flow_alloc() implicitly moves the entry to ACTIVE state.
- *
- * ACTIVE - An active flow entry managed by flow type specific code
- * Operations:
- * - Flow type specific fields may be accessed
- * - flow_*() logging functions
- * - Flow may be expired by returning 'true' from flow type specific
- * deferred or timer handler. This will return it to FREE state.
- * Caveats:
- * - It's not safe to call flow_alloc_cancel()
- */
-
/**
* DOC: Theory of Operation - allocating and freeing flow entries
*
@@ -132,6 +101,7 @@ static_assert(ARRAY_SIZE(flow_proto) == FLOW_NUM_TYPES,
unsigned flow_first_free;
union flow flowtab[FLOW_MAX];
+static const union flow *flow_new_entry; /* = NULL */
/* Last time the flow timers ran */
static struct timespec flow_timer_run;
@@ -144,6 +114,7 @@ static struct timespec flow_timer_run;
*/
void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...)
{
+ const char *typestate;
char msg[BUFSIZ];
va_list args;
@@ -151,40 +122,65 @@ void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...)
(void)vsnprintf(msg, sizeof(msg), fmt, args);
va_end(args);
- logmsg(pri, "Flow %u (%s): %s", flow_idx(f), FLOW_TYPE(f), msg);
+ /* Show type if it's set, otherwise the state */
+ if (f->state < FLOW_STATE_TYPED)
+ typestate = FLOW_STATE(f);
+ else
+ typestate = FLOW_TYPE(f);
+
+ logmsg(pri, "Flow %u (%s): %s", flow_idx(f), typestate, msg);
+}
+
+/**
+ * flow_set_state() - Change flow's state
+ * @f: Flow to update
+ * @state: New state
+ */
+static void flow_set_state(struct flow_common *f, enum flow_state state)
+{
+ uint8_t oldstate = f->state;
+
+ ASSERT(state < FLOW_NUM_STATES);
+ ASSERT(oldstate < FLOW_NUM_STATES);
+
+ f->state = state;
+ flow_log_(f, LOG_DEBUG, "%s -> %s", flow_state_str[oldstate],
+ FLOW_STATE(f));
}
/**
- * flow_start() - Set flow type for new flow and log
- * @flow: Flow to set type for
+ * flow_set_type() - Set type and mvoe to TYPED state
+ * @flow: Flow to change state
* @type: Type for new flow
* @iniside: Which side initiated the new flow
*
* Return: @flow
- *
- * Should be called before setting any flow type specific fields in the flow
- * table entry.
*/
-union flow *flow_start(union flow *flow, enum flow_type type,
- unsigned iniside)
+union flow *flow_set_type(union flow *flow, enum flow_type type,
+ unsigned iniside)
{
+ struct flow_common *f = &flow->f;
+
+ ASSERT(type != FLOW_TYPE_NONE);
+ ASSERT(flow_new_entry == flow && f->state == FLOW_STATE_NEW);
+ ASSERT(f->type == FLOW_TYPE_NONE);
+
(void)iniside;
- flow->f.type = type;
- flow_dbg(flow, "START %s", flow_type_str[flow->f.type]);
+ f->type = type;
+ flow_set_state(f, FLOW_STATE_TYPED);
return flow;
}
/**
- * flow_end() - Clear flow type for finished flow and log
- * @flow: Flow to clear
+ * flow_activate() - Move flow to ACTIVE state
+ * @f: Flow to change state
*/
-static void flow_end(union flow *flow)
+void flow_activate(struct flow_common *f)
{
- if (flow->f.type == FLOW_TYPE_NONE)
- return; /* Nothing to do */
+ ASSERT(&flow_new_entry->f == f && f->state == FLOW_STATE_TYPED);
- flow_dbg(flow, "END %s", flow_type_str[flow->f.type]);
- flow->f.type = FLOW_TYPE_NONE;
+ flow_set_state(f, FLOW_STATE_ACTIVE);
+ flow_new_entry = NULL;
}
/**
@@ -196,9 +192,12 @@ union flow *flow_alloc(void)
{
union flow *flow = &flowtab[flow_first_free];
+ ASSERT(!flow_new_entry);
+
if (flow_first_free >= FLOW_MAX)
return NULL;
+ ASSERT(flow->f.state == FLOW_STATE_FREE);
ASSERT(flow->f.type == FLOW_TYPE_NONE);
ASSERT(flow->free.n >= 1);
ASSERT(flow_first_free + flow->free.n <= FLOW_MAX);
@@ -221,7 +220,10 @@ union flow *flow_alloc(void)
flow_first_free = flow->free.next;
}
+ flow_new_entry = flow;
memset(flow, 0, sizeof(*flow));
+ flow_set_state(&flow->f, FLOW_STATE_NEW);
+
return flow;
}
@@ -233,15 +235,21 @@ union flow *flow_alloc(void)
*/
void flow_alloc_cancel(union flow *flow)
{
+ ASSERT(flow_new_entry == flow);
+ ASSERT(flow->f.state == FLOW_STATE_NEW ||
+ flow->f.state == FLOW_STATE_TYPED);
ASSERT(flow_first_free > FLOW_IDX(flow));
- flow_end(flow);
+ flow_set_state(&flow->f, FLOW_STATE_FREE);
+ memset(flow, 0, sizeof(*flow));
+
/* Put it back in a length 1 free cluster, don't attempt to fully
* reverse flow_alloc()s steps. This will get folded together the next
* time flow_defer_handler runs anyway() */
flow->free.n = 1;
flow->free.next = flow_first_free;
flow_first_free = FLOW_IDX(flow);
+ flow_new_entry = NULL;
}
/**
@@ -265,7 +273,8 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now)
union flow *flow = &flowtab[idx];
bool closed = false;
- if (flow->f.type == FLOW_TYPE_NONE) {
+ switch (flow->f.state) {
+ case FLOW_STATE_FREE: {
unsigned skip = flow->free.n;
/* First entry of a free cluster must have n >= 1 */
@@ -287,6 +296,20 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now)
continue;
}
+ case FLOW_STATE_NEW:
+ case FLOW_STATE_TYPED:
+ flow_err(flow, "Incomplete flow at end of cycle");
+ ASSERT(false);
+ break;
+
+ case FLOW_STATE_ACTIVE:
+ /* Nothing to do */
+ break;
+
+ default:
+ ASSERT(false);
+ }
+
switch (flow->f.type) {
case FLOW_TYPE_NONE:
ASSERT(false);
@@ -310,7 +333,8 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now)
}
if (closed) {
- flow_end(flow);
+ flow_set_state(&flow->f, FLOW_STATE_FREE);
+ memset(flow, 0, sizeof(*flow));
if (free_head) {
/* Add slot to current free cluster */
diff --git a/flow.h b/flow.h
index c943c44..073a734 100644
--- a/flow.h
+++ b/flow.h
@@ -9,6 +9,66 @@
#define FLOW_TIMER_INTERVAL 1000 /* ms */
+/**
+ * enum flow_state - States of a flow table entry
+ *
+ * An individual flow table entry moves through these states, usually in this
+ * order.
+ * General rules:
+ * - Code outside flow.c should never write common fields of union flow.
+ * - The state field may always be read.
+ *
+ * FREE - Part of the general pool of free flow table entries
+ * Operations:
+ * - flow_alloc() finds an entry and moves it to NEW state
+ *
+ * NEW - Freshly allocated, uninitialised entry
+ * Operations:
+ * - flow_alloc_cancel() returns the entry to FREE state
+ * - FLOW_SET_TYPE() sets the entry's type and moves to TYPED state
+ * Caveats:
+ * - No fields other than state may be accessed.
+ * - At most one entry may be in NEW or TYPED state at a time, so it's
+ * unsafe to use flow_alloc() again until this entry moves to
+ * ACTIVE or FREE state
+ * - You may not return to the main epoll loop while an entry is in
+ * NEW state.
+ *
+ * TYPED - Generic info initialised, type specific initialisation underway
+ * Operations:
+ * - All common fields may be read
+ * - Type specific fields may be read and written
+ * - flow_alloc_cancel() returns the entry to FREE state
+ * - FLOW_ACTIVATE() moves the entry to ACTIVE STATE
+ * Caveats:
+ * - At most one entry may be in NEW or TYPED state at a time, so it's
+ * unsafe to use flow_alloc() again until this entry moves to
+ * ACTIVE or FREE state
+ * - You may not return to the main epoll loop while an entry is in
+ * TYPED state.
+ *
+ * ACTIVE - An active, fully-initialised flow entry
+ * Operations:
+ * - All common fields may be read
+ * - Type specific fields may be read and written
+ * - Flow may be expired by returning 'true' from flow type specific
+ * deferred or timer handler. This will return it to FREE state.
+ * Caveats:
+ * - flow_alloc_cancel() may not be called on it
+ */
+enum flow_state {
+ FLOW_STATE_FREE,
+ FLOW_STATE_NEW,
+ FLOW_STATE_TYPED,
+ FLOW_STATE_ACTIVE,
+
+ FLOW_NUM_STATES,
+};
+
+extern const char *flow_state_str[];
+#define FLOW_STATE(f) \
+ ((f)->state < FLOW_NUM_STATES ? flow_state_str[(f)->state] : "?")
+
/**
* enum flow_type - Different types of packet flows we track
*/
@@ -37,9 +97,11 @@ extern const uint8_t flow_proto[];
/**
* struct flow_common - Common fields for packet flows
+ * @state: State of the flow table entry
* @type: Type of packet flow
*/
struct flow_common {
+ uint8_t state;
uint8_t type;
};
@@ -49,11 +111,6 @@ struct flow_common {
#define FLOW_TABLE_PRESSURE 30 /* % of FLOW_MAX */
#define FLOW_FILE_PRESSURE 30 /* % of c->nofile */
-union flow *flow_start(union flow *flow, enum flow_type type,
- unsigned iniside);
-#define FLOW_START(flow_, t_, var_, i_) \
- (&flow_start((flow_), (t_), (i_))->var_)
-
/**
* struct flow_sidx - ID for one side of a specific flow
* @side: Side referenced (0 or 1)
diff --git a/flow_table.h b/flow_table.h
index b7e5529..58014d8 100644
--- a/flow_table.h
+++ b/flow_table.h
@@ -107,4 +107,14 @@ static inline flow_sidx_t flow_sidx(const struct flow_common *f,
union flow *flow_alloc(void);
void flow_alloc_cancel(union flow *flow);
+union flow *flow_set_type(union flow *flow, enum flow_type type,
+ unsigned iniside);
+#define FLOW_SET_TYPE(flow_, t_, var_, i_) \
+ (&flow_set_type((flow_), (t_), (i_))->var_)
+
+void flow_activate(struct flow_common *f);
+#define FLOW_ACTIVATE(flow_) \
+ (flow_activate(&(flow_)->f))
+
+
#endif /* FLOW_TABLE_H */
diff --git a/icmp.c b/icmp.c
index 1c5cf84..fda868d 100644
--- a/icmp.c
+++ b/icmp.c
@@ -167,7 +167,7 @@ static struct icmp_ping_flow *icmp_ping_new(const struct ctx *c,
if (!flow)
return NULL;
- pingf = FLOW_START(flow, flowtype, ping, TAPSIDE);
+ pingf = FLOW_SET_TYPE(flow, flowtype, ping, TAPSIDE);
pingf->seq = -1;
pingf->id = id;
@@ -198,6 +198,8 @@ static struct icmp_ping_flow *icmp_ping_new(const struct ctx *c,
*id_sock = pingf;
+ FLOW_ACTIVATE(pingf);
+
return pingf;
cancel:
diff --git a/tcp.c b/tcp.c
index 21d0af0..65208ca 100644
--- a/tcp.c
+++ b/tcp.c
@@ -2006,7 +2006,7 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af,
goto cancel;
}
- conn = FLOW_START(flow, FLOW_TCP, tcp, TAPSIDE);
+ conn = FLOW_SET_TYPE(flow, FLOW_TCP, tcp, TAPSIDE);
conn->sock = s;
conn->timer = -1;
conn_event(c, conn, TAP_SYN_RCVD);
@@ -2077,6 +2077,7 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af,
}
tcp_epoll_ctl(c, conn);
+ FLOW_ACTIVATE(conn);
return;
cancel:
@@ -2724,7 +2725,8 @@ static void tcp_tap_conn_from_sock(struct ctx *c, in_port_t dstport,
const union sockaddr_inany *sa,
const struct timespec *now)
{
- struct tcp_tap_conn *conn = FLOW_START(flow, FLOW_TCP, tcp, SOCKSIDE);
+ struct tcp_tap_conn *conn = FLOW_SET_TYPE(flow, FLOW_TCP, tcp,
+ SOCKSIDE);
conn->sock = s;
conn->timer = -1;
@@ -2747,6 +2749,8 @@ static void tcp_tap_conn_from_sock(struct ctx *c, in_port_t dstport,
conn_flag(c, conn, ACK_FROM_TAP_DUE);
tcp_get_sndbuf(conn);
+
+ FLOW_ACTIVATE(conn);
}
/**
diff --git a/tcp_splice.c b/tcp_splice.c
index 4c36b72..abe98a0 100644
--- a/tcp_splice.c
+++ b/tcp_splice.c
@@ -472,7 +472,7 @@ bool tcp_splice_conn_from_sock(const struct ctx *c,
return false;
}
- conn = FLOW_START(flow, FLOW_TCP_SPLICE, tcp_splice, 0);
+ conn = FLOW_SET_TYPE(flow, FLOW_TCP_SPLICE, tcp_splice, 0);
conn->flags = af == AF_INET ? 0 : SPLICE_V6;
conn->s[0] = s0;
@@ -486,6 +486,8 @@ bool tcp_splice_conn_from_sock(const struct ctx *c,
if (tcp_splice_connect(c, conn, af, pif1, dstport))
conn_flag(c, conn, CLOSING);
+ FLOW_ACTIVATE(conn);
+
return true;
}
--
@@ -472,7 +472,7 @@ bool tcp_splice_conn_from_sock(const struct ctx *c,
return false;
}
- conn = FLOW_START(flow, FLOW_TCP_SPLICE, tcp_splice, 0);
+ conn = FLOW_SET_TYPE(flow, FLOW_TCP_SPLICE, tcp_splice, 0);
conn->flags = af == AF_INET ? 0 : SPLICE_V6;
conn->s[0] = s0;
@@ -486,6 +486,8 @@ bool tcp_splice_conn_from_sock(const struct ctx *c,
if (tcp_splice_connect(c, conn, af, pif1, dstport))
conn_flag(c, conn, CLOSING);
+ FLOW_ACTIVATE(conn);
+
return true;
}
--
2.45.0
next prev parent reply other threads:[~2024-05-14 1:03 UTC|newest]
Thread overview: 28+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-05-14 1:03 [PATCH v5 00/19] RFC: Unified flow table David Gibson
2024-05-14 1:03 ` David Gibson [this message]
2024-05-16 9:30 ` [PATCH v5 01/19] flow: Clarify and enforce flow state transitions Stefano Brivio
[not found] ` <ZkbVxtvmP7f0aL1S@zatzit>
2024-05-17 11:00 ` Stefano Brivio
2024-05-18 6:47 ` David Gibson
2024-05-14 1:03 ` [PATCH v5 02/19] flow: Make side 0 always be the initiating side David Gibson
2024-05-16 12:06 ` Stefano Brivio
2024-05-14 1:03 ` [PATCH v5 03/19] flow: Record the pifs for each side of each flow David Gibson
2024-05-14 1:03 ` [PATCH v5 04/19] tcp: Remove interim 'tapside' field from connection David Gibson
2024-05-14 1:03 ` [PATCH v5 05/19] flow: Common data structures for tracking flow addresses David Gibson
2024-05-14 1:03 ` [PATCH v5 06/19] flow: Populate address information for initiating side David Gibson
[not found] ` <20240516202337.1b90e5f2@elisabeth>
[not found] ` <ZkbcwkdEwjGv6uwG@zatzit>
[not found] ` <20240517215845.4d09eaae@elisabeth>
2024-05-18 7:00 ` David Gibson
2024-05-14 1:03 ` [PATCH v5 07/19] flow: Populate address information for non-initiating side David Gibson
2024-05-14 1:03 ` [PATCH v5 08/19] tcp, flow: Remove redundant information, repack connection structures David Gibson
2024-05-14 1:03 ` [PATCH v5 09/19] tcp: Obtain guest address from flowside David Gibson
2024-05-14 1:03 ` [PATCH v5 10/19] tcp: Simplify endpoint validation using flowside information David Gibson
2024-05-14 1:03 ` [PATCH v5 11/19] tcp_splice: Eliminate SPLICE_V6 flag David Gibson
2024-05-14 1:03 ` [PATCH v5 12/19] tcp, flow: Replace TCP specific hash function with general flow hash David Gibson
2024-05-14 1:03 ` [PATCH v5 13/19] flow, tcp: Generalise TCP hash table to general flow hash table David Gibson
2024-05-14 1:03 ` [PATCH v5 14/19] tcp: Re-use flow hash for initial sequence number generation David Gibson
2024-05-14 1:03 ` [PATCH v5 15/19] icmp: Use flowsides as the source of truth wherever possible David Gibson
[not found] ` <20240516225350.06aebcd7@elisabeth>
[not found] ` <ZkcAHhCpx3F0SW2K@zatzit>
[not found] ` <20240517221123.1c7197a3@elisabeth>
2024-05-18 7:08 ` David Gibson
2024-05-14 1:03 ` [PATCH v5 16/19] icmp: Look up ping flows using flow hash David Gibson
2024-05-14 1:03 ` [PATCH v5 17/19] icmp: Eliminate icmp_id_map David Gibson
2024-05-14 1:03 ` [PATCH v5 18/19] flow, tcp: Flow based NAT and port forwarding for TCP David Gibson
[not found] ` <20240518001345.2d127b09@elisabeth>
2024-05-20 5:44 ` David Gibson
2024-05-14 1:03 ` [PATCH v5 19/19] flow, icmp: Use general flow forwarding rules for ICMP David Gibson
[not found] ` <20240518001408.004011b2@elisabeth>
2024-05-20 5:56 ` David Gibson
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20240514010337.1104606-2-david@gibson.dropbear.id.au \
--to=david@gibson.dropbear.id.au \
--cc=passt-dev@passt.top \
--cc=sbrivio@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
Code repositories for project(s) associated with this public inbox
https://passt.top/passt
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for IMAP folder(s).