From mboxrd@z Thu Jan 1 00:00:00 1970 Received: from mail.ozlabs.org (gandalf.ozlabs.org [150.107.74.76]) by passt.top (Postfix) with ESMTPS id 8C13B5A02D5 for ; Tue, 14 May 2024 03:03:46 +0200 (CEST) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gibson.dropbear.id.au; s=202312; t=1715648621; bh=XpE5cg0d2VB1/7/oxM2n0tGtZoVwSPn1U4tH5nZCu/c=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=k9k1KMmkSPIhSSeFMk5yeZCsHn02BYDcAT2GQfSqCMSk7o7blVQY4uYYhohuCczDY uMsJm5yDQ/dP5OTBaya7rs6oSOBv20sk8Iwam5M2Sl8SR28OuTXyunow7nCJwlSkA0 jix0yqyHNAN37tHUwJJxux+bp3UQ4xhKE515nQzQPu4w3c4wXaJ6cchBCKqI2xAXaP MN74TS97O496sWWB5xEKyhXa6RfKZAvjNPw4L0noqiewhrz1bvdskxcekgpARc+2Bk QZxSkHuD0TPSDHcnMnMtqNeneRv9FwNxUWzzs7p8SfaHCsThG3yEWN5rlfPkOoaHYF aYItvf0QwZN+Q== Received: by gandalf.ozlabs.org (Postfix, from userid 1007) id 4VddQj6Kczz4wcg; Tue, 14 May 2024 11:03:41 +1000 (AEST) From: David Gibson To: Stefano Brivio , passt-dev@passt.top Subject: [PATCH v5 01/19] flow: Clarify and enforce flow state transitions Date: Tue, 14 May 2024 11:03:19 +1000 Message-ID: <20240514010337.1104606-2-david@gibson.dropbear.id.au> X-Mailer: git-send-email 2.45.0 In-Reply-To: <20240514010337.1104606-1-david@gibson.dropbear.id.au> References: <20240514010337.1104606-1-david@gibson.dropbear.id.au> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit Message-ID-Hash: 6QDWA3JEIDBDWZKDTXGBP5V7CK5UVVDP X-Message-ID-Hash: 6QDWA3JEIDBDWZKDTXGBP5V7CK5UVVDP X-MailFrom: dgibson@gandalf.ozlabs.org X-Mailman-Rule-Misses: dmarc-mitigation; no-senders; approved; emergency; loop; banned-address; member-moderation; nonmember-moderation; administrivia; implicit-dest; max-recipients; max-size; news-moderation; no-subject; digests; suspicious-header CC: David Gibson X-Mailman-Version: 3.3.8 Precedence: list List-Id: Development discussion and patches for passt Archived-At: Archived-At: List-Archive: List-Archive: List-Help: List-Owner: List-Post: List-Subscribe: List-Unsubscribe: Flows move over several different states in their lifetime. The rules for these are documented in comments, but they're pretty complex and a number of the transitions are implicit, which makes this pretty fragile and error prone. Change the code to explicitly track the states in a field. Make all transitions explicit and logged. To the extent that it's practical in C, enforce what can and can't be done in various states with ASSERT()s. While we're at it, tweak the docs to clarify the restrictions on each state a bit. Signed-off-by: David Gibson --- flow.c | 144 ++++++++++++++++++++++++++++++--------------------- flow.h | 67 ++++++++++++++++++++++-- flow_table.h | 10 ++++ icmp.c | 4 +- tcp.c | 8 ++- tcp_splice.c | 4 +- 6 files changed, 168 insertions(+), 69 deletions(-) diff --git a/flow.c b/flow.c index 80dd269..768e0f6 100644 --- a/flow.c +++ b/flow.c @@ -18,6 +18,15 @@ #include "flow.h" #include "flow_table.h" +const char *flow_state_str[] = { + [FLOW_STATE_FREE] = "FREE", + [FLOW_STATE_NEW] = "NEW", + [FLOW_STATE_TYPED] = "TYPED", + [FLOW_STATE_ACTIVE] = "ACTIVE", +}; +static_assert(ARRAY_SIZE(flow_state_str) == FLOW_NUM_STATES, + "flow_state_str[] doesn't match enum flow_state"); + const char *flow_type_str[] = { [FLOW_TYPE_NONE] = "", [FLOW_TCP] = "TCP connection", @@ -39,46 +48,6 @@ static_assert(ARRAY_SIZE(flow_proto) == FLOW_NUM_TYPES, /* Global Flow Table */ -/** - * DOC: Theory of Operation - flow entry life cycle - * - * An individual flow table entry moves through these logical states, usually in - * this order. - * - * FREE - Part of the general pool of free flow table entries - * Operations: - * - flow_alloc() finds an entry and moves it to ALLOC state - * - * ALLOC - A tentatively allocated entry - * Operations: - * - flow_alloc_cancel() returns the entry to FREE state - * - FLOW_START() set the entry's type and moves to START state - * Caveats: - * - It's not safe to write fields in the flow entry - * - It's not safe to allocate further entries with flow_alloc() - * - It's not safe to return to the main epoll loop (use FLOW_START() - * to move to START state before doing so) - * - It's not safe to use flow_*() logging functions - * - * START - An entry being prepared by flow type specific code - * Operations: - * - Flow type specific fields may be accessed - * - flow_*() logging functions - * - flow_alloc_cancel() returns the entry to FREE state - * Caveats: - * - Returning to the main epoll loop or allocating another entry - * with flow_alloc() implicitly moves the entry to ACTIVE state. - * - * ACTIVE - An active flow entry managed by flow type specific code - * Operations: - * - Flow type specific fields may be accessed - * - flow_*() logging functions - * - Flow may be expired by returning 'true' from flow type specific - * deferred or timer handler. This will return it to FREE state. - * Caveats: - * - It's not safe to call flow_alloc_cancel() - */ - /** * DOC: Theory of Operation - allocating and freeing flow entries * @@ -132,6 +101,7 @@ static_assert(ARRAY_SIZE(flow_proto) == FLOW_NUM_TYPES, unsigned flow_first_free; union flow flowtab[FLOW_MAX]; +static const union flow *flow_new_entry; /* = NULL */ /* Last time the flow timers ran */ static struct timespec flow_timer_run; @@ -144,6 +114,7 @@ static struct timespec flow_timer_run; */ void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...) { + const char *typestate; char msg[BUFSIZ]; va_list args; @@ -151,40 +122,65 @@ void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...) (void)vsnprintf(msg, sizeof(msg), fmt, args); va_end(args); - logmsg(pri, "Flow %u (%s): %s", flow_idx(f), FLOW_TYPE(f), msg); + /* Show type if it's set, otherwise the state */ + if (f->state < FLOW_STATE_TYPED) + typestate = FLOW_STATE(f); + else + typestate = FLOW_TYPE(f); + + logmsg(pri, "Flow %u (%s): %s", flow_idx(f), typestate, msg); +} + +/** + * flow_set_state() - Change flow's state + * @f: Flow to update + * @state: New state + */ +static void flow_set_state(struct flow_common *f, enum flow_state state) +{ + uint8_t oldstate = f->state; + + ASSERT(state < FLOW_NUM_STATES); + ASSERT(oldstate < FLOW_NUM_STATES); + + f->state = state; + flow_log_(f, LOG_DEBUG, "%s -> %s", flow_state_str[oldstate], + FLOW_STATE(f)); } /** - * flow_start() - Set flow type for new flow and log - * @flow: Flow to set type for + * flow_set_type() - Set type and mvoe to TYPED state + * @flow: Flow to change state * @type: Type for new flow * @iniside: Which side initiated the new flow * * Return: @flow - * - * Should be called before setting any flow type specific fields in the flow - * table entry. */ -union flow *flow_start(union flow *flow, enum flow_type type, - unsigned iniside) +union flow *flow_set_type(union flow *flow, enum flow_type type, + unsigned iniside) { + struct flow_common *f = &flow->f; + + ASSERT(type != FLOW_TYPE_NONE); + ASSERT(flow_new_entry == flow && f->state == FLOW_STATE_NEW); + ASSERT(f->type == FLOW_TYPE_NONE); + (void)iniside; - flow->f.type = type; - flow_dbg(flow, "START %s", flow_type_str[flow->f.type]); + f->type = type; + flow_set_state(f, FLOW_STATE_TYPED); return flow; } /** - * flow_end() - Clear flow type for finished flow and log - * @flow: Flow to clear + * flow_activate() - Move flow to ACTIVE state + * @f: Flow to change state */ -static void flow_end(union flow *flow) +void flow_activate(struct flow_common *f) { - if (flow->f.type == FLOW_TYPE_NONE) - return; /* Nothing to do */ + ASSERT(&flow_new_entry->f == f && f->state == FLOW_STATE_TYPED); - flow_dbg(flow, "END %s", flow_type_str[flow->f.type]); - flow->f.type = FLOW_TYPE_NONE; + flow_set_state(f, FLOW_STATE_ACTIVE); + flow_new_entry = NULL; } /** @@ -196,9 +192,12 @@ union flow *flow_alloc(void) { union flow *flow = &flowtab[flow_first_free]; + ASSERT(!flow_new_entry); + if (flow_first_free >= FLOW_MAX) return NULL; + ASSERT(flow->f.state == FLOW_STATE_FREE); ASSERT(flow->f.type == FLOW_TYPE_NONE); ASSERT(flow->free.n >= 1); ASSERT(flow_first_free + flow->free.n <= FLOW_MAX); @@ -221,7 +220,10 @@ union flow *flow_alloc(void) flow_first_free = flow->free.next; } + flow_new_entry = flow; memset(flow, 0, sizeof(*flow)); + flow_set_state(&flow->f, FLOW_STATE_NEW); + return flow; } @@ -233,15 +235,21 @@ union flow *flow_alloc(void) */ void flow_alloc_cancel(union flow *flow) { + ASSERT(flow_new_entry == flow); + ASSERT(flow->f.state == FLOW_STATE_NEW || + flow->f.state == FLOW_STATE_TYPED); ASSERT(flow_first_free > FLOW_IDX(flow)); - flow_end(flow); + flow_set_state(&flow->f, FLOW_STATE_FREE); + memset(flow, 0, sizeof(*flow)); + /* Put it back in a length 1 free cluster, don't attempt to fully * reverse flow_alloc()s steps. This will get folded together the next * time flow_defer_handler runs anyway() */ flow->free.n = 1; flow->free.next = flow_first_free; flow_first_free = FLOW_IDX(flow); + flow_new_entry = NULL; } /** @@ -265,7 +273,8 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now) union flow *flow = &flowtab[idx]; bool closed = false; - if (flow->f.type == FLOW_TYPE_NONE) { + switch (flow->f.state) { + case FLOW_STATE_FREE: { unsigned skip = flow->free.n; /* First entry of a free cluster must have n >= 1 */ @@ -287,6 +296,20 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now) continue; } + case FLOW_STATE_NEW: + case FLOW_STATE_TYPED: + flow_err(flow, "Incomplete flow at end of cycle"); + ASSERT(false); + break; + + case FLOW_STATE_ACTIVE: + /* Nothing to do */ + break; + + default: + ASSERT(false); + } + switch (flow->f.type) { case FLOW_TYPE_NONE: ASSERT(false); @@ -310,7 +333,8 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now) } if (closed) { - flow_end(flow); + flow_set_state(&flow->f, FLOW_STATE_FREE); + memset(flow, 0, sizeof(*flow)); if (free_head) { /* Add slot to current free cluster */ diff --git a/flow.h b/flow.h index c943c44..073a734 100644 --- a/flow.h +++ b/flow.h @@ -9,6 +9,66 @@ #define FLOW_TIMER_INTERVAL 1000 /* ms */ +/** + * enum flow_state - States of a flow table entry + * + * An individual flow table entry moves through these states, usually in this + * order. + * General rules: + * - Code outside flow.c should never write common fields of union flow. + * - The state field may always be read. + * + * FREE - Part of the general pool of free flow table entries + * Operations: + * - flow_alloc() finds an entry and moves it to NEW state + * + * NEW - Freshly allocated, uninitialised entry + * Operations: + * - flow_alloc_cancel() returns the entry to FREE state + * - FLOW_SET_TYPE() sets the entry's type and moves to TYPED state + * Caveats: + * - No fields other than state may be accessed. + * - At most one entry may be in NEW or TYPED state at a time, so it's + * unsafe to use flow_alloc() again until this entry moves to + * ACTIVE or FREE state + * - You may not return to the main epoll loop while an entry is in + * NEW state. + * + * TYPED - Generic info initialised, type specific initialisation underway + * Operations: + * - All common fields may be read + * - Type specific fields may be read and written + * - flow_alloc_cancel() returns the entry to FREE state + * - FLOW_ACTIVATE() moves the entry to ACTIVE STATE + * Caveats: + * - At most one entry may be in NEW or TYPED state at a time, so it's + * unsafe to use flow_alloc() again until this entry moves to + * ACTIVE or FREE state + * - You may not return to the main epoll loop while an entry is in + * TYPED state. + * + * ACTIVE - An active, fully-initialised flow entry + * Operations: + * - All common fields may be read + * - Type specific fields may be read and written + * - Flow may be expired by returning 'true' from flow type specific + * deferred or timer handler. This will return it to FREE state. + * Caveats: + * - flow_alloc_cancel() may not be called on it + */ +enum flow_state { + FLOW_STATE_FREE, + FLOW_STATE_NEW, + FLOW_STATE_TYPED, + FLOW_STATE_ACTIVE, + + FLOW_NUM_STATES, +}; + +extern const char *flow_state_str[]; +#define FLOW_STATE(f) \ + ((f)->state < FLOW_NUM_STATES ? flow_state_str[(f)->state] : "?") + /** * enum flow_type - Different types of packet flows we track */ @@ -37,9 +97,11 @@ extern const uint8_t flow_proto[]; /** * struct flow_common - Common fields for packet flows + * @state: State of the flow table entry * @type: Type of packet flow */ struct flow_common { + uint8_t state; uint8_t type; }; @@ -49,11 +111,6 @@ struct flow_common { #define FLOW_TABLE_PRESSURE 30 /* % of FLOW_MAX */ #define FLOW_FILE_PRESSURE 30 /* % of c->nofile */ -union flow *flow_start(union flow *flow, enum flow_type type, - unsigned iniside); -#define FLOW_START(flow_, t_, var_, i_) \ - (&flow_start((flow_), (t_), (i_))->var_) - /** * struct flow_sidx - ID for one side of a specific flow * @side: Side referenced (0 or 1) diff --git a/flow_table.h b/flow_table.h index b7e5529..58014d8 100644 --- a/flow_table.h +++ b/flow_table.h @@ -107,4 +107,14 @@ static inline flow_sidx_t flow_sidx(const struct flow_common *f, union flow *flow_alloc(void); void flow_alloc_cancel(union flow *flow); +union flow *flow_set_type(union flow *flow, enum flow_type type, + unsigned iniside); +#define FLOW_SET_TYPE(flow_, t_, var_, i_) \ + (&flow_set_type((flow_), (t_), (i_))->var_) + +void flow_activate(struct flow_common *f); +#define FLOW_ACTIVATE(flow_) \ + (flow_activate(&(flow_)->f)) + + #endif /* FLOW_TABLE_H */ diff --git a/icmp.c b/icmp.c index 1c5cf84..fda868d 100644 --- a/icmp.c +++ b/icmp.c @@ -167,7 +167,7 @@ static struct icmp_ping_flow *icmp_ping_new(const struct ctx *c, if (!flow) return NULL; - pingf = FLOW_START(flow, flowtype, ping, TAPSIDE); + pingf = FLOW_SET_TYPE(flow, flowtype, ping, TAPSIDE); pingf->seq = -1; pingf->id = id; @@ -198,6 +198,8 @@ static struct icmp_ping_flow *icmp_ping_new(const struct ctx *c, *id_sock = pingf; + FLOW_ACTIVATE(pingf); + return pingf; cancel: diff --git a/tcp.c b/tcp.c index 21d0af0..65208ca 100644 --- a/tcp.c +++ b/tcp.c @@ -2006,7 +2006,7 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af, goto cancel; } - conn = FLOW_START(flow, FLOW_TCP, tcp, TAPSIDE); + conn = FLOW_SET_TYPE(flow, FLOW_TCP, tcp, TAPSIDE); conn->sock = s; conn->timer = -1; conn_event(c, conn, TAP_SYN_RCVD); @@ -2077,6 +2077,7 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af, } tcp_epoll_ctl(c, conn); + FLOW_ACTIVATE(conn); return; cancel: @@ -2724,7 +2725,8 @@ static void tcp_tap_conn_from_sock(struct ctx *c, in_port_t dstport, const union sockaddr_inany *sa, const struct timespec *now) { - struct tcp_tap_conn *conn = FLOW_START(flow, FLOW_TCP, tcp, SOCKSIDE); + struct tcp_tap_conn *conn = FLOW_SET_TYPE(flow, FLOW_TCP, tcp, + SOCKSIDE); conn->sock = s; conn->timer = -1; @@ -2747,6 +2749,8 @@ static void tcp_tap_conn_from_sock(struct ctx *c, in_port_t dstport, conn_flag(c, conn, ACK_FROM_TAP_DUE); tcp_get_sndbuf(conn); + + FLOW_ACTIVATE(conn); } /** diff --git a/tcp_splice.c b/tcp_splice.c index 4c36b72..abe98a0 100644 --- a/tcp_splice.c +++ b/tcp_splice.c @@ -472,7 +472,7 @@ bool tcp_splice_conn_from_sock(const struct ctx *c, return false; } - conn = FLOW_START(flow, FLOW_TCP_SPLICE, tcp_splice, 0); + conn = FLOW_SET_TYPE(flow, FLOW_TCP_SPLICE, tcp_splice, 0); conn->flags = af == AF_INET ? 0 : SPLICE_V6; conn->s[0] = s0; @@ -486,6 +486,8 @@ bool tcp_splice_conn_from_sock(const struct ctx *c, if (tcp_splice_connect(c, conn, af, pif1, dstport)) conn_flag(c, conn, CLOSING); + FLOW_ACTIVATE(conn); + return true; } -- 2.45.0