From mboxrd@z Thu Jan 1 00:00:00 1970 Received: from mail.ozlabs.org (gandalf.ozlabs.org [150.107.74.76]) by passt.top (Postfix) with ESMTPS id 558A95A05BD for ; Fri, 3 May 2024 03:11:47 +0200 (CEST) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gibson.dropbear.id.au; s=202312; t=1714698697; bh=V9QcvsNeAJFEWHHhhpDKQbfR1Rd7fMqLY+GUjwy5Tvk=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=Cn8v5jgxYv/onvQ98SSY3Y7/5MPVnMu1+qiFgXjW/e7kgHDpC8VwuN+RxUuL3LoTm dnVktr4mQEO32h/FaW6yw0Xo0RbBS4zyHYN95fj5mP7tEwg8kdzybaEXB+PhKGxLJa Bo5brcylqM1GRQZLN0FxgbGAQ2CSxSGQSGbWDamtlMXVC6wAITYiANyuF1+ERwhy3U 44IWk2ksWtDDTHf/ynR81AddZNuDLfktKNlWAxv3o38GulQOmbl5LdrhsyifNhzP+a D5hdvH3R4aRTCimMh5Ke5RVxrwDZoFaCHOiW0lVwSHiXapaztXRnwE2Rkd4dL8DvyU kbvZeOXb2iW3w== Received: by gandalf.ozlabs.org (Postfix, from userid 1007) id 4VVt6x5vsZz4xPM; Fri, 3 May 2024 11:11:37 +1000 (AEST) From: David Gibson To: passt-dev@passt.top, Stefano Brivio Subject: [PATCH v4 09/16] flow, tcp: Generalise TCP hash table to general flow hash table Date: Fri, 3 May 2024 11:11:28 +1000 Message-ID: <20240503011135.2924437-10-david@gibson.dropbear.id.au> X-Mailer: git-send-email 2.44.0 In-Reply-To: <20240503011135.2924437-1-david@gibson.dropbear.id.au> References: <20240503011135.2924437-1-david@gibson.dropbear.id.au> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit Message-ID-Hash: AYI3D52JHASI4ILEPMSUDGX7NG6WK4BC X-Message-ID-Hash: AYI3D52JHASI4ILEPMSUDGX7NG6WK4BC X-MailFrom: dgibson@gandalf.ozlabs.org X-Mailman-Rule-Misses: dmarc-mitigation; no-senders; approved; emergency; loop; banned-address; member-moderation; nonmember-moderation; administrivia; implicit-dest; max-recipients; max-size; news-moderation; no-subject; digests; suspicious-header CC: David Gibson X-Mailman-Version: 3.3.8 Precedence: list List-Id: Development discussion and patches for passt Archived-At: Archived-At: List-Archive: List-Archive: List-Help: List-Owner: List-Post: List-Subscribe: List-Unsubscribe: Move the data structures and helper functions for the TCP hash table to flow.c, making it a general hash table indexing sides of flows. This is largely code motion and straightforward renames. There are two semantic changes: * flow_hash_lookup() now needs to verify that the entry has a matching protocol as well as matching addresses, ports and interface * We double the size of the hash table, because it's now at least theoretically possible for both sides of each flow to be hashed. Signed-off-by: David Gibson --- flow.c | 129 +++++++++++++++++++++++++++++++++++++++++++++++++- flow.h | 10 ++-- tcp.c | 146 +++++---------------------------------------------------- 3 files changed, 147 insertions(+), 138 deletions(-) diff --git a/flow.c b/flow.c index 370d422..8cefbed 100644 --- a/flow.c +++ b/flow.c @@ -138,6 +138,16 @@ union flow flowtab[FLOW_MAX]; /* Last time the flow timers ran */ static struct timespec flow_timer_run; +/* Hash table to index it */ +#define FLOW_HASH_LOAD 70 /* % */ +#define FLOW_HASH_SIZE ((2 * FLOW_MAX * 100 / FLOW_HASH_LOAD)) + +/* Table for lookup from flowside information */ +static flow_sidx_t flow_hashtab[FLOW_HASH_SIZE]; + +static_assert(ARRAY_SIZE(flow_hashtab) >= 2 * FLOW_MAX, +"Safe linear probing requires hash table with more entries than the number of sides in the flow table"); + /** flow_log_ - Log flow-related message * @f: flow the message is related to * @pri: Log priority @@ -276,8 +286,8 @@ void flow_alloc_cancel(union flow *flow) * * Return: hash value */ -uint64_t flow_hash(const struct ctx *c, uint8_t proto, - const struct flowside *fside) +static uint64_t flow_hash(const struct ctx *c, uint8_t proto, + const struct flowside *fside) { struct siphash_state state = SIPHASH_INIT(c->hash_secret); @@ -296,6 +306,116 @@ uint64_t flow_hash(const struct ctx *c, uint8_t proto, fside->fport << 16 | fside->eport); } +/** + * flow_sidx_hash() - Calculate hash value for given side of a given flow + * @c: Execution context + * @sidx: Flow & side index to get hash for + * + * Return: hash value, of the flow & side represented by @sidx + */ +static uint64_t flow_sidx_hash(const struct ctx *c, flow_sidx_t sidx) +{ + const struct flow_common *f = &flow_at_sidx(sidx)->f; + return flow_hash(c, FLOW_PROTO(f), &f->side[sidx.side]); +} + +/** + * flow_hash_probe() - Find hash bucket for a flow + * @c: Execution context + * @sidx: Flow and side to find bucket for + * + * Return: If @sidx is in the hash table, its current bucket, otherwise a + * suitable free bucket for it. + */ +static inline unsigned flow_hash_probe(const struct ctx *c, flow_sidx_t sidx) +{ + unsigned b = flow_sidx_hash(c, sidx) % FLOW_HASH_SIZE; + + /* Linear probing */ + while (!flow_sidx_eq(flow_hashtab[b], FLOW_SIDX_NONE) && + !flow_sidx_eq(flow_hashtab[b], sidx)) + b = mod_sub(b, 1, FLOW_HASH_SIZE); + + return b; +} + +/** + * flow_hash_insert() - Insert side of a flow into into hash table + * @c: Execution context + * @sidx: Flow & side index + */ +void flow_hash_insert(const struct ctx *c, flow_sidx_t sidx) +{ + unsigned b = flow_hash_probe(c, sidx); + + flow_hashtab[b] = sidx; + flow_dbg(flow_at_sidx(sidx), "hash table insert: bucket: %u", b); +} + +/** + * flow_hash_remove() - Drop side of a flow from the hash table + * @c: Execution context + * @sidx: Side of flow to remove + */ +void flow_hash_remove(const struct ctx *c, flow_sidx_t sidx) +{ + unsigned b = flow_hash_probe(c, sidx), s; + + if (flow_sidx_eq(flow_hashtab[b], FLOW_SIDX_NONE)) + return; /* Redundant remove */ + + flow_dbg(flow_at_sidx(sidx), "hash table remove: bucket: %u", b); + + /* Scan the remainder of the cluster */ + for (s = mod_sub(b, 1, FLOW_HASH_SIZE); + !flow_sidx_eq(flow_hashtab[s], FLOW_SIDX_NONE); + s = mod_sub(s, 1, FLOW_HASH_SIZE)) { + unsigned h = flow_sidx_hash(c, flow_hashtab[s]) % FLOW_HASH_SIZE; + + if (!mod_between(h, s, b, FLOW_HASH_SIZE)) { + /* flow_hashtab[s] can live in flow_hashtab[b]'s slot */ + debug("hash table remove: shuffle %u -> %u", s, b); + flow_hashtab[b] = flow_hashtab[s]; + b = s; + } + } + + flow_hashtab[b] = FLOW_SIDX_NONE; +} + +/** + * flow_hash_lookup() - Look up a flow given addressing information + * @c: Execution context + * @proto: Protocol of the flow (IP L4 protocol number) + * @pif: Interface of the flow + * @af: Address family, AF_INET or AF_INET6 + * @eaddr: Guest side endpoint address (guest local address) + * @faddr: Guest side forwarding address (guest remote address) + * @eport: Guest side endpoint port (guest local port) + * @fport: Guest side forwarding port (guest remote port) + * + * Return: sidx of the matching flow & side, FLOW_SIDX_NONE if not found + */ +flow_sidx_t flow_hash_lookup(const struct ctx *c, + uint8_t proto, uint8_t pif, sa_family_t af, + const void *eaddr, const void *faddr, + in_port_t eport, in_port_t fport) +{ + struct flowside fside; + union flow *flow; + int b; + + flowside_from_af(&fside, pif, af, faddr, fport, eaddr, eport); + + b = flow_hash(c, proto, &fside) % FLOW_HASH_SIZE; + while ((flow = flow_at_sidx(flow_hashtab[b])) && + FLOW_PROTO(&flow->f) == proto && + !flowside_eq(&flow->f.side[flow_hashtab[b].side], &fside)) + b = (b + 1) % FLOW_HASH_SIZE; + + return flow_hashtab[b]; +} + /** * flow_defer_handler() - Handler for per-flow deferred and timed tasks * @c: Execution context @@ -389,7 +509,12 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now) */ void flow_init(void) { + unsigned b; + /* Initial state is a single free cluster containing the whole table */ flowtab[0].free.n = FLOW_MAX; flowtab[0].free.next = FLOW_MAX; + + for (b = 0; b < FLOW_HASH_SIZE; b++) + flow_hashtab[b] = FLOW_SIDX_NONE; } diff --git a/flow.h b/flow.h index 218edc6..c1181cc 100644 --- a/flow.h +++ b/flow.h @@ -144,9 +144,6 @@ union flow *flow_start(union flow *flow, enum flow_type type, #define FLOW_START(flow_, t_, var_, i_) \ (&flow_start((flow_), (t_), (i_))->var_) -uint64_t flow_hash(const struct ctx *c, uint8_t proto, - const struct flowside *fside); - /** * struct flow_sidx - ID for one side of a specific flow * @side: Side referenced (0 or 1) @@ -172,6 +169,13 @@ static inline bool flow_sidx_eq(flow_sidx_t a, flow_sidx_t b) return (a.flow == b.flow) && (a.side == b.side); } +void flow_hash_insert(const struct ctx *c, flow_sidx_t sidx); +void flow_hash_remove(const struct ctx *c, flow_sidx_t sidx); +flow_sidx_t flow_hash_lookup(const struct ctx *c, + uint8_t proto, uint8_t pif, sa_family_t af, + const void *eaddr, const void *faddr, + in_port_t eport, in_port_t fport); + union flow; void flow_init(void); diff --git a/tcp.c b/tcp.c index 2ad47c5..7f83014 100644 --- a/tcp.c +++ b/tcp.c @@ -311,9 +311,6 @@ #define TCP_FRAMES \ (c->mode == MODE_PASST ? TCP_FRAMES_MEM : 1) -#define TCP_HASH_TABLE_LOAD 70 /* % */ -#define TCP_HASH_TABLE_SIZE (FLOW_MAX * 100 / TCP_HASH_TABLE_LOAD) - #define MAX_WS 8 #define MAX_WINDOW (1 << (16 + (MAX_WS))) @@ -524,12 +521,6 @@ static struct iovec tcp_iov [UIO_MAXIOV]; #define CONN(idx) (&(FLOW(idx)->tcp)) -/* Table for lookup from flowside information */ -static flow_sidx_t tc_hash[TCP_HASH_TABLE_SIZE]; - -static_assert(ARRAY_SIZE(tc_hash) >= FLOW_MAX, - "Safe linear probing requires hash table larger than connection table"); - /* Pools for pre-opened sockets (in init) */ int init_sock_pool4 [TCP_SOCK_POOL_SIZE]; int init_sock_pool6 [TCP_SOCK_POOL_SIZE]; @@ -723,9 +714,6 @@ static void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn, tcp_timer_ctl(c, conn); } -static void tcp_hash_remove(const struct ctx *c, - const struct tcp_tap_conn *conn); - /** * conn_event_do() - Set and log connection events, update epoll state * @c: Execution context @@ -771,7 +759,7 @@ static void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn, num == -1 ? "CLOSED" : tcp_event_str[num]); if (event == CLOSED) - tcp_hash_remove(c, conn); + flow_hash_remove(c, FLOW_SIDX(conn, TAPSIDE)); else if ((event == TAP_FIN_RCVD) && !(conn->events & SOCK_FIN_RCVD)) conn_flag(c, conn, ACTIVE_CLOSE); else @@ -1074,116 +1062,6 @@ static int tcp_opt_get(const char *opts, size_t len, uint8_t type_find, return -1; } -/** - * tcp_conn_hash() - Calculate hash bucket of an existing connection - * @c: Execution context - * @conn: Connection - * - * Return: hash value, needs to be adjusted for table size - */ -static uint64_t tcp_conn_hash(const struct ctx *c, - const struct tcp_tap_conn *conn) -{ - return flow_hash(c, IPPROTO_TCP, &conn->f.side[TAPSIDE]); -} - -/** - * tcp_hash_probe() - Find hash bucket for a connection - * @c: Execution context - * @conn: Connection to find bucket for - * - * Return: If @conn is in the table, its current bucket, otherwise a suitable - * free bucket for it. - */ -static inline unsigned tcp_hash_probe(const struct ctx *c, - const struct tcp_tap_conn *conn) -{ - flow_sidx_t sidx = FLOW_SIDX(conn, TAPSIDE); - unsigned b = tcp_conn_hash(c, conn) % TCP_HASH_TABLE_SIZE; - - /* Linear probing */ - while (!flow_sidx_eq(tc_hash[b], FLOW_SIDX_NONE) && - !flow_sidx_eq(tc_hash[b], sidx)) - b = mod_sub(b, 1, TCP_HASH_TABLE_SIZE); - - return b; -} - -/** - * tcp_hash_insert() - Insert connection into hash table, chain link - * @c: Execution context - * @conn: Connection pointer - */ -static void tcp_hash_insert(const struct ctx *c, struct tcp_tap_conn *conn) -{ - unsigned b = tcp_hash_probe(c, conn); - - tc_hash[b] = FLOW_SIDX(conn, TAPSIDE); - flow_dbg(conn, "hash table insert: sock %i, bucket: %u", conn->sock, b); -} - -/** - * tcp_hash_remove() - Drop connection from hash table, chain unlink - * @c: Execution context - * @conn: Connection pointer - */ -static void tcp_hash_remove(const struct ctx *c, - const struct tcp_tap_conn *conn) -{ - unsigned b = tcp_hash_probe(c, conn), s; - union flow *flow = flow_at_sidx(tc_hash[b]); - - if (!flow) - return; /* Redundant remove */ - - flow_dbg(conn, "hash table remove: sock %i, bucket: %u", conn->sock, b); - - /* Scan the remainder of the cluster */ - for (s = mod_sub(b, 1, TCP_HASH_TABLE_SIZE); - (flow = flow_at_sidx(tc_hash[s])); - s = mod_sub(s, 1, TCP_HASH_TABLE_SIZE)) { - unsigned h = tcp_conn_hash(c, &flow->tcp) % TCP_HASH_TABLE_SIZE; - - if (!mod_between(h, s, b, TCP_HASH_TABLE_SIZE)) { - /* tc_hash[s] can live in tc_hash[b]'s slot */ - debug("hash table remove: shuffle %u -> %u", s, b); - tc_hash[b] = tc_hash[s]; - b = s; - } - } - - tc_hash[b] = FLOW_SIDX_NONE; -} - -/** - * tcp_hash_lookup() - Look up connection given remote address and ports - * @c: Execution context - * @af: Address family, AF_INET or AF_INET6 - * @eaddr: Guest side endpoint address (guest local address) - * @faddr: Guest side forwarding address (guest remote address) - * @eport: Guest side endpoint port (guest local port) - * @fport: Guest side forwarding port (guest remote port) - * - * Return: connection pointer, if found, -ENOENT otherwise - */ -static struct tcp_tap_conn *tcp_hash_lookup(const struct ctx *c, sa_family_t af, - const void *eaddr, const void *faddr, - in_port_t eport, in_port_t fport) -{ - struct flowside fside; - union flow *flow; - unsigned b; - - flowside_from_af(&fside, PIF_TAP, af, faddr, fport, eaddr, eport); - - b = flow_hash(c, IPPROTO_TCP, &fside) % TCP_HASH_TABLE_SIZE; - while ((flow = flow_at_sidx(tc_hash[b])) && - !flowside_eq(&flow->f.side[TAPSIDE], &fside)) - b = mod_sub(b, 1, TCP_HASH_TABLE_SIZE); - - return &flow->tcp; -} - /** * tcp_flow_defer() - Deferred per-flow handling (clean up closed connections) * @flow: Flow table entry for this connection @@ -1970,7 +1848,7 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af, tcp_seq_init(c, conn, now); conn->seq_ack_from_tap = conn->seq_to_tap; - tcp_hash_insert(c, conn); + flow_hash_insert(c, FLOW_SIDX(conn, TAPSIDE)); sockaddr_from_inany(&sa, &sl, &sockside->eaddr, sockside->eport, c->ifi6); @@ -2466,6 +2344,8 @@ int tcp_tap_handler(struct ctx *c, uint8_t pif, sa_family_t af, const struct tcphdr *th; size_t optlen, len; const char *opts; + union flow *flow; + flow_sidx_t sidx; int ack_due = 0; int count; @@ -2481,17 +2361,22 @@ int tcp_tap_handler(struct ctx *c, uint8_t pif, sa_family_t af, optlen = MIN(optlen, ((1UL << 4) /* from doff width */ - 6) * 4UL); opts = packet_get(p, idx, sizeof(*th), optlen, NULL); - conn = tcp_hash_lookup(c, af, saddr, daddr, - ntohs(th->source), ntohs(th->dest)); + sidx = flow_hash_lookup(c, IPPROTO_TCP, PIF_TAP, af, saddr, daddr, + ntohs(th->source), ntohs(th->dest)); + flow = flow_at_sidx(sidx); /* New connection from tap */ - if (!conn) { + if (!flow) { if (opts && th->syn && !th->ack) tcp_conn_from_tap(c, af, saddr, daddr, th, opts, optlen, now); return 1; } + ASSERT(flow->f.type == FLOW_TCP); + ASSERT(sidx.side == TAPSIDE); + conn = &flow->tcp; + flow_trace(conn, "packet length %zu from tap", len); if (th->rst) { @@ -2671,7 +2556,7 @@ static void tcp_tap_conn_from_sock(struct ctx *c, in_port_t dstport, conn_event(c, conn, SOCK_ACCEPTED); tcp_seq_init(c, conn, now); - tcp_hash_insert(c, conn); + flow_hash_insert(c, FLOW_SIDX(conn, TAPSIDE)); conn->seq_ack_from_tap = conn->seq_to_tap; @@ -3060,11 +2945,6 @@ static void tcp_sock_refill_init(const struct ctx *c) */ int tcp_init(struct ctx *c) { - unsigned b; - - for (b = 0; b < TCP_HASH_TABLE_SIZE; b++) - tc_hash[b] = FLOW_SIDX_NONE; - if (c->ifi4) tcp_sock4_iov_init(c); -- 2.44.0