/* SPDX-License-Identifier: GPL-2.0-or-later * Copyright Red Hat * Author: David Gibson * * Tracking for logical "flows" of packets. */ #include #include #include #include #include #include "util.h" #include "passt.h" #include "siphash.h" #include "inany.h" #include "flow.h" #include "flow_table.h" const char *flow_type_str[] = { [FLOW_TYPE_NONE] = "", [FLOW_TCP] = "TCP connection", [FLOW_TCP_SPLICE] = "TCP connection (spliced)", }; static_assert(ARRAY_SIZE(flow_type_str) == FLOW_NUM_TYPES, "flow_type_str[] doesn't match enum flow_type"); const uint8_t flow_proto[] = { [FLOW_TCP] = IPPROTO_TCP, [FLOW_TCP_SPLICE] = IPPROTO_TCP, }; static_assert(ARRAY_SIZE(flow_proto) == FLOW_NUM_TYPES, "flow_proto[] doesn't match enum flow_type"); /* Global Flow Table */ unsigned flow_first_free; union flow flowtab[FLOW_MAX]; /* Last time the flow timers ran */ static struct timespec flow_timer_run; /* Hash table to index it */ #define FLOW_HASH_LOAD 70 /* % */ #define FLOW_HASH_SIZE ((2 * FLOW_MAX * 100 / FLOW_HASH_LOAD)) /* Table for lookup from flowside information */ static flow_sidx_t flow_hashtab[FLOW_HASH_SIZE]; static_assert(ARRAY_SIZE(flow_hashtab) >= 2 * FLOW_MAX, "Safe linear probing requires hash table with more entries than the number of sides in the flow table"); /** flow_log_ - Log flow-related message * @f: flow the message is related to * @pri: Log priority * @fmt: Format string * @...: printf-arguments */ void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...) { char msg[BUFSIZ]; va_list args; va_start(args, fmt); (void)vsnprintf(msg, sizeof(msg), fmt, args); va_end(args); logmsg(pri, "Flow %u (%s): %s", flow_idx(f), FLOW_TYPE(f), msg); } /** * flow_new_dbg() - Print debug message for new flow * @f: Common flow structure * @side: Which side initiated the new flow */ void flow_new_dbg(const struct flow_common *f, unsigned side) { char ebuf[INET6_ADDRSTRLEN], fbuf[INET6_ADDRSTRLEN]; const struct flowside *fside = &f->side[side]; flow_log_(f, LOG_DEBUG, "New %s from %s/%u: [%s]:%hu <-> [%s]:%hu", flow_type_str[f->type], pif_name(fside->pif), side, inet_ntop(AF_INET6, &fside->faddr, fbuf, sizeof(fbuf)), fside->fport, inet_ntop(AF_INET6, &fside->eaddr, ebuf, sizeof(ebuf)), fside->eport); } /** * flow_fwd_dbg() - Print debug message for newly forwarded flow * @f: Common flow structure * @side: Which side was the flow forwarded to */ void flow_fwd_dbg(const struct flow_common *f, unsigned side) { char ebuf[INET6_ADDRSTRLEN], fbuf[INET6_ADDRSTRLEN]; const struct flowside *fside = &f->side[side]; inet_ntop(AF_INET6, &fside->eaddr, ebuf, sizeof(ebuf)); inet_ntop(AF_INET6, &fside->faddr, fbuf, sizeof(fbuf)); flow_log_(f, LOG_DEBUG, "Forwarded to %s/%u: [%s]:%hu <-> [%s]:%hu", pif_name(fside->pif), side, inet_ntop(AF_INET6, &fside->faddr, fbuf, sizeof(fbuf)), fside->fport, inet_ntop(AF_INET6, &fside->eaddr, ebuf, sizeof(ebuf)), fside->eport); } /** flowside_from_sock - Initialize flowside to match an existing socket * @fside: flowside to initialize * @pif: pif id of this flowside * @s: socket * @fsa: Local addr of @s as sockaddr_in or sockaddr_in6, or NULL * @esa: Remote addr of @s as sockaddr_in or sockaddr_in6, or NULL * * If NULL is passed for either @fsa/@esa, we use getsockname()/getpeername() to * obtain the information from the @s. * * #syscalls getsockname getpeername */ int flowside_from_sock(struct flowside *fside, uint8_t pif, int s, const void *fsa, const void *esa) { struct sockaddr_storage sa; fside->pif = pif; if (!fsa) { socklen_t sl = sizeof(sa); if (getsockname(s, (struct sockaddr *)&sa, &sl) < 0) return -errno; fsa = &sa; } inany_from_sockaddr(&fside->faddr, &fside->fport, (const struct sockaddr *)fsa); if (!esa) { socklen_t sl = sizeof(sa); if (getpeername(s, (struct sockaddr *)&sa, &sl) < 0) return -errno; esa = &sa; } inany_from_sockaddr(&fside->eaddr, &fside->eport, (const struct sockaddr *)esa); return 0; } /** * DOC: Theory of Operation - allocation and freeing of flow entries * * Each flow takes a single slot in flowtab[]. Moving entries in that table * (which we used to do) is fiddly and possibly expensive: it requires updating * the hash table indexing flows, and may require updating epoll data which * references the flow by index. However, we also want to keep the active * entries in the table compact where possible, because otherwise scanning * through the entire table becomes more expensive. This describes the * compromise implemented below. * * Free blocks * A "free block" is a contiguous sequence of unused (FLOW_TYPE_NONE) entries * in flowtab. The first entry in each block contains metadata, specifically * the number of entries in the block, and the index of the next (non * contiguous) free block (struct flow_free_block). * * Free block list * flow_first_free gives the index of the first entry of the first (lowest * index) free block. Each free block has the index of the next free block, * or MAX_FLOW if it is the last free block. Together these form a linked * list of free blocks, in strictly increasing order of index. * * Allocation * When allocating a new flow, we always use the first entry of the first * free block, that is, at index flow_first_free. If the block has more than * one entry, flow_first_free is updated to the next entry, which is updated * to represent the new smaller free block. Otherwise the free block is * eliminated and flow_first_free is updated to the next free block. * * Scanning the table * Theoretically, scanning the table requires FLOW_MAX iterations. However, * when we encounter the start of a free block, we can immediately skip to * its end, meaning that in practice we only need (number of active * connections) + (number of free blocks) iterations. * * Freeing * We can only free entries when scanning the whole flow table in * flow_defer_handler(). This is what lets us maintain the fee block list in * index sorted order. As we scan we keep track of whether the previous * entry was in a free block or not. If so when an entry is freed (its * deferred handler returns 'true'), we add it to that free block. Otherwise * we create a new free block for it and chain it to the last free block we * scanned. */ /** * flow_alloc() - Allocate a new flow * * Return: pointer to an unused flow entry, or NULL if the table is full */ union flow *flow_alloc(void) { union flow *flow = &flowtab[flow_first_free]; if (flow_first_free >= FLOW_MAX) return NULL; ASSERT(flow->f.type == FLOW_TYPE_NONE); ASSERT(flow->free.n >= 1); if (flow->free.n > 1) { /* Use one entry from the block */ union flow *next = &flowtab[++flow_first_free]; ASSERT(FLOW_IDX(next) < FLOW_MAX); ASSERT(next->f.type == FLOW_TYPE_NONE); ASSERT(next->free.n == 0); next->free.n = flow->free.n - 1; next->free.next = flow->free.next; } else { /* Use the entire block */ flow_first_free = flow->free.next; } memset(flow, 0, sizeof(*flow)); return flow; } /** * flow_alloc_cancel() - Free a newly allocated flow * @flow: Flow to deallocate * * @flow must be the last flow allocated by flow_alloc() */ void flow_alloc_cancel(union flow *flow) { ASSERT(flow_first_free > FLOW_IDX(flow)); flow->f.type = FLOW_TYPE_NONE; /* Put it back in a length 1 free block, don't attempt to fully reverse * flow_alloc()s steps. This will get folded together the next time * flow_defer_handler runs anyway() */ flow->free.n = 1; flow->free.next = flow_first_free; flow_first_free = FLOW_IDX(flow); } /** * flow_hash() - Calculate hash value for one side of a flow * @c: Execution context * @proto: Protocol of this flow (IP L4 protocol number) * @fside: Flowside * * Return: hash value */ static uint64_t flow_hash(const struct ctx *c, uint8_t proto, const struct flowside *fside) { struct siphash_state state = SIPHASH_INIT(c->hash_secret); ASSERT(flowside_complete(fside)); inany_siphash_feed(&state, &fside->faddr); inany_siphash_feed(&state, &fside->eaddr); return siphash_final(&state, 38, (uint64_t)proto << 40 | (uint64_t)fside->pif << 32 | fside->fport << 16 | fside->eport); } /** * flow_sidx_hash() - Calculate hash value for given side of a given flow * @c: Execution context * @sidx: Flow & side index to get hash for * * Return: hash value, of the flow & side represented by @sidx */ static uint64_t flow_sidx_hash(const struct ctx *c, flow_sidx_t sidx) { const struct flow_common *f = &flow_at_sidx(sidx)->f; return flow_hash(c, FLOW_PROTO(f), &f->side[sidx.side]); } /** * flow_hash_probe_() - Find hash bucket for a flow, given hash * @hash: Raw hash value for flow & side * @sidx: Flow and side to find bucket for * * Return: If @sidx is in the hash table, its current bucket, otherwise a * suitable free bucket for it. */ static inline unsigned flow_hash_probe_(uint64_t hash, flow_sidx_t sidx) { unsigned b = hash % FLOW_HASH_SIZE; /* Linear probing */ while (!flow_sidx_eq(flow_hashtab[b], FLOW_SIDX_NONE) && !flow_sidx_eq(flow_hashtab[b], sidx)) b = mod_sub(b, 1, FLOW_HASH_SIZE); return b; } /** * flow_hash_probe() - Find hash bucket for a flow * @c: Execution context * @sidx: Flow and side to find bucket for * * Return: If @sidx is in the hash table, its current bucket, otherwise a * suitable free bucket for it. */ static inline unsigned flow_hash_probe(const struct ctx *c, flow_sidx_t sidx) { return flow_hash_probe_(flow_sidx_hash(c, sidx), sidx); } /** * flow_hash_insert() - Insert side of a flow into into hash table * @c: Execution context * @sidx: Flow & side index * * Return: raw (un-modded) hash value of side of flow */ uint64_t flow_hash_insert(const struct ctx *c, flow_sidx_t sidx) { uint64_t hash = flow_sidx_hash(c, sidx); unsigned b = flow_hash_probe_(hash, sidx); flow_hashtab[b] = sidx; flow_dbg(flow_at_sidx(sidx), "hash table insert: bucket: %u", b); return hash; } /** * flow_hash_remove() - Drop side of a flow from the hash table * @c: Execution context * @sidx: Side of flow to remove */ void flow_hash_remove(const struct ctx *c, flow_sidx_t sidx) { unsigned b = flow_hash_probe(c, sidx), s; if (flow_sidx_eq(flow_hashtab[b], FLOW_SIDX_NONE)) return; /* Redundant remove */ flow_dbg(flow_at_sidx(sidx), "hash table remove: bucket: %u", b); /* Scan the remainder of the cluster */ for (s = mod_sub(b, 1, FLOW_HASH_SIZE); !flow_sidx_eq(flow_hashtab[s], FLOW_SIDX_NONE); s = mod_sub(s, 1, FLOW_HASH_SIZE)) { unsigned h = flow_sidx_hash(c, flow_hashtab[s]) % FLOW_HASH_SIZE; if (!mod_between(h, s, b, FLOW_HASH_SIZE)) { /* flow_hashtab[s] can live in flow_hashtab[b]'s slot */ debug("hash table remove: shuffle %u -> %u", s, b); flow_hashtab[b] = flow_hashtab[s]; b = s; } } flow_hashtab[b] = FLOW_SIDX_NONE; } /** * flow_hash_lookup() - Look up a flow given addressing information * @c: Execution context * @proto: Protocol of the flow (IP L4 protocol number) * @pif: Interface of the flow * @af: Address family, AF_INET or AF_INET6 * @eaddr: Guest side endpoint address (guest local address) * @faddr: Guest side forwarding address (guest remote address) * @eport: Guest side endpoint port (guest local port) * @fport: Guest side forwarding port (guest remote port) * * Return: sidx of the matching flow & side, FLOW_SIDX_NONE if not found */ flow_sidx_t flow_hash_lookup(const struct ctx *c, uint8_t proto, uint8_t pif, int af, const void *eaddr, const void *faddr, in_port_t eport, in_port_t fport) { struct flowside fside; union flow *flow; int b; flowside_from_af(&fside, pif, af, faddr, fport, eaddr, eport); b = flow_hash(c, proto, &fside) % FLOW_HASH_SIZE; while ((flow = flow_at_sidx(flow_hashtab[b])) && FLOW_PROTO(&flow->f) == proto && !flowside_eq(&flow->f.side[flow_hashtab[b].side], &fside)) b = (b + 1) % FLOW_HASH_SIZE; return flow_hashtab[b]; } /** * flow_defer_handler() - Handler for per-flow deferred and timed tasks * @c: Execution context * @now: Current timestamp */ void flow_defer_handler(const struct ctx *c, const struct timespec *now) { struct flow_free_block *free_head = NULL; unsigned *last_next = &flow_first_free; bool timer = false; unsigned idx; if (timespec_diff_ms(now, &flow_timer_run) >= FLOW_TIMER_INTERVAL) { timer = true; flow_timer_run = *now; } for (idx = 0; idx < FLOW_MAX; idx++) { union flow *flow = &flowtab[idx]; bool closed = false; if (flow->f.type == FLOW_TYPE_NONE) { /* Start of a free block */ free_head = &flow->free; *last_next = idx; last_next = &free_head->next; /* Skip the rest of the block */ idx += free_head->n - 1; continue; } switch (flow->f.type) { case FLOW_TYPE_NONE: closed = true; break; case FLOW_TCP: closed = tcp_flow_defer(flow); break; case FLOW_TCP_SPLICE: closed = tcp_splice_flow_defer(flow); if (!closed && timer) tcp_splice_timer(c, flow); break; default: /* Assume other flow types don't need any handling */ ; } if (closed) { flow->f.type = FLOW_TYPE_NONE; if (free_head) { /* Add slot to current free block */ ASSERT(idx == FLOW_IDX(free_head) + free_head->n); free_head->n++; flow->free.n = flow->free.next = 0; } else { /* Create new free block */ free_head = &flow->free; free_head->n = 1; *last_next = idx; last_next = &free_head->next; } } else { free_head = NULL; } } *last_next = FLOW_MAX; } /** * flow_init() - Initialise flow related data structures */ void flow_init(void) { unsigned b; /* Initial state is a single free block containing the whole table */ flowtab[0].free.n = FLOW_MAX; flowtab[0].free.next = FLOW_MAX; for (b = 0; b < FLOW_HASH_SIZE; b++) flow_hashtab[b] = FLOW_SIDX_NONE; }