/* SPDX-License-Identifier: GPL-2.0-or-later
 * Copyright Red Hat
 * Author: David Gibson <david@gibson.dropbear.id.au>
 *
 * Tracking for logical "flows" of packets.
 */

#include <stdint.h>
#include <unistd.h>
#include <string.h>
#include <errno.h>
#include <arpa/inet.h>

#include "util.h"
#include "passt.h"
#include "siphash.h"
#include "inany.h"
#include "flow.h"
#include "flow_table.h"

const char *flow_type_str[] = {
	[FLOW_TYPE_NONE]	= "<none>",
	[FLOW_TCP]		= "TCP connection",
	[FLOW_TCP_SPLICE]	= "TCP connection (spliced)",
};
static_assert(ARRAY_SIZE(flow_type_str) == FLOW_NUM_TYPES,
	      "flow_type_str[] doesn't match enum flow_type");

const uint8_t flow_proto[] = {
	[FLOW_TCP]		= IPPROTO_TCP,
	[FLOW_TCP_SPLICE]	= IPPROTO_TCP,
};
static_assert(ARRAY_SIZE(flow_proto) == FLOW_NUM_TYPES,
	      "flow_proto[] doesn't match enum flow_type");

/* Global Flow Table */
unsigned flow_first_free;
union flow flowtab[FLOW_MAX];

/* Last time the flow timers ran */
static struct timespec flow_timer_run;

/* Hash table to index it */
#define FLOW_HASH_LOAD		70		/* % */
#define FLOW_HASH_SIZE		((2 * FLOW_MAX * 100 / FLOW_HASH_LOAD))

/* Table for lookup from flowside information */
static flow_sidx_t flow_hashtab[FLOW_HASH_SIZE];

static_assert(ARRAY_SIZE(flow_hashtab) >= 2 * FLOW_MAX,
"Safe linear probing requires hash table with more entries than the number of sides in the flow table");

/** flow_log_ - Log flow-related message
 * @f:		flow the message is related to
 * @pri:	Log priority
 * @fmt:	Format string
 * @...:	printf-arguments
 */
void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...)
{
	char msg[BUFSIZ];
	va_list args;

	va_start(args, fmt);
	(void)vsnprintf(msg, sizeof(msg), fmt, args);
	va_end(args);

	logmsg(pri, "Flow %u (%s): %s", flow_idx(f), FLOW_TYPE(f), msg);
}

/**
 * flow_new_dbg() - Print debug message for new flow
 * @f:		Common flow structure
 * @side:	Which side initiated the new flow
 */
void flow_new_dbg(const struct flow_common *f, unsigned side)
{
	char ebuf[INET6_ADDRSTRLEN], fbuf[INET6_ADDRSTRLEN];
	const struct flowside *fside = &f->side[side];

	flow_log_(f, LOG_DEBUG, "New %s from %s/%u: [%s]:%hu <-> [%s]:%hu",
		  flow_type_str[f->type], pif_name(fside->pif), side,
		  inet_ntop(AF_INET6, &fside->faddr, fbuf, sizeof(fbuf)),
		  fside->fport,
		  inet_ntop(AF_INET6, &fside->eaddr, ebuf, sizeof(ebuf)),
		  fside->eport);
}

/**
 * flow_fwd_dbg() - Print debug message for newly forwarded flow
 * @f:		Common flow structure
 * @side:	Which side was the flow forwarded to
 */
void flow_fwd_dbg(const struct flow_common *f, unsigned side)
{
	char ebuf[INET6_ADDRSTRLEN], fbuf[INET6_ADDRSTRLEN];
	const struct flowside *fside = &f->side[side];

	inet_ntop(AF_INET6, &fside->eaddr, ebuf, sizeof(ebuf));
	inet_ntop(AF_INET6, &fside->faddr, fbuf, sizeof(fbuf));

	flow_log_(f, LOG_DEBUG, "Forwarded to %s/%u: [%s]:%hu <-> [%s]:%hu",
		  pif_name(fside->pif), side,
		  inet_ntop(AF_INET6, &fside->faddr, fbuf, sizeof(fbuf)),
		  fside->fport,
		  inet_ntop(AF_INET6, &fside->eaddr, ebuf, sizeof(ebuf)),
		  fside->eport);
}

/** flowside_from_sock - Initialize flowside to match an existing socket
 * @fside:	flowside to initialize
 * @pif:	pif id of this flowside
 * @s:		socket
 * @fsa:	Local addr of @s as sockaddr_in or sockaddr_in6, or NULL
 * @esa:	Remote addr of @s as sockaddr_in or sockaddr_in6, or NULL
 *
 * If NULL is passed for either @fsa/@esa, we use getsockname()/getpeername() to
 * obtain the information from the @s.
 *
 * #syscalls getsockname getpeername
 */
int flowside_from_sock(struct flowside *fside, uint8_t pif, int s,
		       const void *fsa, const void *esa)
{
	struct sockaddr_storage sa;

	fside->pif = pif;

	if (!fsa) {
		socklen_t sl = sizeof(sa);
		if (getsockname(s, (struct sockaddr *)&sa, &sl) < 0)
			return -errno;
		fsa = &sa;
	}
	inany_from_sockaddr(&fside->faddr, &fside->fport,
			    (const struct sockaddr *)fsa);

	if (!esa) {
		socklen_t sl = sizeof(sa);
		if (getpeername(s, (struct sockaddr *)&sa, &sl) < 0)
			return -errno;
		esa = &sa;
	}
	inany_from_sockaddr(&fside->eaddr, &fside->eport,
			    (const struct sockaddr *)esa);

	return 0;
}

/**
 * DOC: Theory of Operation - allocation and freeing of flow entries
 *
 * Each flow takes a single slot in flowtab[].  Moving entries in that table
 * (which we used to do) is fiddly and possibly expensive: it requires updating
 * the hash table indexing flows, and may require updating epoll data which
 * references the flow by index.  However, we also want to keep the active
 * entries in the table compact where possible, because otherwise scanning
 * through the entire table becomes more expensive.  This describes the
 * compromise implemented below.
 *
 * Free blocks
 *    A "free block" is a contiguous sequence of unused (FLOW_TYPE_NONE) entries
 *    in flowtab.  The first entry in each block contains metadata, specifically
 *    the number of entries in the block, and the index of the next (non
 *    contiguous) free block (struct flow_free_block).
 *
 * Free block list
 *    flow_first_free gives the index of the first entry of the first (lowest
 *    index) free block.  Each free block has the index of the next free block,
 *    or MAX_FLOW if it is the last free block.  Together these form a linked
 *    list of free blocks, in strictly increasing order of index.
 *
 * Allocation
 *    When allocating a new flow, we always use the first entry of the first
 *    free block, that is, at index flow_first_free.  If the block has more than
 *    one entry, flow_first_free is updated to the next entry, which is updated
 *    to represent the new smaller free block.  Otherwise the free block is
 *    eliminated and flow_first_free is updated to the next free block.
 *
 * Scanning the table
 *    Theoretically, scanning the table requires FLOW_MAX iterations.  However,
 *    when we encounter the start of a free block, we can immediately skip to
 *    its end, meaning that in practice we only need (number of active
 *    connections) + (number of free blocks) iterations.
 *
 * Freeing
 *    We can only free entries when scanning the whole flow table in
 *    flow_defer_handler().  This is what lets us maintain the fee block list in
 *    index sorted order.  As we scan we keep track of whether the previous
 *    entry was in a free block or not.  If so when an entry is freed (its
 *    deferred handler returns 'true'), we add it to that free block.  Otherwise
 *    we create a new free block for it and chain it to the last free block we
 *    scanned.
 */

/**
 * flow_alloc() - Allocate a new flow
 *
 * Return: pointer to an unused flow entry, or NULL if the table is full
 */
union flow *flow_alloc(void)
{
	union flow *flow = &flowtab[flow_first_free];

	if (flow_first_free >= FLOW_MAX)
		return NULL;

	ASSERT(flow->f.type == FLOW_TYPE_NONE);
	ASSERT(flow->free.n >= 1);

	if (flow->free.n > 1) {
		/* Use one entry from the block */
		union flow *next = &flowtab[++flow_first_free];

		ASSERT(FLOW_IDX(next) < FLOW_MAX);
		ASSERT(next->f.type == FLOW_TYPE_NONE);
		ASSERT(next->free.n == 0);

		next->free.n = flow->free.n - 1;
		next->free.next = flow->free.next;
	} else {
		/* Use the entire block */
		flow_first_free = flow->free.next;
	}

	memset(flow, 0, sizeof(*flow));
	return flow;
}

/**
 * flow_alloc_cancel() - Free a newly allocated flow
 * @flow:	Flow to deallocate
 *
 * @flow must be the last flow allocated by flow_alloc()
 */
void flow_alloc_cancel(union flow *flow)
{
	ASSERT(flow_first_free > FLOW_IDX(flow));

	flow->f.type = FLOW_TYPE_NONE;
	/* Put it back in a length 1 free block, don't attempt to fully reverse
	 * flow_alloc()s steps.  This will get folded together the next time
	 * flow_defer_handler runs anyway() */
	flow->free.n = 1;
	flow->free.next = flow_first_free;
	flow_first_free = FLOW_IDX(flow);
}

/**
 * flow_hash() - Calculate hash value for one side of a flow
 * @c:		Execution context
 * @proto:	Protocol of this flow (IP L4 protocol number)
 * @fside:	Flowside
 *
 * Return: hash value
 */
static uint64_t flow_hash(const struct ctx *c, uint8_t proto,
			  const struct flowside *fside)
{
	struct siphash_state state = SIPHASH_INIT(c->hash_secret);

	ASSERT(flowside_complete(fside));
	inany_siphash_feed(&state, &fside->faddr);
	inany_siphash_feed(&state, &fside->eaddr);
	return siphash_final(&state, 38, (uint64_t)proto << 40 |
			     (uint64_t)fside->pif << 32 |
			     fside->fport << 16 | fside->eport);
}

/**
 * flow_sidx_hash() - Calculate hash value for given side of a given flow
 * @c:		Execution context
 * @sidx:	Flow & side index to get hash for
 *
 * Return: hash value, of the flow & side represented by @sidx
 */
static uint64_t flow_sidx_hash(const struct ctx *c, flow_sidx_t sidx)
{
	const struct flow_common *f = &flow_at_sidx(sidx)->f;
	return flow_hash(c, FLOW_PROTO(f), &f->side[sidx.side]);
}

/**
 * flow_hash_probe_() - Find hash bucket for a flow, given hash
 * @hash:	Raw hash value for flow & side
 * @sidx:	Flow and side to find bucket for
 *
 * Return: If @sidx is in the hash table, its current bucket, otherwise a
 *         suitable free bucket for it.
 */
static inline unsigned flow_hash_probe_(uint64_t hash, flow_sidx_t sidx)
{
	unsigned b = hash % FLOW_HASH_SIZE;

	/* Linear probing */
	while (!flow_sidx_eq(flow_hashtab[b], FLOW_SIDX_NONE) &&
	       !flow_sidx_eq(flow_hashtab[b], sidx))
		b = mod_sub(b, 1, FLOW_HASH_SIZE);

	return b;
}

/**
 * flow_hash_probe() - Find hash bucket for a flow
 * @c:		Execution context
 * @sidx:	Flow and side to find bucket for
 *
 * Return: If @sidx is in the hash table, its current bucket, otherwise a
 *         suitable free bucket for it.
 */
static inline unsigned flow_hash_probe(const struct ctx *c, flow_sidx_t sidx)
{
	return flow_hash_probe_(flow_sidx_hash(c, sidx), sidx);
}

/**
 * flow_hash_insert() - Insert side of a flow into into hash table
 * @c:		Execution context
 * @sidx:	Flow & side index
 *
 * Return: raw (un-modded) hash value of side of flow
 */
uint64_t flow_hash_insert(const struct ctx *c, flow_sidx_t sidx)
{
	uint64_t hash = flow_sidx_hash(c, sidx);
	unsigned b = flow_hash_probe_(hash, sidx);

	flow_hashtab[b] = sidx;
	flow_dbg(flow_at_sidx(sidx), "hash table insert: bucket: %u", b);

	return hash;
}

/**
 * flow_hash_remove() - Drop side of a flow from the hash table
 * @c:		Execution context
 * @sidx:	Side of flow to remove
 */
void flow_hash_remove(const struct ctx *c, flow_sidx_t sidx)
{
	unsigned b = flow_hash_probe(c, sidx), s;

	if (flow_sidx_eq(flow_hashtab[b], FLOW_SIDX_NONE))
		return; /* Redundant remove */

	flow_dbg(flow_at_sidx(sidx), "hash table remove: bucket: %u", b);

	/* Scan the remainder of the cluster */
	for (s = mod_sub(b, 1, FLOW_HASH_SIZE);
	     !flow_sidx_eq(flow_hashtab[s], FLOW_SIDX_NONE);
	     s = mod_sub(s, 1, FLOW_HASH_SIZE)) {
		unsigned h = flow_sidx_hash(c, flow_hashtab[s]) % FLOW_HASH_SIZE;

		if (!mod_between(h, s, b, FLOW_HASH_SIZE)) {
			/* flow_hashtab[s] can live in flow_hashtab[b]'s slot */
			debug("hash table remove: shuffle %u -> %u", s, b);
			flow_hashtab[b] = flow_hashtab[s];
			b = s;
		}
	}

	flow_hashtab[b] = FLOW_SIDX_NONE;
}

/**
 * flow_hash_lookup() - Look up a flow given addressing information
 * @c:		Execution context
 * @proto:	Protocol of the flow (IP L4 protocol number)
 * @pif:	Interface of the flow
 * @af:		Address family, AF_INET or AF_INET6
 * @eaddr:	Guest side endpoint address (guest local address)
 * @faddr:	Guest side forwarding address (guest remote address)
 * @eport:	Guest side endpoint port (guest local port)
 * @fport:	Guest side forwarding port (guest remote port)
 *
 * Return: sidx of the matching flow & side, FLOW_SIDX_NONE if not found
 */
flow_sidx_t flow_hash_lookup(const struct ctx *c, uint8_t proto, uint8_t pif,
			     int af, const void *eaddr, const void *faddr,
			     in_port_t eport, in_port_t fport)
{
	struct flowside fside;
	union flow *flow;
	int b;

	flowside_from_af(&fside, pif, af, faddr, fport, eaddr, eport);

	b = flow_hash(c, proto, &fside) % FLOW_HASH_SIZE;
	while ((flow = flow_at_sidx(flow_hashtab[b])) &&
	       FLOW_PROTO(&flow->f) == proto &&
	       !flowside_eq(&flow->f.side[flow_hashtab[b].side], &fside))
		b = (b + 1) % FLOW_HASH_SIZE;

	return flow_hashtab[b];
}

/**
 * flow_defer_handler() - Handler for per-flow deferred and timed tasks
 * @c:		Execution context
 * @now:	Current timestamp
 */
void flow_defer_handler(const struct ctx *c, const struct timespec *now)
{
	struct flow_free_block *free_head = NULL;
	unsigned *last_next = &flow_first_free;
	bool timer = false;
	unsigned idx;

	if (timespec_diff_ms(now, &flow_timer_run) >= FLOW_TIMER_INTERVAL) {
		timer = true;
		flow_timer_run = *now;
	}

	for (idx = 0; idx < FLOW_MAX; idx++) {
		union flow *flow = &flowtab[idx];
		bool closed = false;

		if (flow->f.type == FLOW_TYPE_NONE) {
			/* Start of a free block */
			free_head = &flow->free;
			*last_next = idx;
			last_next = &free_head->next;
			/* Skip the rest of the block */
			idx += free_head->n - 1;
			continue;
		}

		
		switch (flow->f.type) {
		case FLOW_TYPE_NONE:
			closed = true;
			break;
		case FLOW_TCP:
			closed = tcp_flow_defer(flow);
			break;
		case FLOW_TCP_SPLICE:
			closed = tcp_splice_flow_defer(flow);
			if (!closed && timer)
				tcp_splice_timer(c, flow);
			break;
		default:
			/* Assume other flow types don't need any handling */
			;
		}

		if (closed) {
			flow->f.type = FLOW_TYPE_NONE;

			if (free_head) {
				/* Add slot to current free block */
				ASSERT(idx == FLOW_IDX(free_head) + free_head->n);
				free_head->n++;
				flow->free.n = flow->free.next = 0;
			} else {
				/* Create new free block */
				free_head = &flow->free;
				free_head->n = 1;
				*last_next = idx;
				last_next = &free_head->next;
			}
		} else {
			free_head = NULL;
		}
	}

	*last_next = FLOW_MAX;
}

/**
 * flow_init() - Initialise flow related data structures
 */
void flow_init(void)
{
	unsigned b;

	/* Initial state is a single free block containing the whole table */
	flowtab[0].free.n = FLOW_MAX;
	flowtab[0].free.next = FLOW_MAX;

	for (b = 0; b < FLOW_HASH_SIZE; b++)
		flow_hashtab[b] = FLOW_SIDX_NONE;
}