public inbox for passt-dev@passt.top

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
 /* SPDX-License-Identifier: GPL-2.0-or-later
 * Copyright Red Hat
 * Author: David Gibson <david@gibson.dropbear.id.au>
 *
 * Tracking for logical "flows" of packets.
 */

#include <stdint.h>
#include <unistd.h>
#include <string.h>
#include <errno.h>
#include <arpa/inet.h>

#include "util.h"
#include "passt.h"
#include "siphash.h"
#include "inany.h"
#include "flow.h"
#include "flow_table.h"

const char *flow_type_str[] = {
	[FLOW_TYPE_NONE]	= "<none>",
	[FLOW_TCP]		= "TCP connection",
	[FLOW_TCP_SPLICE]	= "TCP connection (spliced)",
};
static_assert(ARRAY_SIZE(flow_type_str) == FLOW_NUM_TYPES,
	      "flow_type_str[] doesn't match enum flow_type");

const uint8_t flow_proto[] = {
	[FLOW_TCP]		= IPPROTO_TCP,
	[FLOW_TCP_SPLICE]	= IPPROTO_TCP,
};
static_assert(ARRAY_SIZE(flow_proto) == FLOW_NUM_TYPES,
	      "flow_proto[] doesn't match enum flow_type");

/* Global Flow Table */
unsigned flow_first_free;
union flow flowtab[FLOW_MAX];

/* Last time the flow timers ran */
static struct timespec flow_timer_run;

/** flow_log_ - Log flow-related message
 * @f:		flow the message is related to
 * @pri:	Log priority
 * @fmt:	Format string
 * @...:	printf-arguments
 */
void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...)
{
	char msg[BUFSIZ];
	va_list args;

	va_start(args, fmt);
	(void)vsnprintf(msg, sizeof(msg), fmt, args);
	va_end(args);

	logmsg(pri, "Flow %u (%s): %s", flow_idx(f), FLOW_TYPE(f), msg);
}

/**
 * flow_new_dbg() - Print debug message for new flow
 * @f:		Common flow structure
 * @side:	Which side initiated the new flow
 */
void flow_new_dbg(const struct flow_common *f, unsigned side)
{
	char ebuf[INET6_ADDRSTRLEN], fbuf[INET6_ADDRSTRLEN];
	const struct flowside *fside = &f->side[side];

	flow_log_(f, LOG_DEBUG, "New %s from %s/%u: [%s]:%hu <-> [%s]:%hu",
		  flow_type_str[f->type], pif_name(fside->pif), side,
		  inet_ntop(AF_INET6, &fside->faddr, fbuf, sizeof(fbuf)),
		  fside->fport,
		  inet_ntop(AF_INET6, &fside->eaddr, ebuf, sizeof(ebuf)),
		  fside->eport);
}

/**
 * flow_fwd_dbg() - Print debug message for newly forwarded flow
 * @f:		Common flow structure
 * @side:	Which side was the flow forwarded to
 */
void flow_fwd_dbg(const struct flow_common *f, unsigned side)
{
	char ebuf[INET6_ADDRSTRLEN], fbuf[INET6_ADDRSTRLEN];
	const struct flowside *fside = &f->side[side];

	inet_ntop(AF_INET6, &fside->eaddr, ebuf, sizeof(ebuf));
	inet_ntop(AF_INET6, &fside->faddr, fbuf, sizeof(fbuf));

	flow_log_(f, LOG_DEBUG, "Forwarded to %s/%u: [%s]:%hu <-> [%s]:%hu",
		  pif_name(fside->pif), side,
		  inet_ntop(AF_INET6, &fside->faddr, fbuf, sizeof(fbuf)),
		  fside->fport,
		  inet_ntop(AF_INET6, &fside->eaddr, ebuf, sizeof(ebuf)),
		  fside->eport);
}

/** flowside_from_sock - Initialize flowside to match an existing socket
 * @fside:	flowside to initialize
 * @pif:	pif id of this flowside
 * @s:		socket
 * @fsa:	Local addr of @s as sockaddr_in or sockaddr_in6, or NULL
 * @esa:	Remote addr of @s as sockaddr_in or sockaddr_in6, or NULL
 *
 * If NULL is passed for either @fsa/@esa, we use getsockname()/getpeername() to
 * obtain the information from the @s.
 *
 * #syscalls getsockname getpeername
 */
int flowside_from_sock(struct flowside *fside, uint8_t pif, int s,
		       const void *fsa, const void *esa)
{
	struct sockaddr_storage sa;

	fside->pif = pif;

	if (!fsa) {
		socklen_t sl = sizeof(sa);
		if (getsockname(s, (struct sockaddr *)&sa, &sl) < 0)
			return -errno;
		fsa = &sa;
	}
	inany_from_sockaddr(&fside->faddr, &fside->fport,
			    (const struct sockaddr *)fsa);

	if (!esa) {
		socklen_t sl = sizeof(sa);
		if (getpeername(s, (struct sockaddr *)&sa, &sl) < 0)
			return -errno;
		esa = &sa;
	}
	inany_from_sockaddr(&fside->eaddr, &fside->eport,
			    (const struct sockaddr *)esa);

	return 0;
}

/**
 * DOC: Theory of Operation - allocation and freeing of flow entries
 *
 * Each flow takes a single slot in flowtab[].  Moving entries in that table
 * (which we used to do) is fiddly and possibly expensive: it requires updating
 * the hash table indexing flows, and may require updating epoll data which
 * references the flow by index.  However, we also want to keep the active
 * entries in the table compact where possible, because otherwise scanning
 * through the entire table becomes more expensive.  This describes the
 * compromise implemented below.
 *
 * Free blocks
 *    A "free block" is a contiguous sequence of unused (FLOW_TYPE_NONE) entries
 *    in flowtab.  The first entry in each block contains metadata, specifically
 *    the number of entries in the block, and the index of the next (non
 *    contiguous) free block (struct flow_free_block).
 *
 * Free block list
 *    flow_first_free gives the index of the first entry of the first (lowest
 *    index) free block.  Each free block has the index of the next free block,
 *    or MAX_FLOW if it is the last free block.  Together these form a linked
 *    list of free blocks, in strictly increasing order of index.
 *
 * Allocation
 *    When allocating a new flow, we always use the first entry of the first
 *    free block, that is, at index flow_first_free.  If the block has more than
 *    one entry, flow_first_free is updated to the next entry, which is updated
 *    to represent the new smaller free block.  Otherwise the free block is
 *    eliminated and flow_first_free is updated to the next free block.
 *
 * Scanning the table
 *    Theoretically, scanning the table requires FLOW_MAX iterations.  However,
 *    when we encounter the start of a free block, we can immediately skip to
 *    its end, meaning that in practice we only need (number of active
 *    connections) + (number of free blocks) iterations.
 *
 * Freeing
 *    We can only free entries when scanning the whole flow table in
 *    flow_defer_handler().  This is what lets us maintain the fee block list in
 *    index sorted order.  As we scan we keep track of whether the previous
 *    entry was in a free block or not.  If so when an entry is freed (its
 *    deferred handler returns 'true'), we add it to that free block.  Otherwise
 *    we create a new free block for it and chain it to the last free block we
 *    scanned.
 */

/**
 * flow_alloc() - Allocate a new flow
 *
 * Return: pointer to an unused flow entry, or NULL if the table is full
 */
union flow *flow_alloc(void)
{
	union flow *flow = &flowtab[flow_first_free];

	if (flow_first_free >= FLOW_MAX)
		return NULL;

	ASSERT(flow->f.type == FLOW_TYPE_NONE);
	ASSERT(flow->free.n >= 1);

	if (flow->free.n > 1) {
		/* Use one entry from the block */
		union flow *next = &flowtab[++flow_first_free];

		ASSERT(FLOW_IDX(next) < FLOW_MAX);
		ASSERT(next->f.type == FLOW_TYPE_NONE);
		ASSERT(next->free.n == 0);

		next->free.n = flow->free.n - 1;
		next->free.next = flow->free.next;
	} else {
		/* Use the entire block */
		flow_first_free = flow->free.next;
	}

	memset(flow, 0, sizeof(*flow));
	return flow;
}

/**
 * flow_alloc_cancel() - Free a newly allocated flow
 * @flow:	Flow to deallocate
 *
 * @flow must be the last flow allocated by flow_alloc()
 */
void flow_alloc_cancel(union flow *flow)
{
	ASSERT(flow_first_free > FLOW_IDX(flow));

	flow->f.type = FLOW_TYPE_NONE;
	/* Put it back in a length 1 free block, don't attempt to fully reverse
	 * flow_alloc()s steps.  This will get folded together the next time
	 * flow_defer_handler runs anyway() */
	flow->free.n = 1;
	flow->free.next = flow_first_free;
	flow_first_free = FLOW_IDX(flow);
}

/**
 * flow_hash() - Calculate hash value for one side of a flow
 * @c:		Execution context
 * @proto:	Protocol of this flow (IP L4 protocol number)
 * @fside:	Flowside
 *
 * Return: hash value
 */
uint64_t flow_hash(const struct ctx *c, uint8_t proto,
		   const struct flowside *fside)
{
	struct siphash_state state = SIPHASH_INIT(c->hash_secret);

	ASSERT(flowside_complete(fside));
	inany_siphash_feed(&state, &fside->faddr);
	inany_siphash_feed(&state, &fside->eaddr);
	return siphash_final(&state, 38, (uint64_t)proto << 40 |
			     (uint64_t)fside->pif << 32 |
			     fside->fport << 16 | fside->eport);
}

/**
 * flow_defer_handler() - Handler for per-flow deferred and timed tasks
 * @c:		Execution context
 * @now:	Current timestamp
 */
void flow_defer_handler(const struct ctx *c, const struct timespec *now)
{
	struct flow_free_block *free_head = NULL;
	unsigned *last_next = &flow_first_free;
	bool timer = false;
	unsigned idx;

	if (timespec_diff_ms(now, &flow_timer_run) >= FLOW_TIMER_INTERVAL) {
		timer = true;
		flow_timer_run = *now;
	}

	for (idx = 0; idx < FLOW_MAX; idx++) {
		union flow *flow = &flowtab[idx];
		bool closed = false;

		if (flow->f.type == FLOW_TYPE_NONE) {
			/* Start of a free block */
			free_head = &flow->free;
			*last_next = idx;
			last_next = &free_head->next;
			/* Skip the rest of the block */
			idx += free_head->n - 1;
			continue;
		}

		
		switch (flow->f.type) {
		case FLOW_TYPE_NONE:
			closed = true;
			break;
		case FLOW_TCP:
			closed = tcp_flow_defer(flow);
			break;
		case FLOW_TCP_SPLICE:
			closed = tcp_splice_flow_defer(flow);
			if (!closed && timer)
				tcp_splice_timer(c, flow);
			break;
		default:
			/* Assume other flow types don't need any handling */
			;
		}

		if (closed) {
			flow->f.type = FLOW_TYPE_NONE;

			if (free_head) {
				/* Add slot to current free block */
				ASSERT(idx == FLOW_IDX(free_head) + free_head->n);
				free_head->n++;
				flow->free.n = flow->free.next = 0;
			} else {
				/* Create new free block */
				free_head = &flow->free;
				free_head->n = 1;
				*last_next = idx;
				last_next = &free_head->next;
			}
		} else {
			free_head = NULL;
		}
	}

	*last_next = FLOW_MAX;
}

/**
 * flow_init() - Initialise flow related data structures
 */
void flow_init(void)
{
	/* Initial state is a single free block containing the whole table */
	flowtab[0].free.n = FLOW_MAX;
	flowtab[0].free.next = FLOW_MAX;
}