1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
| | /* SPDX-License-Identifier: GPL-2.0-or-later
* Copyright Red Hat
* Author: David Gibson <david@gibson.dropbear.id.au>
*
* Tracking for logical "flows" of packets.
*/
#include <stdint.h>
#include <unistd.h>
#include <string.h>
#include "util.h"
#include "passt.h"
#include "siphash.h"
#include "inany.h"
#include "flow.h"
#include "flow_table.h"
const char *flow_type_str[] = {
[FLOW_TYPE_NONE] = "<none>",
[FLOW_TCP] = "TCP connection",
[FLOW_TCP_SPLICE] = "TCP connection (spliced)",
};
static_assert(ARRAY_SIZE(flow_type_str) == FLOW_NUM_TYPES,
"flow_type_str[] doesn't match enum flow_type");
/* Global Flow Table */
unsigned flow_first_free;
union flow flowtab[FLOW_MAX];
/* Last time the flow timers ran */
static struct timespec flow_timer_run;
/** flow_log_ - Log flow-related message
* @f: flow the message is related to
* @pri: Log priority
* @fmt: Format string
* @...: printf-arguments
*/
void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...)
{
char msg[BUFSIZ];
va_list args;
va_start(args, fmt);
(void)vsnprintf(msg, sizeof(msg), fmt, args);
va_end(args);
logmsg(pri, "Flow %u (%s): %s", flow_idx(f), FLOW_TYPE(f), msg);
}
/**
* DOC: Theory of Operation - allocation and freeing of flow entries
*
* Each flow takes a single slot in flowtab[]. Moving entries in that table
* (which we used to do) is fiddly and possibly expensive: it requires updating
* the hash table indexing flows, and may require updating epoll data which
* references the flow by index. However, we also want to keep the active
* entries in the table compact where possible, because otherwise scanning
* through the entire table becomes more expensive. This describes the
* compromise implemented below.
*
* Free blocks
* A "free block" is a contiguous sequence of unused (FLOW_TYPE_NONE) entries
* in flowtab. The first entry in each block contains metadata, specifically
* the number of entries in the block, and the index of the next (non
* contiguous) free block (struct flow_free_block).
*
* Free block list
* flow_first_free gives the index of the first entry of the first (lowest
* index) free block. Each free block has the index of the next free block,
* or MAX_FLOW if it is the last free block. Together these form a linked
* list of free blocks, in strictly increasing order of index.
*
* Allocation
* When allocating a new flow, we always use the first entry of the first
* free block, that is, at index flow_first_free. If the block has more than
* one entry, flow_first_free is updated to the next entry, which is updated
* to represent the new smaller free block. Otherwise the free block is
* eliminated and flow_first_free is updated to the next free block.
*
* Scanning the table
* Theoretically, scanning the table requires FLOW_MAX iterations. However,
* when we encounter the start of a free block, we can immediately skip to
* its end, meaning that in practice we only need (number of active
* connections) + (number of free blocks) iterations.
*
* Freeing
* We can only free entries when scanning the whole flow table in
* flow_defer_handler(). This is what lets us maintain the fee block list in
* index sorted order. As we scan we keep track of whether the previous
* entry was in a free block or not. If so when an entry is freed (its
* deferred handler returns 'true'), we add it to that free block. Otherwise
* we create a new free block for it and chain it to the last free block we
* scanned.
*/
/**
* flow_alloc() - Allocate a new flow
*
* Return: pointer to an unused flow entry, or NULL if the table is full
*/
union flow *flow_alloc(void)
{
union flow *flow = &flowtab[flow_first_free];
if (flow_first_free >= FLOW_MAX)
return NULL;
ASSERT(flow->f.type == FLOW_TYPE_NONE);
ASSERT(flow->free.n >= 1);
if (flow->free.n > 1) {
/* Use one entry from the block */
union flow *next = &flowtab[++flow_first_free];
ASSERT(FLOW_IDX(next) < FLOW_MAX);
ASSERT(next->f.type == FLOW_TYPE_NONE);
ASSERT(next->free.n == 0);
next->free.n = flow->free.n - 1;
next->free.next = flow->free.next;
} else {
/* Use the entire block */
flow_first_free = flow->free.next;
}
memset(flow, 0, sizeof(*flow));
return flow;
}
/**
* flow_alloc_cancel() - Free a newly allocated flow
* @flow: Flow to deallocate
*
* @flow must be the last flow allocated by flow_alloc()
*/
void flow_alloc_cancel(union flow *flow)
{
ASSERT(flow_first_free > FLOW_IDX(flow));
flow->f.type = FLOW_TYPE_NONE;
/* Put it back in a length 1 free block, don't attempt to fully reverse
* flow_alloc()s steps. This will get folded together the next time
* flow_defer_handler runs anyway() */
flow->free.n = 1;
flow->free.next = flow_first_free;
flow_first_free = FLOW_IDX(flow);
}
/**
* flow_defer_handler() - Handler for per-flow deferred and timed tasks
* @c: Execution context
* @now: Current timestamp
*/
void flow_defer_handler(const struct ctx *c, const struct timespec *now)
{
struct flow_free_block *free_head = NULL;
unsigned *last_next = &flow_first_free;
bool timer = false;
unsigned idx;
if (timespec_diff_ms(now, &flow_timer_run) >= FLOW_TIMER_INTERVAL) {
timer = true;
flow_timer_run = *now;
}
for (idx = 0; idx < FLOW_MAX; idx++) {
union flow *flow = &flowtab[idx];
bool closed = false;
if (flow->f.type == FLOW_TYPE_NONE) {
/* Start of a free block */
free_head = &flow->free;
*last_next = idx;
last_next = &free_head->next;
/* Skip the rest of the block */
idx += free_head->n - 1;
continue;
}
switch (flow->f.type) {
case FLOW_TYPE_NONE:
closed = true;
break;
case FLOW_TCP:
closed = tcp_flow_defer(flow);
break;
case FLOW_TCP_SPLICE:
closed = tcp_splice_flow_defer(flow);
if (!closed && timer)
tcp_splice_timer(c, flow);
break;
default:
/* Assume other flow types don't need any handling */
;
}
if (closed) {
flow->f.type = FLOW_TYPE_NONE;
if (free_head) {
/* Add slot to current free block */
ASSERT(idx == FLOW_IDX(free_head) + free_head->n);
free_head->n++;
flow->free.n = flow->free.next = 0;
} else {
/* Create new free block */
free_head = &flow->free;
free_head->n = 1;
*last_next = idx;
last_next = &free_head->next;
}
} else {
free_head = NULL;
}
}
*last_next = FLOW_MAX;
}
/**
* flow_init() - Initialise flow related data structures
*/
void flow_init(void)
{
/* Initial state is a single free block containing the whole table */
flowtab[0].free.n = FLOW_MAX;
flowtab[0].free.next = FLOW_MAX;
}
|