From: David Gibson <david@gibson.dropbear.id.au>
To: Stefano Brivio <sbrivio@redhat.com>, passt-dev@passt.top
Cc: jmaloy@redhat.com, David Gibson <david@gibson.dropbear.id.au>
Subject: [PATCH v7 20/27] udp: Create flows for datagrams from originating sockets
Date: Fri, 5 Jul 2024 12:07:17 +1000 [thread overview]
Message-ID: <20240705020724.3447719-21-david@gibson.dropbear.id.au> (raw)
In-Reply-To: <20240705020724.3447719-1-david@gibson.dropbear.id.au>
This implements the first steps of tracking UDP packets with the flow table
rather than it's own (buggy) set of port maps. Specifically we create flow
table entries for datagrams received from a socket (PIF_HOST or
PIF_SPLICE).
When splitting datagrams from sockets into batches, we group by the flow
as well as splicesrc. This may result in smaller batches, but makes things
easier down the line. We can re-optimise this later if necessary. For now
we don't do anything else with the flow, not even match reply packets to
the same flow.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
Makefile | 2 +-
flow.c | 31 ++++++++++
flow.h | 4 ++
flow_table.h | 14 +++++
udp.c | 169 +++++++++++++++++++++++++++++++++++++++++++++++++--
udp_flow.h | 25 ++++++++
6 files changed, 240 insertions(+), 5 deletions(-)
create mode 100644 udp_flow.h
diff --git a/Makefile b/Makefile
index 09fc461d..92cbd5a6 100644
--- a/Makefile
+++ b/Makefile
@@ -57,7 +57,7 @@ PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h flow.h fwd.h \
flow_table.h icmp.h icmp_flow.h inany.h iov.h ip.h isolation.h \
lineread.h log.h ndp.h netlink.h packet.h passt.h pasta.h pcap.h pif.h \
siphash.h tap.h tcp.h tcp_buf.h tcp_conn.h tcp_internal.h tcp_splice.h \
- udp.h util.h
+ udp.h udp_flow.h util.h
HEADERS = $(PASST_HEADERS) seccomp.h
C := \#include <linux/tcp.h>\nstruct tcp_info x = { .tcpi_snd_wnd = 0 };
diff --git a/flow.c b/flow.c
index 218033ae..0cb9495b 100644
--- a/flow.c
+++ b/flow.c
@@ -37,6 +37,7 @@ const char *flow_type_str[] = {
[FLOW_TCP_SPLICE] = "TCP connection (spliced)",
[FLOW_PING4] = "ICMP ping sequence",
[FLOW_PING6] = "ICMPv6 ping sequence",
+ [FLOW_UDP] = "UDP flow",
};
static_assert(ARRAY_SIZE(flow_type_str) == FLOW_NUM_TYPES,
"flow_type_str[] doesn't match enum flow_type");
@@ -46,6 +47,7 @@ const uint8_t flow_proto[] = {
[FLOW_TCP_SPLICE] = IPPROTO_TCP,
[FLOW_PING4] = IPPROTO_ICMP,
[FLOW_PING6] = IPPROTO_ICMPV6,
+ [FLOW_UDP] = IPPROTO_UDP,
};
static_assert(ARRAY_SIZE(flow_proto) == FLOW_NUM_TYPES,
"flow_proto[] doesn't match enum flow_type");
@@ -700,6 +702,31 @@ flow_sidx_t flow_lookup_af(const struct ctx *c,
return flowside_lookup(c, proto, pif, &fside);
}
+/**
+ * flow_lookup_sa() - Look up a flow given and endpoint socket address
+ * @c: Execution context
+ * @proto: Protocol of the flow (IP L4 protocol number)
+ * @pif: Interface of the flow
+ * @esa: Socket address of the endpoint
+ * @fport: Forwarding port number
+ *
+ * Return: sidx of the matching flow & side, FLOW_SIDX_NONE if not found
+ */
+flow_sidx_t flow_lookup_sa(const struct ctx *c, uint8_t proto, uint8_t pif,
+ const void *esa, in_port_t fport)
+{
+ struct flowside fside = {
+ .fport = fport,
+ };
+
+ inany_from_sockaddr(&fside.eaddr, &fside.eport, esa);
+ if (inany_v4(&fside.eaddr))
+ fside.faddr = inany_any4;
+ else
+ fside.faddr = inany_any6;
+ return flowside_lookup(c, proto, pif, &fside);
+}
+
/**
* flow_defer_handler() - Handler for per-flow deferred and timed tasks
* @c: Execution context
@@ -779,6 +806,10 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now)
if (timer)
closed = icmp_ping_timer(c, &flow->ping, now);
break;
+ case FLOW_UDP:
+ if (timer)
+ closed = udp_flow_timer(c, &flow->udp, now);
+ break;
default:
/* Assume other flow types don't need any handling */
;
diff --git a/flow.h b/flow.h
index e27f99be..3752e5ee 100644
--- a/flow.h
+++ b/flow.h
@@ -115,6 +115,8 @@ enum flow_type {
FLOW_PING4,
/* ICMPv6 echo requests from guest to host and matching replies back */
FLOW_PING6,
+ /* UDP pseudo-connection */
+ FLOW_UDP,
FLOW_NUM_TYPES,
};
@@ -238,6 +240,8 @@ flow_sidx_t flow_lookup_af(const struct ctx *c,
uint8_t proto, uint8_t pif, sa_family_t af,
const void *eaddr, const void *faddr,
in_port_t eport, in_port_t fport);
+flow_sidx_t flow_lookup_sa(const struct ctx *c, uint8_t proto, uint8_t pif,
+ const void *esa, in_port_t fport);
union flow;
diff --git a/flow_table.h b/flow_table.h
index 457f27b1..3fbc7c8d 100644
--- a/flow_table.h
+++ b/flow_table.h
@@ -9,6 +9,7 @@
#include "tcp_conn.h"
#include "icmp_flow.h"
+#include "udp_flow.h"
/**
* struct flow_free_cluster - Information about a cluster of free entries
@@ -35,6 +36,7 @@ union flow {
struct tcp_tap_conn tcp;
struct tcp_splice_conn tcp_splice;
struct icmp_ping_flow ping;
+ struct udp_flow udp;
};
/* Global Flow Table */
@@ -78,6 +80,18 @@ static inline union flow *flow_at_sidx(flow_sidx_t sidx)
return FLOW(sidx.flow);
}
+/** flow_sidx_opposite - Get the other side of the same flow
+ * @sidx: Flow & side index
+ *
+ * Return: sidx for the other side of the same flow as @sidx
+ */
+static inline flow_sidx_t flow_sidx_opposite(flow_sidx_t sidx)
+{
+ if (!flow_sidx_valid(sidx))
+ return FLOW_SIDX_NONE;
+ return (flow_sidx_t){.flow = sidx.flow, .side = !sidx.side};
+}
+
/** flow_sidx_t - Index of one side of a flow from common structure
* @f: Common flow fields pointer
* @side: Which side to refer to (0 or 1)
diff --git a/udp.c b/udp.c
index 6427b9ce..daf4fe26 100644
--- a/udp.c
+++ b/udp.c
@@ -15,6 +15,30 @@
/**
* DOC: Theory of Operation
*
+ * UDP Flows
+ * =========
+ *
+ * UDP doesn't have true connections, but many protocols use a connection-like
+ * format. The flow is initiated by a client sending a datagram from a port of
+ * its choosing (usually ephemeral) to a specific port (usually well known) on a
+ * server. Both client and server address must be unicast. The server sends
+ * replies using the same addresses & ports with src/dest swapped.
+ *
+ * We track pseudo-connections of this type as flow table entries of type
+ * FLOW_UDP. We store the time of the last traffic on the flow in uflow->ts,
+ * and let the flow expire if there is no traffic for UDP_CONN_TIMEOUT seconds.
+ *
+ * NOTE: This won't handle multicast protocols, or some protocols with different
+ * port usage. We'll need specific logic if we want to handle those.
+ *
+ * "Listening" sockets
+ * ===================
+ *
+ * UDP doesn't use listen(), but we consider long term sockets which are allowed
+ * to create new flows "listening" by analogy with TCP.
+ *
+ * Port tracking
+ * =============
*
* For UDP, a reduced version of port-based connection tracking is implemented
* with two purposes:
@@ -121,6 +145,7 @@
#include "tap.h"
#include "pcap.h"
#include "log.h"
+#include "flow_table.h"
#define UDP_CONN_TIMEOUT 180 /* s, timeout for ephemeral or local bind */
#define UDP_MAX_FRAMES 32 /* max # of frames to receive at once */
@@ -199,6 +224,7 @@ static struct ethhdr udp6_eth_hdr;
* @taph: Tap backend specific header
* @s_in: Source socket address, filled in by recvmmsg()
* @splicesrc: Source port for splicing, or -1 if not spliceable
+ * @tosidx: sidx for the destination side of this datagram's flow
*/
static struct udp_meta_t {
struct ipv6hdr ip6h;
@@ -207,6 +233,7 @@ static struct udp_meta_t {
union sockaddr_inany s_in;
int splicesrc;
+ flow_sidx_t tosidx;
}
#ifdef __AVX2__
__attribute__ ((aligned(32)))
@@ -490,6 +517,115 @@ static int udp_mmh_splice_port(union epoll_ref ref, const struct mmsghdr *mmh)
return -1;
}
+/**
+ * udp_at_sidx() - Get UDP specific flow at given sidx
+ * @sidx: Flow and side to retrieve
+ *
+ * Return: UDP specific flow at @sidx, or NULL of @sidx is invalid. Asserts if
+ * the flow at @sidx is not FLOW_UDP.
+ */
+struct udp_flow *udp_at_sidx(flow_sidx_t sidx)
+{
+ union flow *flow = flow_at_sidx(sidx);
+
+ if (!flow)
+ return NULL;
+
+ ASSERT(flow->f.type == FLOW_UDP);
+ return &flow->udp;
+}
+
+/*
+ * udp_flow_close() - Close and clean up UDP flow
+ * @c: Execution context
+ * @uflow: UDP flow
+ */
+static void udp_flow_close(const struct ctx *c, const struct udp_flow *uflow)
+{
+ flow_hash_remove(c, FLOW_SIDX(uflow, INISIDE));
+}
+
+/**
+ * udp_flow_new() - Common setup for a new UDP flow
+ * @c: Execution context
+ * @flow: Initiated flow
+ * @now: Timestamp
+ *
+ * Return: UDP specific flow, if successful, NULL on failure
+ */
+static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow,
+ const struct timespec *now)
+{
+ const struct flowside *ini = &flow->f.side[INISIDE];
+ struct udp_flow *uflow = NULL;
+
+ if (!inany_is_unicast(&ini->eaddr) || ini->eport == 0) {
+ flow_dbg(flow, "Invalid endpoint to initiate UDP flow");
+ goto cancel;
+ }
+
+ if (!flow_target(c, flow, IPPROTO_UDP))
+ goto cancel;
+
+ uflow = FLOW_SET_TYPE(flow, FLOW_UDP, udp);
+ uflow->ts = now->tv_sec;
+
+ flow_hash_insert(c, FLOW_SIDX(uflow, INISIDE));
+ FLOW_ACTIVATE(uflow);
+
+ return FLOW_SIDX(uflow, TGTSIDE);
+
+cancel:
+ if (uflow)
+ udp_flow_close(c, uflow);
+ flow_alloc_cancel(flow);
+ return FLOW_SIDX_NONE;
+
+}
+
+/**
+ * udp_flow_from_sock() - Find or create UDP flow for "listening" socket
+ * @c: Execution context
+ * @ref: epoll reference of the receiving socket
+ * @meta: Metadata buffer for the datagram
+ * @now: Timestamp
+ *
+ * Return: sidx for the destination side of the flow for this packet, or
+ * FLOW_SIDX_NONE if we couldn't find or create a flow.
+ */
+static flow_sidx_t udp_flow_from_sock(const struct ctx *c, union epoll_ref ref,
+ struct udp_meta_t *meta,
+ const struct timespec *now)
+{
+ struct udp_flow *uflow;
+ union flow *flow;
+ flow_sidx_t sidx;
+
+ ASSERT(ref.type == EPOLL_TYPE_UDP);
+
+ /* FIXME: Match reply packets to their flow as well */
+ if (!ref.udp.orig)
+ return FLOW_SIDX_NONE;
+
+ sidx = flow_lookup_sa(c, IPPROTO_UDP, ref.udp.pif, &meta->s_in, ref.udp.port);
+ if ((uflow = udp_at_sidx(sidx))) {
+ uflow->ts = now->tv_sec;
+ return flow_sidx_opposite(sidx);
+ }
+
+ if (!(flow = flow_alloc())) {
+ char sastr[SOCKADDR_STRLEN];
+
+ debug("Couldn't allocate flow for UDP datagram from %s %s",
+ pif_name(ref.udp.pif),
+ sockaddr_ntop(&meta->s_in, sastr, sizeof(sastr)));
+ return FLOW_SIDX_NONE;
+ }
+
+ flow_initiate_sa(flow, ref.udp.pif, &meta->s_in, ref.udp.port);
+ return udp_flow_new(c, flow, now);
+}
+
/**
* udp_splice_prepare() - Prepare one datagram for splicing
* @mmh: Receiving mmsghdr array
@@ -786,12 +922,15 @@ void udp_buf_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t eve
dstport += c->udp.fwd_in.f.delta[dstport];
/* We divide datagrams into batches based on how we need to send them,
- * determined by udp_meta[i].splicesrc. To avoid either two passes
- * through the array, or recalculating splicesrc for a single entry, we
- * have to populate it one entry *ahead* of the loop counter.
+ * determined by udp_meta[i].splicesrc and udp_meta[i].tosidx. To avoid
+ * either two passes through the array, or recalculating splicesrc and
+ * tosidxfor a single entry, we have to populate it one entry *ahead* of
+ * the loop counter.
*/
udp_meta[0].splicesrc = udp_mmh_splice_port(ref, mmh_recv);
+ udp_meta[0].tosidx = udp_flow_from_sock(c, ref, &udp_meta[0], now);
for (i = 0; i < n; ) {
+ flow_sidx_t batchsidx = udp_meta[i].tosidx;
int batchsrc = udp_meta[i].splicesrc;
int batchstart = i;
@@ -807,7 +946,11 @@ void udp_buf_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t eve
udp_meta[i].splicesrc = udp_mmh_splice_port(ref,
&mmh_recv[i]);
- } while (udp_meta[i].splicesrc == batchsrc);
+ udp_meta[i].tosidx = udp_flow_from_sock(c, ref,
+ &udp_meta[i],
+ now);
+ } while (flow_sidx_eq(udp_meta[i].tosidx, batchsidx) &&
+ udp_meta[i].splicesrc == batchsrc);
if (batchsrc >= 0)
udp_splice_send(c, batchstart, i - batchstart,
@@ -1201,6 +1344,24 @@ static int udp_port_rebind_outbound(void *arg)
return 0;
}
+/**
+ * udp_flow_timer() - Handler for timed events related to a given flow
+ * @c: Execution context
+ * @uflow: UDP flow
+ * @now: Current timestamp
+ *
+ * Return: true if the flow is ready to free, false otherwise
+ */
+bool udp_flow_timer(const struct ctx *c, const struct udp_flow *uflow,
+ const struct timespec *now)
+{
+ if (now->tv_sec - uflow->ts <= UDP_CONN_TIMEOUT)
+ return false;
+
+ udp_flow_close(c, uflow);
+ return true;
+}
+
/**
* udp_timer() - Scan activity bitmaps for ports with associated timed events
* @c: Execution context
diff --git a/udp_flow.h b/udp_flow.h
new file mode 100644
index 00000000..18af9ac4
--- /dev/null
+++ b/udp_flow.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later
+ * Copyright Red Hat
+ * Author: David Gibson <david@gibson.dropbear.id.au>
+ *
+ * UDP flow tracking data structures
+ */
+#ifndef UDP_FLOW_H
+#define UDP_FLOW_H
+
+/**
+ * struct udp - Descriptor for a flow of UDP packets
+ * @f: Generic flow information
+ * @ts: Activity timestamp
+ */
+struct udp_flow {
+ /* Must be first element */
+ struct flow_common f;
+
+ time_t ts;
+};
+
+bool udp_flow_timer(const struct ctx *c, const struct udp_flow *uflow,
+ const struct timespec *now);
+
+#endif /* UDP_FLOW_H */
--
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later
+ * Copyright Red Hat
+ * Author: David Gibson <david@gibson.dropbear.id.au>
+ *
+ * UDP flow tracking data structures
+ */
+#ifndef UDP_FLOW_H
+#define UDP_FLOW_H
+
+/**
+ * struct udp - Descriptor for a flow of UDP packets
+ * @f: Generic flow information
+ * @ts: Activity timestamp
+ */
+struct udp_flow {
+ /* Must be first element */
+ struct flow_common f;
+
+ time_t ts;
+};
+
+bool udp_flow_timer(const struct ctx *c, const struct udp_flow *uflow,
+ const struct timespec *now);
+
+#endif /* UDP_FLOW_H */
--
2.45.2
next prev parent reply other threads:[~2024-07-05 2:07 UTC|newest]
Thread overview: 59+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-07-05 2:06 [PATCH v7 00/27] Unified flow table David Gibson
2024-07-05 2:06 ` [PATCH v7 01/27] flow: Common address information for initiating side David Gibson
2024-07-05 2:06 ` [PATCH v7 02/27] flow: Common address information for target side David Gibson
2024-07-10 21:30 ` Stefano Brivio
2024-07-11 0:19 ` David Gibson
2024-07-05 2:07 ` [PATCH v7 03/27] tcp, flow: Remove redundant information, repack connection structures David Gibson
2024-07-05 2:07 ` [PATCH v7 04/27] tcp: Obtain guest address from flowside David Gibson
2024-07-05 2:07 ` [PATCH v7 05/27] tcp: Manage outbound address via flow table David Gibson
2024-07-05 2:07 ` [PATCH v7 06/27] tcp: Simplify endpoint validation using flowside information David Gibson
2024-07-05 2:07 ` [PATCH v7 07/27] tcp_splice: Eliminate SPLICE_V6 flag David Gibson
2024-07-05 2:07 ` [PATCH v7 08/27] tcp, flow: Replace TCP specific hash function with general flow hash David Gibson
2024-07-05 2:07 ` [PATCH v7 09/27] flow, tcp: Generalise TCP hash table to general flow hash table David Gibson
2024-07-05 2:07 ` [PATCH v7 10/27] tcp: Re-use flow hash for initial sequence number generation David Gibson
2024-07-05 2:07 ` [PATCH v7 11/27] icmp: Remove redundant id field from flow table entry David Gibson
2024-07-05 2:07 ` [PATCH v7 12/27] icmp: Obtain destination addresses from the flowsides David Gibson
2024-07-05 2:07 ` [PATCH v7 13/27] icmp: Look up ping flows using flow hash David Gibson
2024-07-05 2:07 ` [PATCH v7 14/27] icmp: Eliminate icmp_id_map David Gibson
2024-07-05 2:07 ` [PATCH v7 15/27] flow: Helper to create sockets based on flowside David Gibson
2024-07-10 21:32 ` Stefano Brivio
2024-07-11 0:21 ` David Gibson
2024-07-11 0:27 ` David Gibson
2024-07-05 2:07 ` [PATCH v7 16/27] icmp: Manage outbound socket address via flow table David Gibson
2024-07-05 2:07 ` [PATCH v7 17/27] flow, tcp: Flow based NAT and port forwarding for TCP David Gibson
2024-07-05 2:07 ` [PATCH v7 18/27] flow, icmp: Use general flow forwarding rules for ICMP David Gibson
2024-07-05 2:07 ` [PATCH v7 19/27] fwd: Update flow forwarding logic for UDP David Gibson
2024-07-08 21:26 ` Stefano Brivio
2024-07-09 0:19 ` David Gibson
2024-07-05 2:07 ` David Gibson [this message]
2024-07-09 22:32 ` [PATCH v7 20/27] udp: Create flows for datagrams from originating sockets Stefano Brivio
2024-07-09 23:59 ` David Gibson
2024-07-10 21:35 ` Stefano Brivio
2024-07-11 4:26 ` David Gibson
2024-07-11 8:20 ` Stefano Brivio
2024-07-11 22:58 ` David Gibson
2024-07-12 8:21 ` Stefano Brivio
2024-07-15 4:06 ` David Gibson
2024-07-15 16:37 ` Stefano Brivio
2024-07-17 0:49 ` David Gibson
2024-07-05 2:07 ` [PATCH v7 21/27] udp: Handle "spliced" datagrams with per-flow sockets David Gibson
2024-07-09 22:32 ` Stefano Brivio
2024-07-10 0:23 ` David Gibson
2024-07-10 17:13 ` Stefano Brivio
2024-07-11 1:30 ` David Gibson
2024-07-11 8:23 ` Stefano Brivio
2024-07-11 2:48 ` David Gibson
2024-07-12 13:34 ` Stefano Brivio
2024-07-15 4:32 ` David Gibson
2024-07-05 2:07 ` [PATCH v7 22/27] udp: Remove obsolete splice tracking David Gibson
2024-07-10 21:36 ` Stefano Brivio
2024-07-11 0:43 ` David Gibson
2024-07-05 2:07 ` [PATCH v7 23/27] udp: Find or create flows for datagrams from tap interface David Gibson
2024-07-10 21:36 ` Stefano Brivio
2024-07-11 0:45 ` David Gibson
2024-07-05 2:07 ` [PATCH v7 24/27] udp: Direct datagrams from host to guest via flow table David Gibson
2024-07-10 21:37 ` Stefano Brivio
2024-07-11 0:46 ` David Gibson
2024-07-05 2:07 ` [PATCH v7 25/27] udp: Remove obsolete socket tracking David Gibson
2024-07-05 2:07 ` [PATCH v7 26/27] udp: Remove rdelta port forwarding maps David Gibson
2024-07-05 2:07 ` [PATCH v7 27/27] udp: Rename UDP listening sockets David Gibson
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20240705020724.3447719-21-david@gibson.dropbear.id.au \
--to=david@gibson.dropbear.id.au \
--cc=jmaloy@redhat.com \
--cc=passt-dev@passt.top \
--cc=sbrivio@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
Code repositories for project(s) associated with this public inbox
https://passt.top/passt
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for IMAP folder(s).