public inbox for passt-dev@passt.top
From: David Gibson <david@gibson.dropbear.id.au>
To: Stefano Brivio <sbrivio@redhat.com>, passt-dev@passt.top
Cc: jmaloy@redhat.com, David Gibson <david@gibson.dropbear.id.au>
Subject: [PATCH v8 20/27] udp: Create flows for datagrams from originating sockets
Date: Thu, 18 Jul 2024 15:26:46 +1000	[thread overview]
Message-ID: <20240718052653.3241585-21-david@gibson.dropbear.id.au> (raw)
In-Reply-To: <20240718052653.3241585-1-david@gibson.dropbear.id.au>

This implements the first steps of tracking UDP packets with the flow table
rather than UDP's own (buggy) set of port maps.  Specifically, we create
flow table entries for datagrams received from a socket (PIF_HOST or
PIF_SPLICE).

When splitting datagrams from sockets into batches, we group by the flow
as well as splicesrc.  This may result in smaller batches, but makes things
easier down the line.  We can re-optimise this later if necessary.  For now
we don't do anything else with the flow, not even match reply packets to
the same flow.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
 Makefile     |   2 +-
 flow.c       |  32 ++++++++++
 flow.h       |   4 ++
 flow_table.h |  15 +++++
 udp.c        | 169 +++++++++++++++++++++++++++++++++++++++++++++++++--
 udp_flow.h   |  25 ++++++++
 6 files changed, 242 insertions(+), 5 deletions(-)
 create mode 100644 udp_flow.h
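
For reviewers skimming the diff: below is a condensed sketch of the
lookup-or-create path this patch adds for datagrams arriving on a
"listening" socket (simplified from udp_flow_from_sock() in udp.c further
down; error reporting and the batching changes are omitted):

	/* Reuse the flow if the (pif, source address, forwarding port)
	 * tuple already hashes to a FLOW_UDP entry, otherwise initiate a
	 * new one.  Either way, return the sidx of the side the datagram
	 * must be forwarded *to*.
	 */
	sidx = flow_lookup_sa(c, IPPROTO_UDP, ref.udp.pif,
			      &meta->s_in, ref.udp.port);
	if ((uflow = udp_at_sidx(sidx))) {
		uflow->ts = now->tv_sec;	/* refresh activity timestamp */
		return flow_sidx_opposite(sidx);
	}

	if (!(flow = flow_alloc()))		/* flow table is full */
		return FLOW_SIDX_NONE;

	flow_initiate_sa(flow, ref.udp.pif, &meta->s_in, ref.udp.port);
	return udp_flow_new(c, flow, now);	/* hashes INISIDE, returns TGTSIDE */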

diff --git a/Makefile b/Makefile
index 09fc461d..92cbd5a6 100644
--- a/Makefile
+++ b/Makefile
@@ -57,7 +57,7 @@ PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h flow.h fwd.h \
 	flow_table.h icmp.h icmp_flow.h inany.h iov.h ip.h isolation.h \
 	lineread.h log.h ndp.h netlink.h packet.h passt.h pasta.h pcap.h pif.h \
 	siphash.h tap.h tcp.h tcp_buf.h tcp_conn.h tcp_internal.h tcp_splice.h \
-	udp.h util.h
+	udp.h udp_flow.h util.h
 HEADERS = $(PASST_HEADERS) seccomp.h
 
 C := \#include <linux/tcp.h>\nstruct tcp_info x = { .tcpi_snd_wnd = 0 };
diff --git a/flow.c b/flow.c
index 27340df9..4e337d42 100644
--- a/flow.c
+++ b/flow.c
@@ -37,6 +37,7 @@ const char *flow_type_str[] = {
 	[FLOW_TCP_SPLICE]	= "TCP connection (spliced)",
 	[FLOW_PING4]		= "ICMP ping sequence",
 	[FLOW_PING6]		= "ICMPv6 ping sequence",
+	[FLOW_UDP]		= "UDP flow",
 };
 static_assert(ARRAY_SIZE(flow_type_str) == FLOW_NUM_TYPES,
 	      "flow_type_str[] doesn't match enum flow_type");
@@ -46,6 +47,7 @@ const uint8_t flow_proto[] = {
 	[FLOW_TCP_SPLICE]	= IPPROTO_TCP,
 	[FLOW_PING4]		= IPPROTO_ICMP,
 	[FLOW_PING6]		= IPPROTO_ICMPV6,
+	[FLOW_UDP]		= IPPROTO_UDP,
 };
 static_assert(ARRAY_SIZE(flow_proto) == FLOW_NUM_TYPES,
 	      "flow_proto[] doesn't match enum flow_type");
@@ -701,6 +703,32 @@ flow_sidx_t flow_lookup_af(const struct ctx *c,
 	return flowside_lookup(c, proto, pif, &side);
 }
 
+/**
+ * flow_lookup_sa() - Look up a flow given an endpoint socket address
+ * @c:		Execution context
+ * @proto:	Protocol of the flow (IP L4 protocol number)
+ * @pif:	Interface of the flow
+ * @esa:	Socket address of the endpoint
+ * @fport:	Forwarding port number
+ *
+ * Return: sidx of the matching flow & side, FLOW_SIDX_NONE if not found
+ */
+flow_sidx_t flow_lookup_sa(const struct ctx *c, uint8_t proto, uint8_t pif,
+			   const void *esa, in_port_t fport)
+{
+	struct flowside side = {
+		.fport = fport,
+	};
+
+	inany_from_sockaddr(&side.eaddr, &side.eport, esa);
+	if (inany_v4(&side.eaddr))
+		side.faddr = inany_any4;
+	else
+		side.faddr = inany_any6;
+
+	return flowside_lookup(c, proto, pif, &side);
+}
+
 /**
  * flow_defer_handler() - Handler for per-flow deferred and timed tasks
  * @c:		Execution context
@@ -780,6 +808,10 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now)
 			if (timer)
 				closed = icmp_ping_timer(c, &flow->ping, now);
 			break;
+		case FLOW_UDP:
+			if (timer)
+				closed = udp_flow_timer(c, &flow->udp, now);
+			break;
 		default:
 			/* Assume other flow types don't need any handling */
 			;
diff --git a/flow.h b/flow.h
index bf6b8459..7866477b 100644
--- a/flow.h
+++ b/flow.h
@@ -115,6 +115,8 @@ enum flow_type {
 	FLOW_PING4,
 	/* ICMPv6 echo requests from guest to host and matching replies back */
 	FLOW_PING6,
+	/* UDP pseudo-connection */
+	FLOW_UDP,
 
 	FLOW_NUM_TYPES,
 };
@@ -238,6 +240,8 @@ flow_sidx_t flow_lookup_af(const struct ctx *c,
 			   uint8_t proto, uint8_t pif, sa_family_t af,
 			   const void *eaddr, const void *faddr,
 			   in_port_t eport, in_port_t fport);
+flow_sidx_t flow_lookup_sa(const struct ctx *c, uint8_t proto, uint8_t pif,
+			   const void *esa, in_port_t fport);
 
 union flow;
 
diff --git a/flow_table.h b/flow_table.h
index 9d912c83..df253be4 100644
--- a/flow_table.h
+++ b/flow_table.h
@@ -9,6 +9,7 @@
 
 #include "tcp_conn.h"
 #include "icmp_flow.h"
+#include "udp_flow.h"
 
 /**
  * struct flow_free_cluster - Information about a cluster of free entries
@@ -35,6 +36,7 @@ union flow {
 	struct tcp_tap_conn tcp;
 	struct tcp_splice_conn tcp_splice;
 	struct icmp_ping_flow ping;
+	struct udp_flow udp;
 };
 
 /* Global Flow Table */
@@ -98,6 +100,19 @@ static inline uint8_t pif_at_sidx(flow_sidx_t sidx)
 	return flow->f.pif[sidx.sidei];
 }
 
+/** flow_sidx_opposite() - Get the other side of the same flow
+ * @sidx:	Flow & side index
+ *
+ * Return: sidx for the other side of the same flow as @sidx
+ */
+static inline flow_sidx_t flow_sidx_opposite(flow_sidx_t sidx)
+{
+	if (!flow_sidx_valid(sidx))
+		return FLOW_SIDX_NONE;
+
+	return (flow_sidx_t){.flowi = sidx.flowi, .sidei = !sidx.sidei};
+}
+
 /** flow_sidx() - Index of one side of a flow from common structure
  * @f:		Common flow fields pointer
  * @sidei:	Which side to refer to (0 or 1)
diff --git a/udp.c b/udp.c
index 150f970a..fdbe3968 100644
--- a/udp.c
+++ b/udp.c
@@ -15,6 +15,30 @@
 /**
  * DOC: Theory of Operation
  *
+ * UDP Flows
+ * =========
+ *
+ * UDP doesn't have true connections, but many protocols use a connection-like
+ * format.  The flow is initiated by a client sending a datagram from a port of
+ * its choosing (usually ephemeral) to a specific port (usually well known) on a
+ * server.  Both client and server address must be unicast.  The server sends
+ * replies using the same addresses & ports with src/dest swapped.
+ *
+ * We track pseudo-connections of this type as flow table entries of type
+ * FLOW_UDP.  We store the time of the last traffic on the flow in uflow->ts,
+ * and let the flow expire if there is no traffic for UDP_CONN_TIMEOUT seconds.
+ *
+ * NOTE: This won't handle multicast protocols, or some protocols with different
+ * port usage.  We'll need specific logic if we want to handle those.
+ *
+ * "Listening" sockets
+ * ===================
+ *
+ * UDP doesn't use listen(), but we consider long term sockets which are allowed
+ * to create new flows "listening" by analogy with TCP.
+ *
+ * Port tracking
+ * =============
  *
  * For UDP, a reduced version of port-based connection tracking is implemented
  * with two purposes:
@@ -122,6 +146,7 @@
 #include "tap.h"
 #include "pcap.h"
 #include "log.h"
+#include "flow_table.h"
 
 #define UDP_CONN_TIMEOUT	180 /* s, timeout for ephemeral or local bind */
 #define UDP_MAX_FRAMES		32  /* max # of frames to receive at once */
@@ -200,6 +225,7 @@ static struct ethhdr udp6_eth_hdr;
  * @taph:	Tap backend specific header
  * @s_in:	Source socket address, filled in by recvmmsg()
  * @splicesrc:	Source port for splicing, or -1 if not spliceable
+ * @tosidx:	sidx for the destination side of this datagram's flow
  */
 static struct udp_meta_t {
 	struct ipv6hdr ip6h;
@@ -208,6 +234,7 @@ static struct udp_meta_t {
 
 	union sockaddr_inany s_in;
 	int splicesrc;
+	flow_sidx_t tosidx;
 }
 #ifdef __AVX2__
 __attribute__ ((aligned(32)))
@@ -491,6 +518,115 @@ static int udp_mmh_splice_port(union epoll_ref ref, const struct mmsghdr *mmh)
 	return -1;
 }
 
+/**
+ * udp_at_sidx() - Get UDP specific flow at given sidx
+ * @sidx:    Flow and side to retrieve
+ *
+ * Return: UDP specific flow at @sidx, or NULL if @sidx is invalid.  Asserts if
+ *         the flow at @sidx is not FLOW_UDP.
+ */
+struct udp_flow *udp_at_sidx(flow_sidx_t sidx)
+{
+	union flow *flow = flow_at_sidx(sidx);
+
+	if (!flow)
+		return NULL;
+
+	ASSERT(flow->f.type == FLOW_UDP);
+	return &flow->udp;
+}
+
+/**
+ * udp_flow_close() - Close and clean up UDP flow
+ * @c:		Execution context
+ * @uflow:	UDP flow
+ */
+static void udp_flow_close(const struct ctx *c, const struct udp_flow *uflow)
+{
+	flow_hash_remove(c, FLOW_SIDX(uflow, INISIDE));
+}
+
+/**
+ * udp_flow_new() - Common setup for a new UDP flow
+ * @c:		Execution context
+ * @flow:	Initiated flow
+ * @now:	Timestamp
+ *
+ * Return: sidx of the target side of the new flow, FLOW_SIDX_NONE on failure
+ */
+static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow,
+				const struct timespec *now)
+{
+	const struct flowside *ini = &flow->f.side[INISIDE];
+	struct udp_flow *uflow = NULL;
+
+	if (!inany_is_unicast(&ini->eaddr) || ini->eport == 0) {
+		flow_trace(flow, "Invalid endpoint to initiate UDP flow");
+		goto cancel;
+	}
+
+	if (!flow_target(c, flow, IPPROTO_UDP))
+		goto cancel;
+
+	uflow = FLOW_SET_TYPE(flow, FLOW_UDP, udp);
+	uflow->ts = now->tv_sec;
+
+	flow_hash_insert(c, FLOW_SIDX(uflow, INISIDE));
+	FLOW_ACTIVATE(uflow);
+
+	return FLOW_SIDX(uflow, TGTSIDE);
+
+cancel:
+	if (uflow)
+		udp_flow_close(c, uflow);
+	flow_alloc_cancel(flow);
+	return FLOW_SIDX_NONE;
+
+}
+
+/**
+ * udp_flow_from_sock() - Find or create UDP flow for "listening" socket
+ * @c:		Execution context
+ * @ref:	epoll reference of the receiving socket
+ * @meta:	Metadata buffer for the datagram
+ * @now:	Timestamp
+ *
+ * Return: sidx for the destination side of the flow for this packet, or
+ *         FLOW_SIDX_NONE if we couldn't find or create a flow.
+ */
+static flow_sidx_t udp_flow_from_sock(const struct ctx *c, union epoll_ref ref,
+				      struct udp_meta_t *meta,
+				      const struct timespec *now)
+{
+	struct udp_flow *uflow;
+	union flow *flow;
+	flow_sidx_t sidx;
+
+	ASSERT(ref.type == EPOLL_TYPE_UDP);
+
+	/* FIXME: Match reply packets to their flow as well */
+	if (!ref.udp.orig)
+		return FLOW_SIDX_NONE;
+
+	sidx = flow_lookup_sa(c, IPPROTO_UDP, ref.udp.pif, &meta->s_in, ref.udp.port);
+	if ((uflow = udp_at_sidx(sidx))) {
+		uflow->ts = now->tv_sec;
+		return flow_sidx_opposite(sidx);
+	}
+
+	if (!(flow = flow_alloc())) {
+		char sastr[SOCKADDR_STRLEN];
+
+		debug("Couldn't allocate flow for UDP datagram from %s %s",
+		      pif_name(ref.udp.pif),
+		      sockaddr_ntop(&meta->s_in, sastr, sizeof(sastr)));
+		return FLOW_SIDX_NONE;
+	}
+
+	flow_initiate_sa(flow, ref.udp.pif, &meta->s_in, ref.udp.port);
+	return udp_flow_new(c, flow, now);
+}
+
 /**
  * udp_splice_prepare() - Prepare one datagram for splicing
  * @mmh:	Receiving mmsghdr array
@@ -848,12 +984,15 @@ void udp_buf_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t eve
 		dstport += c->udp.fwd_in.f.delta[dstport];
 
 	/* We divide datagrams into batches based on how we need to send them,
-	 * determined by udp_meta[i].splicesrc.  To avoid either two passes
-	 * through the array, or recalculating splicesrc for a single entry, we
-	 * have to populate it one entry *ahead* of the loop counter.
+	 * determined by udp_meta[i].splicesrc and udp_meta[i].tosidx.  To avoid
+	 * either two passes through the array, or recalculating splicesrc
+	 * and tosidx for a single entry, we have to populate them one entry
+	 * *ahead* of the loop counter.
 	 */
 	udp_meta[0].splicesrc = udp_mmh_splice_port(ref, mmh_recv);
+	udp_meta[0].tosidx = udp_flow_from_sock(c, ref, &udp_meta[0], now);
 	for (i = 0; i < n; ) {
+		flow_sidx_t batchsidx = udp_meta[i].tosidx;
 		int batchsrc = udp_meta[i].splicesrc;
 		int batchstart = i;
 
@@ -870,7 +1009,11 @@ void udp_buf_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t eve
 
 			udp_meta[i].splicesrc = udp_mmh_splice_port(ref,
 								    &mmh_recv[i]);
-		} while (udp_meta[i].splicesrc == batchsrc);
+			udp_meta[i].tosidx = udp_flow_from_sock(c, ref,
+								&udp_meta[i],
+								now);
+		} while (flow_sidx_eq(udp_meta[i].tosidx, batchsidx) &&
+			 udp_meta[i].splicesrc == batchsrc);
 
 		if (batchsrc >= 0) {
 			udp_splice_send(c, batchstart, i - batchstart,
@@ -1268,6 +1411,24 @@ static int udp_port_rebind_outbound(void *arg)
 	return 0;
 }
 
+/**
+ * udp_flow_timer() - Handler for timed events related to a given flow
+ * @c:		Execution context
+ * @uflow:	UDP flow
+ * @now:	Current timestamp
+ *
+ * Return: true if the flow is ready to free, false otherwise
+ */
+bool udp_flow_timer(const struct ctx *c, const struct udp_flow *uflow,
+		    const struct timespec *now)
+{
+	if (now->tv_sec - uflow->ts <= UDP_CONN_TIMEOUT)
+		return false;
+
+	udp_flow_close(c, uflow);
+	return true;
+}
+
 /**
  * udp_timer() - Scan activity bitmaps for ports with associated timed events
  * @c:		Execution context
diff --git a/udp_flow.h b/udp_flow.h
new file mode 100644
index 00000000..18af9ac4
--- /dev/null
+++ b/udp_flow.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later
+ * Copyright Red Hat
+ * Author: David Gibson <david@gibson.dropbear.id.au>
+ *
+ * UDP flow tracking data structures
+ */
+#ifndef UDP_FLOW_H
+#define UDP_FLOW_H
+
+/**
+ * struct udp_flow - Descriptor for a flow of UDP packets
+ * @f:		Generic flow information
+ * @ts:		Activity timestamp
+ */
+struct udp_flow {
+	/* Must be first element */
+	struct flow_common f;
+
+	time_t ts;
+};
+
+bool udp_flow_timer(const struct ctx *c, const struct udp_flow *uflow,
+		    const struct timespec *now);
+
+#endif /* UDP_FLOW_H */
-- 
2.45.2
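
A note on flow lifetime, summarising the "UDP Flows" theory-of-operation
comment and udp_flow_timer() in the patch above: every datagram matched to a
flow refreshes uflow->ts, and the per-flow timer scan in flow_defer_handler()
frees the entry once it has been idle for more than UDP_CONN_TIMEOUT (180s).
Condensed from the diff, with comments added:

	/* Called from flow_defer_handler() on timer ticks for FLOW_UDP entries */
	bool udp_flow_timer(const struct ctx *c, const struct udp_flow *uflow,
			    const struct timespec *now)
	{
		if (now->tv_sec - uflow->ts <= UDP_CONN_TIMEOUT)
			return false;		/* recent traffic: keep the flow */

		udp_flow_close(c, uflow);	/* drop the flow hash entry */
		return true;			/* caller may free the table slot */
	}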


Thread overview: 30+ messages
2024-07-18  5:26 [PATCH v8 00/27] Unified flow table David Gibson
2024-07-18  5:26 ` [PATCH v8 01/27] flow: Common address information for initiating side David Gibson
2024-07-18  5:26 ` [PATCH v8 02/27] flow: Common address information for target side David Gibson
2024-07-18  5:26 ` [PATCH v8 03/27] tcp, flow: Remove redundant information, repack connection structures David Gibson
2024-07-18  5:26 ` [PATCH v8 04/27] tcp: Obtain guest address from flowside David Gibson
2024-07-18  5:26 ` [PATCH v8 05/27] tcp: Manage outbound address via flow table David Gibson
2024-07-18  5:26 ` [PATCH v8 06/27] tcp: Simplify endpoint validation using flowside information David Gibson
2024-07-18  5:26 ` [PATCH v8 07/27] tcp_splice: Eliminate SPLICE_V6 flag David Gibson
2024-07-18  5:26 ` [PATCH v8 08/27] tcp, flow: Replace TCP specific hash function with general flow hash David Gibson
2024-07-18  5:26 ` [PATCH v8 09/27] flow, tcp: Generalise TCP hash table to general flow hash table David Gibson
2024-07-18  5:26 ` [PATCH v8 10/27] tcp: Re-use flow hash for initial sequence number generation David Gibson
2024-07-18  5:26 ` [PATCH v8 11/27] icmp: Remove redundant id field from flow table entry David Gibson
2024-07-18  5:26 ` [PATCH v8 12/27] icmp: Obtain destination addresses from the flowsides David Gibson
2024-07-18  5:26 ` [PATCH v8 13/27] icmp: Look up ping flows using flow hash David Gibson
2024-07-18  5:26 ` [PATCH v8 14/27] icmp: Eliminate icmp_id_map David Gibson
2024-07-18  5:26 ` [PATCH v8 15/27] flow: Helper to create sockets based on flowside David Gibson
2024-07-18  5:26 ` [PATCH v8 16/27] icmp: Manage outbound socket address via flow table David Gibson
2024-07-18  5:26 ` [PATCH v8 17/27] flow, tcp: Flow based NAT and port forwarding for TCP David Gibson
2024-07-18  5:26 ` [PATCH v8 18/27] flow, icmp: Use general flow forwarding rules for ICMP David Gibson
2024-07-18  5:26 ` [PATCH v8 19/27] fwd: Update flow forwarding logic for UDP David Gibson
2024-07-18  5:26 ` David Gibson [this message]
2024-07-18  5:26 ` [PATCH v8 21/27] udp: Handle "spliced" datagrams with per-flow sockets David Gibson
2024-07-18  5:26 ` [PATCH v8 22/27] udp: Remove obsolete splice tracking David Gibson
2024-07-18  5:26 ` [PATCH v8 23/27] udp: Find or create flows for datagrams from tap interface David Gibson
2024-07-18  5:26 ` [PATCH v8 24/27] udp: Direct datagrams from host to guest via flow table David Gibson
2024-07-18  5:26 ` [PATCH v8 25/27] udp: Remove obsolete socket tracking David Gibson
2024-07-18  5:26 ` [PATCH v8 26/27] udp: Remove rdelta port forwarding maps David Gibson
2024-07-18  5:26 ` [PATCH v8 27/27] udp: Rename UDP listening sockets David Gibson
2024-07-19 19:20 ` [PATCH v8 00/27] Unified flow table Stefano Brivio
2024-07-20  3:37   ` David Gibson

Code repositories for project(s) associated with this public inbox

	https://passt.top/passt
