public inbox for passt-dev@passt.top
 help / color / mirror / code / Atom feed
From: David Gibson <david@gibson.dropbear.id.au>
To: Stefano Brivio <sbrivio@redhat.com>, passt-dev@passt.top
Cc: jmaloy@redhat.com, David Gibson <david@gibson.dropbear.id.au>
Subject: [PATCH v8 23/27] udp: Find or create flows for datagrams from tap interface
Date: Thu, 18 Jul 2024 15:26:49 +1000	[thread overview]
Message-ID: <20240718052653.3241585-24-david@gibson.dropbear.id.au> (raw)
In-Reply-To: <20240718052653.3241585-1-david@gibson.dropbear.id.au>

Currently we create flows for datagrams from socket interfaces, and use
them to direct "spliced" (socket to socket) datagrams.  We don't yet
match datagrams from the tap interface to existing flows, nor create new
flows for them.  Add that functionality, matching datagrams from tap to
existing flows when they exist, or creating new ones.

As with spliced flows, when creating a new flow from tap to socket, we
create a new connected socket to receive reply datagrams attached to that
flow specifically. We extend udp_reply_sock_handler() to handle reply
packets bound for tap rather than another socket.

For non-obvious reasons (perhaps increased stack usage?), this caused
a failure for me when running under valgrind, because valgrind invoked
rt_sigreturn which is not in our seccomp filter.  Since we already
allow rt_sigaction and others in the valgrind target, it seems
reasonable to add rt_sigreturn as well.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
 Makefile |   2 +-
 udp.c    | 211 +++++++++++++++++++++++++------------------------------
 udp.h    |   4 +-
 3 files changed, 100 insertions(+), 117 deletions(-)

diff --git a/Makefile b/Makefile
index 92cbd5a6..bd504d23 100644
--- a/Makefile
+++ b/Makefile
@@ -128,7 +128,7 @@ qrap: $(QRAP_SRCS) passt.h
 	$(CC) $(FLAGS) $(CFLAGS) $(CPPFLAGS) $(QRAP_SRCS) -o qrap $(LDFLAGS)
 
 valgrind: EXTRA_SYSCALLS += rt_sigprocmask rt_sigtimedwait rt_sigaction	\
-			    getpid gettid kill clock_gettime mmap	\
+			    rt_sigreturn getpid gettid kill clock_gettime mmap \
 			    munmap open unlink gettimeofday futex
 valgrind: FLAGS += -g -DVALGRIND
 valgrind: all
diff --git a/udp.c b/udp.c
index b459b109..2407ca86 100644
--- a/udp.c
+++ b/udp.c
@@ -116,6 +116,7 @@
 #include <sys/uio.h>
 #include <time.h>
 #include <fcntl.h>
+#include <arpa/inet.h>
 #include <linux/errqueue.h>
 
 #include "checksum.h"
@@ -389,6 +390,8 @@ static void udp_flow_close(const struct ctx *c, struct udp_flow *uflow)
 		uflow->s[TGTSIDE] = -1;
 	}
 	flow_hash_remove(c, FLOW_SIDX(uflow, INISIDE));
+	if (!pif_is_socket(uflow->f.pif[TGTSIDE]))
+		flow_hash_remove(c, FLOW_SIDX(uflow, TGTSIDE));
 }
 
 /**
@@ -483,6 +486,13 @@ static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow,
 	}
 
 	flow_hash_insert(c, FLOW_SIDX(uflow, INISIDE));
+
+	/* If the target side is a socket, it will be a reply socket that knows
+	 * its own flowside.  But if it's tap, then we need to look it up by
+	 * hash.
+	 */
+	if (!pif_is_socket(tgtpif))
+		flow_hash_insert(c, FLOW_SIDX(uflow, TGTSIDE));
 	FLOW_ACTIVATE(uflow);
 
 	return FLOW_SIDX(uflow, TGTSIDE);
@@ -907,10 +917,12 @@ void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
 {
 	const struct flowside *fromside = flowside_at_sidx(ref.flowside);
 	flow_sidx_t tosidx = flow_sidx_opposite(ref.flowside);
+	const struct flowside *toside = flowside_at_sidx(tosidx);
 	struct udp_flow *uflow = udp_at_sidx(ref.flowside);
 	int from_s = uflow->s[ref.flowside.sidei];
 	bool v6 = !inany_v4(&fromside->eaddr);
 	struct mmsghdr *mmh_recv = v6 ? udp6_mh_recv : udp4_mh_recv;
+	uint8_t topif = pif_at_sidx(tosidx);
 	int n, i;
 
 	ASSERT(!c->no_udp && uflow);
@@ -921,10 +933,64 @@ void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
 	flow_trace(uflow, "Received %d datagrams on reply socket", n);
 	uflow->ts = now->tv_sec;
 
-	for (i = 0; i < n; i++)
-		udp_splice_prepare(mmh_recv, i);
+	for (i = 0; i < n; i++) {
+		if (pif_is_socket(topif))
+			udp_splice_prepare(mmh_recv, i);
+		else
+			udp_tap_prepare(c, mmh_recv, i, toside->eport, v6, now);
+	}
 
-	udp_splice_send(c, 0, n, tosidx);
+	if (pif_is_socket(topif))
+		udp_splice_send(c, 0, n, tosidx);
+	else
+		tap_send_frames(c, &udp_l2_iov[0][0], UDP_NUM_IOVS, n);
+}
+
+/**
+ * udp_flow_from_tap() - Find or create UDP flow for tap packets
+ * @c:		Execution context
+ * @pif:	pif on which the packet is arriving
+ * @af:		Address family, AF_INET or AF_INET6
+ * @saddr:	Source address on guest side
+ * @daddr:	Destination address on guest side
+ * @srcport:	Source port on guest side
+ * @dstport:	Destination port on guest side
+ *
+ * Return: sidx for the destination side of the flow for this packet, or
+ *         FLOW_SIDX_NONE if we couldn't find or create a flow.
+ */
+static flow_sidx_t udp_flow_from_tap(const struct ctx *c,
+				     uint8_t pif, sa_family_t af,
+				     const void *saddr, const void *daddr,
+				     in_port_t srcport, in_port_t dstport,
+				     const struct timespec *now)
+{
+	struct udp_flow *uflow;
+	union flow *flow;
+	flow_sidx_t sidx;
+
+	ASSERT(pif == PIF_TAP);
+
+	sidx = flow_lookup_af(c, IPPROTO_UDP, pif, af, saddr, daddr,
+			      srcport, dstport);
+	if ((uflow = udp_at_sidx(sidx))) {
+		uflow->ts = now->tv_sec;
+		return flow_sidx_opposite(sidx);
+	}
+
+	if (!(flow = flow_alloc())) {
+		char sstr[INET6_ADDRSTRLEN], dstr[INET6_ADDRSTRLEN];
+
+		debug("Couldn't allocate flow for UDP datagram from %s %s:%hu -> %s:%hu",
+		      pif_name(pif),
+		      inet_ntop(af, saddr, sstr, sizeof(sstr)), srcport,
+		      inet_ntop(af, daddr, dstr, sizeof(dstr)), dstport);
+		return FLOW_SIDX_NONE;
+	}
+
+	flow_initiate_af(flow, PIF_TAP, af, saddr, srcport, daddr, dstport);
+
+	return udp_flow_new(c, flow, -1, now);
 }
 
 /**
@@ -942,23 +1008,22 @@ void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
  *
  * #syscalls sendmmsg
  */
-int udp_tap_handler(struct ctx *c, uint8_t pif,
+int udp_tap_handler(const struct ctx *c, uint8_t pif,
 		    sa_family_t af, const void *saddr, const void *daddr,
 		    const struct pool *p, int idx, const struct timespec *now)
 {
+	const struct flowside *toside;
 	struct mmsghdr mm[UIO_MAXIOV];
+	union sockaddr_inany to_sa;
 	struct iovec m[UIO_MAXIOV];
-	struct sockaddr_in6 s_in6;
-	struct sockaddr_in s_in;
 	const struct udphdr *uh;
-	struct sockaddr *sa;
+	struct udp_flow *uflow;
 	int i, s, count = 0;
+	flow_sidx_t tosidx;
 	in_port_t src, dst;
+	uint8_t topif;
 	socklen_t sl;
 
-	(void)saddr;
-	(void)pif;
-
 	ASSERT(!c->no_udp);
 
 	uh = packet_get(p, idx, 0, sizeof(*uh), NULL);
@@ -969,116 +1034,34 @@ int udp_tap_handler(struct ctx *c, uint8_t pif,
 	 * and destination, so we can just take those from the first message.
 	 */
 	src = ntohs(uh->source);
-	src += c->udp.fwd_in.rdelta[src];
 	dst = ntohs(uh->dest);
 
-	if (af == AF_INET) {
-		s_in = (struct sockaddr_in) {
-			.sin_family = AF_INET,
-			.sin_port = uh->dest,
-			.sin_addr = *(struct in_addr *)daddr,
-		};
-
-		sa = (struct sockaddr *)&s_in;
-		sl = sizeof(s_in);
-
-		if (IN4_ARE_ADDR_EQUAL(&s_in.sin_addr, &c->ip4.dns_match) &&
-		    ntohs(s_in.sin_port) == 53) {
-			s_in.sin_addr = c->ip4.dns_host;
-			udp_tap_map[V4][src].ts = now->tv_sec;
-			udp_tap_map[V4][src].flags |= PORT_DNS_FWD;
-			bitmap_set(udp_act[V4][UDP_ACT_TAP], src);
-		} else if (IN4_ARE_ADDR_EQUAL(&s_in.sin_addr, &c->ip4.gw) &&
-			   !c->no_map_gw) {
-			if (!(udp_tap_map[V4][dst].flags & PORT_LOCAL) ||
-			    (udp_tap_map[V4][dst].flags & PORT_LOOPBACK))
-				s_in.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
-			else
-				s_in.sin_addr = c->ip4.addr_seen;
-		}
-
-		debug("UDP from tap src=%hu dst=%hu, s=%d",
-		      src, dst, udp_tap_map[V4][src].sock);
-		if ((s = udp_tap_map[V4][src].sock) < 0) {
-			struct in_addr bind_addr = IN4ADDR_ANY_INIT;
-			union udp_epoll_ref uref = {
-				.port = src,
-				.pif = PIF_HOST,
-			};
-			const char *bind_if = NULL;
-
-			if (!IN4_IS_ADDR_LOOPBACK(&s_in.sin_addr))
-				bind_if = c->ip4.ifname_out;
-
-			if (!IN4_IS_ADDR_LOOPBACK(&s_in.sin_addr))
-				bind_addr = c->ip4.addr_out;
-
-			s = sock_l4(c, AF_INET, EPOLL_TYPE_UDP, &bind_addr,
-				    bind_if, src, uref.u32);
-			if (s < 0)
-				return p->count - idx;
-
-			udp_tap_map[V4][src].sock = s;
-			bitmap_set(udp_act[V4][UDP_ACT_TAP], src);
-		}
-
-		udp_tap_map[V4][src].ts = now->tv_sec;
-	} else {
-		s_in6 = (struct sockaddr_in6) {
-			.sin6_family = AF_INET6,
-			.sin6_port = uh->dest,
-			.sin6_addr = *(struct in6_addr *)daddr,
-		};
-		const struct in6_addr *bind_addr = &in6addr_any;
-
-		sa = (struct sockaddr *)&s_in6;
-		sl = sizeof(s_in6);
-
-		if (IN6_ARE_ADDR_EQUAL(daddr, &c->ip6.dns_match) &&
-		    ntohs(s_in6.sin6_port) == 53) {
-			s_in6.sin6_addr = c->ip6.dns_host;
-			udp_tap_map[V6][src].ts = now->tv_sec;
-			udp_tap_map[V6][src].flags |= PORT_DNS_FWD;
-			bitmap_set(udp_act[V6][UDP_ACT_TAP], src);
-		} else if (IN6_ARE_ADDR_EQUAL(daddr, &c->ip6.gw) &&
-			   !c->no_map_gw) {
-			if (!(udp_tap_map[V6][dst].flags & PORT_LOCAL) ||
-			    (udp_tap_map[V6][dst].flags & PORT_LOOPBACK))
-				s_in6.sin6_addr = in6addr_loopback;
-			else if (udp_tap_map[V6][dst].flags & PORT_GUA)
-				s_in6.sin6_addr = c->ip6.addr;
-			else
-				s_in6.sin6_addr = c->ip6.addr_seen;
-		} else if (IN6_IS_ADDR_LINKLOCAL(&s_in6.sin6_addr)) {
-			bind_addr = &c->ip6.addr_ll;
-		}
-
-		if ((s = udp_tap_map[V6][src].sock) < 0) {
-			union udp_epoll_ref uref = {
-				.v6 = 1,
-				.port = src,
-				.pif = PIF_HOST,
-			};
-			const char *bind_if = NULL;
+	tosidx = udp_flow_from_tap(c, pif, af, saddr, daddr, src, dst, now);
+	if (!(uflow = udp_at_sidx(tosidx))) {
+		char sstr[INET6_ADDRSTRLEN], dstr[INET6_ADDRSTRLEN];
 
-			if (!IN6_IS_ADDR_LOOPBACK(&s_in6.sin6_addr))
-				bind_if = c->ip6.ifname_out;
+		debug("Dropping datagram with no flow %s %s:%hu -> %s:%hu",
+		      pif_name(pif),
+		      inet_ntop(af, saddr, sstr, sizeof(sstr)), src,
+		      inet_ntop(af, daddr, dstr, sizeof(dstr)), dst);
+		return 1;
+	}
 
-			if (!IN6_IS_ADDR_LOOPBACK(&s_in6.sin6_addr) &&
-			    !IN6_IS_ADDR_LINKLOCAL(&s_in6.sin6_addr))
-				bind_addr = &c->ip6.addr_out;
+	topif = pif_at_sidx(tosidx);
+	if (topif != PIF_HOST) {
+		flow_sidx_t fromsidx = flow_sidx_opposite(tosidx);
+		uint8_t frompif = pif_at_sidx(fromsidx);
 
-			s = sock_l4(c, AF_INET6, EPOLL_TYPE_UDP, bind_addr,
-				    bind_if, src, uref.u32);
-			if (s < 0)
-				return p->count - idx;
+		flow_err(uflow, "No support for forwarding UDP from %s to %s",
+			 pif_name(frompif), pif_name(topif));
+		return 1;
+	}
+	toside = flowside_at_sidx(tosidx);
 
-			udp_tap_map[V6][src].sock = s;
-			bitmap_set(udp_act[V6][UDP_ACT_TAP], src);
-		}
+	s = udp_at_sidx(tosidx)->s[tosidx.sidei];
+	ASSERT(s >= 0);
 
-		udp_tap_map[V6][src].ts = now->tv_sec;
-	}
+	pif_sockaddr(c, &to_sa, &sl, topif, &toside->eaddr, toside->eport);
 
 	for (i = 0; i < (int)p->count - idx; i++) {
 		struct udphdr *uh_send;
@@ -1088,7 +1071,7 @@ int udp_tap_handler(struct ctx *c, uint8_t pif,
 		if (!uh_send)
 			return p->count - idx;
 
-		mm[i].msg_hdr.msg_name = sa;
+		mm[i].msg_hdr.msg_name = &to_sa;
 		mm[i].msg_hdr.msg_namelen = sl;
 
 		if (len) {
diff --git a/udp.h b/udp.h
index e133f1e7..ceaa8c54 100644
--- a/udp.h
+++ b/udp.h
@@ -13,8 +13,8 @@ void udp_buf_sock_handler(const struct ctx *c, union epoll_ref ref,
 			  uint32_t events, const struct timespec *now);
 void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
 			    uint32_t events, const struct timespec *now);
-int udp_tap_handler(struct ctx *c, uint8_t pif, sa_family_t af,
-		    const void *saddr, const void *daddr,
+int udp_tap_handler(const struct ctx *c, uint8_t pif,
+		    sa_family_t af, const void *saddr, const void *daddr,
 		    const struct pool *p, int idx, const struct timespec *now);
 int udp_sock_init(const struct ctx *c, int ns, sa_family_t af,
 		  const void *addr, const char *ifname, in_port_t port);
-- 
@@ -13,8 +13,8 @@ void udp_buf_sock_handler(const struct ctx *c, union epoll_ref ref,
 			  uint32_t events, const struct timespec *now);
 void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
 			    uint32_t events, const struct timespec *now);
-int udp_tap_handler(struct ctx *c, uint8_t pif, sa_family_t af,
-		    const void *saddr, const void *daddr,
+int udp_tap_handler(const struct ctx *c, uint8_t pif,
+		    sa_family_t af, const void *saddr, const void *daddr,
 		    const struct pool *p, int idx, const struct timespec *now);
 int udp_sock_init(const struct ctx *c, int ns, sa_family_t af,
 		  const void *addr, const char *ifname, in_port_t port);
-- 
2.45.2


  parent reply	other threads:[~2024-07-18  5:27 UTC|newest]

Thread overview: 30+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-07-18  5:26 [PATCH v8 00/27] Unified flow table David Gibson
2024-07-18  5:26 ` [PATCH v8 01/27] flow: Common address information for initiating side David Gibson
2024-07-18  5:26 ` [PATCH v8 02/27] flow: Common address information for target side David Gibson
2024-07-18  5:26 ` [PATCH v8 03/27] tcp, flow: Remove redundant information, repack connection structures David Gibson
2024-07-18  5:26 ` [PATCH v8 04/27] tcp: Obtain guest address from flowside David Gibson
2024-07-18  5:26 ` [PATCH v8 05/27] tcp: Manage outbound address via flow table David Gibson
2024-07-18  5:26 ` [PATCH v8 06/27] tcp: Simplify endpoint validation using flowside information David Gibson
2024-07-18  5:26 ` [PATCH v8 07/27] tcp_splice: Eliminate SPLICE_V6 flag David Gibson
2024-07-18  5:26 ` [PATCH v8 08/27] tcp, flow: Replace TCP specific hash function with general flow hash David Gibson
2024-07-18  5:26 ` [PATCH v8 09/27] flow, tcp: Generalise TCP hash table to general flow hash table David Gibson
2024-07-18  5:26 ` [PATCH v8 10/27] tcp: Re-use flow hash for initial sequence number generation David Gibson
2024-07-18  5:26 ` [PATCH v8 11/27] icmp: Remove redundant id field from flow table entry David Gibson
2024-07-18  5:26 ` [PATCH v8 12/27] icmp: Obtain destination addresses from the flowsides David Gibson
2024-07-18  5:26 ` [PATCH v8 13/27] icmp: Look up ping flows using flow hash David Gibson
2024-07-18  5:26 ` [PATCH v8 14/27] icmp: Eliminate icmp_id_map David Gibson
2024-07-18  5:26 ` [PATCH v8 15/27] flow: Helper to create sockets based on flowside David Gibson
2024-07-18  5:26 ` [PATCH v8 16/27] icmp: Manage outbound socket address via flow table David Gibson
2024-07-18  5:26 ` [PATCH v8 17/27] flow, tcp: Flow based NAT and port forwarding for TCP David Gibson
2024-07-18  5:26 ` [PATCH v8 18/27] flow, icmp: Use general flow forwarding rules for ICMP David Gibson
2024-07-18  5:26 ` [PATCH v8 19/27] fwd: Update flow forwarding logic for UDP David Gibson
2024-07-18  5:26 ` [PATCH v8 20/27] udp: Create flows for datagrams from originating sockets David Gibson
2024-07-18  5:26 ` [PATCH v8 21/27] udp: Handle "spliced" datagrams with per-flow sockets David Gibson
2024-07-18  5:26 ` [PATCH v8 22/27] udp: Remove obsolete splice tracking David Gibson
2024-07-18  5:26 ` David Gibson [this message]
2024-07-18  5:26 ` [PATCH v8 24/27] udp: Direct datagrams from host to guest via flow table David Gibson
2024-07-18  5:26 ` [PATCH v8 25/27] udp: Remove obsolete socket tracking David Gibson
2024-07-18  5:26 ` [PATCH v8 26/27] udp: Remove rdelta port forwarding maps David Gibson
2024-07-18  5:26 ` [PATCH v8 27/27] udp: Rename UDP listening sockets David Gibson
2024-07-19 19:20 ` [PATCH v8 00/27] Unified flow table Stefano Brivio
2024-07-20  3:37   ` David Gibson

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20240718052653.3241585-24-david@gibson.dropbear.id.au \
    --to=david@gibson.dropbear.id.au \
    --cc=jmaloy@redhat.com \
    --cc=passt-dev@passt.top \
    --cc=sbrivio@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this public inbox

	https://passt.top/passt

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for IMAP folder(s).