public inbox for passt-dev@passt.top
 help / color / mirror / code / Atom feed
From: David Gibson <david@gibson.dropbear.id.au>
To: Stefano Brivio <sbrivio@redhat.com>, passt-dev@passt.top
Cc: David Gibson <david@gibson.dropbear.id.au>
Subject: [PATCH 04/12] udp: Don't bother to batch datagrams from "listening" socket
Date: Fri,  4 Apr 2025 21:15:34 +1100	[thread overview]
Message-ID: <20250404101542.3729316-5-david@gibson.dropbear.id.au> (raw)
In-Reply-To: <20250404101542.3729316-1-david@gibson.dropbear.id.au>

A "listening" UDP socket can receive datagrams from multiple flows.  So,
we currently have some quite subtle and complex code in
udp_buf_listen_sock_data() to group contiguously received packets for the
same flow into batches for forwarding.

However, since we are now always using flow-specific connect()ed sockets
once a flow is established, handling of datagrams on listening sockets is
essentially a slow path.  Given that, it's not worth the complexity.
Substantially simplify the code by using an approach more like vhost-user:
"peek" at the address of the next datagram, one at a time, to determine
the correct flow before we actually receive the data.

This removes all meaningful use of the s_in and tosidx fields in
udp_meta_t, so they too can be removed, along with setting of msg_name and
msg_namelen in the msghdr arrays which referenced them.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
 udp.c | 81 ++++++++++++++++-------------------------------------------
 1 file changed, 22 insertions(+), 59 deletions(-)

diff --git a/udp.c b/udp.c
index 6b72c30f..4444d762 100644
--- a/udp.c
+++ b/udp.c
@@ -138,20 +138,15 @@ static struct ethhdr udp4_eth_hdr;
 static struct ethhdr udp6_eth_hdr;
 
 /**
- * struct udp_meta_t - Pre-cooked headers and metadata for UDP packets
+ * struct udp_meta_t - Pre-cooked headers for UDP packets
  * @ip6h:	Pre-filled IPv6 header (except for payload_len and addresses)
  * @ip4h:	Pre-filled IPv4 header (except for tot_len and saddr)
  * @taph:	Tap backend specific header
- * @s_in:	Source socket address, filled in by recvmmsg()
- * @tosidx:	sidx for the destination side of this datagram's flow
  */
 static struct udp_meta_t {
 	struct ipv6hdr ip6h;
 	struct iphdr ip4h;
 	struct tap_hdr taph;
-
-	union sockaddr_inany s_in;
-	flow_sidx_t tosidx;
 }
 #ifdef __AVX2__
 __attribute__ ((aligned(32)))
@@ -234,8 +229,6 @@ static void udp_iov_init_one(const struct ctx *c, size_t i)
 	tiov[UDP_IOV_TAP] = tap_hdr_iov(c, &meta->taph);
 	tiov[UDP_IOV_PAYLOAD].iov_base = payload;
 
-	mh->msg_name	= &meta->s_in;
-	mh->msg_namelen	= sizeof(meta->s_in);
 	mh->msg_iov	= siov;
 	mh->msg_iovlen	= 1;
 }
@@ -687,60 +680,32 @@ static int udp_sock_recv(const struct ctx *c, int s, struct mmsghdr *mmh, int n)
 static void udp_buf_listen_sock_data(const struct ctx *c, union epoll_ref ref,
 				     const struct timespec *now)
 {
-	const socklen_t sasize = sizeof(udp_meta[0].s_in);
-	/* See udp_buf_sock_data() comment */
-	int n = (c->mode == MODE_PASTA ? 1 : UDP_MAX_FRAMES), i;
-
-	if ((n = udp_sock_recv(c, ref.fd, udp_mh_recv, n)) <= 0)
-		return;
-
-	/* We divide datagrams into batches based on how we need to send them,
-	 * determined by udp_meta[i].tosidx.  To avoid either two passes through
-	 * the array, or recalculating tosidx for a single entry, we have to
-	 * populate it one entry *ahead* of the loop counter.
-	 */
-	udp_meta[0].tosidx = udp_flow_from_sock(c, ref, &udp_meta[0].s_in, now);
-	udp_mh_recv[0].msg_hdr.msg_namelen = sasize;
-	for (i = 0; i < n; ) {
-		flow_sidx_t batchsidx = udp_meta[i].tosidx;
-		uint8_t batchpif = pif_at_sidx(batchsidx);
-		int batchstart = i;
-
-		do {
-			if (pif_is_socket(batchpif)) {
-				udp_splice_prepare(udp_mh_recv, i);
-			} else if (batchpif == PIF_TAP) {
-				udp_tap_prepare(udp_mh_recv, i,
-						flowside_at_sidx(batchsidx),
-						false);
-			}
-
-			if (++i >= n)
-				break;
-
-			udp_meta[i].tosidx = udp_flow_from_sock(c, ref,
-								&udp_meta[i].s_in,
-								now);
-			udp_mh_recv[i].msg_hdr.msg_namelen = sasize;
-		} while (flow_sidx_eq(udp_meta[i].tosidx, batchsidx));
-
-		if (pif_is_socket(batchpif)) {
-			udp_splice_send(c, batchstart, i - batchstart,
-					batchsidx);
-		} else if (batchpif == PIF_TAP) {
-			tap_send_frames(c, &udp_l2_iov[batchstart][0],
-					UDP_NUM_IOVS, i - batchstart);
-		} else if (flow_sidx_valid(batchsidx)) {
-			flow_sidx_t fromsidx = flow_sidx_opposite(batchsidx);
-			struct udp_flow *uflow = udp_at_sidx(batchsidx);
+	union sockaddr_inany src;
+
+	while (udp_peek_addr(ref.fd, &src) == 0) {
+		flow_sidx_t tosidx = udp_flow_from_sock(c, ref, &src, now);
+		uint8_t topif = pif_at_sidx(tosidx);
+
+		if (udp_sock_recv(c, ref.fd, udp_mh_recv, 1) <= 0)
+			break;
+
+		if (pif_is_socket(topif)) {
+			udp_splice_prepare(udp_mh_recv, 0);
+			udp_splice_send(c, 0, 1, tosidx);
+		} else if (topif == PIF_TAP) {
+			udp_tap_prepare(udp_mh_recv, 0, flowside_at_sidx(tosidx),
+					false);
+			tap_send_frames(c, &udp_l2_iov[0][0], UDP_NUM_IOVS, 1);
+		} else if (flow_sidx_valid(tosidx)) {
+			flow_sidx_t fromsidx = flow_sidx_opposite(tosidx);
+			struct udp_flow *uflow = udp_at_sidx(tosidx);
 
 			flow_err(uflow,
 				 "No support for forwarding UDP from %s to %s",
 				 pif_name(pif_at_sidx(fromsidx)),
-				 pif_name(batchpif));
+				 pif_name(topif));
 		} else {
-			debug("Discarding %d datagrams without flow",
-			      i - batchstart);
+			debug("Discarding datagram without flow");
 		}
 	}
 }
@@ -802,8 +767,6 @@ static bool udp_buf_reply_sock_data(const struct ctx *c,
 			udp_splice_prepare(udp_mh_recv, i);
 		else if (topif == PIF_TAP)
 			udp_tap_prepare(udp_mh_recv, i, toside, false);
-		/* Restore sockaddr length clobbered by recvmsg() */
-		udp_mh_recv[i].msg_hdr.msg_namelen = sizeof(udp_meta[i].s_in);
 	}
 
 	if (pif_is_socket(topif)) {
-- 
@@ -138,20 +138,15 @@ static struct ethhdr udp4_eth_hdr;
 static struct ethhdr udp6_eth_hdr;
 
 /**
- * struct udp_meta_t - Pre-cooked headers and metadata for UDP packets
+ * struct udp_meta_t - Pre-cooked headers for UDP packets
  * @ip6h:	Pre-filled IPv6 header (except for payload_len and addresses)
  * @ip4h:	Pre-filled IPv4 header (except for tot_len and saddr)
  * @taph:	Tap backend specific header
- * @s_in:	Source socket address, filled in by recvmmsg()
- * @tosidx:	sidx for the destination side of this datagram's flow
  */
 static struct udp_meta_t {
 	struct ipv6hdr ip6h;
 	struct iphdr ip4h;
 	struct tap_hdr taph;
-
-	union sockaddr_inany s_in;
-	flow_sidx_t tosidx;
 }
 #ifdef __AVX2__
 __attribute__ ((aligned(32)))
@@ -234,8 +229,6 @@ static void udp_iov_init_one(const struct ctx *c, size_t i)
 	tiov[UDP_IOV_TAP] = tap_hdr_iov(c, &meta->taph);
 	tiov[UDP_IOV_PAYLOAD].iov_base = payload;
 
-	mh->msg_name	= &meta->s_in;
-	mh->msg_namelen	= sizeof(meta->s_in);
 	mh->msg_iov	= siov;
 	mh->msg_iovlen	= 1;
 }
@@ -687,60 +680,32 @@ static int udp_sock_recv(const struct ctx *c, int s, struct mmsghdr *mmh, int n)
 static void udp_buf_listen_sock_data(const struct ctx *c, union epoll_ref ref,
 				     const struct timespec *now)
 {
-	const socklen_t sasize = sizeof(udp_meta[0].s_in);
-	/* See udp_buf_sock_data() comment */
-	int n = (c->mode == MODE_PASTA ? 1 : UDP_MAX_FRAMES), i;
-
-	if ((n = udp_sock_recv(c, ref.fd, udp_mh_recv, n)) <= 0)
-		return;
-
-	/* We divide datagrams into batches based on how we need to send them,
-	 * determined by udp_meta[i].tosidx.  To avoid either two passes through
-	 * the array, or recalculating tosidx for a single entry, we have to
-	 * populate it one entry *ahead* of the loop counter.
-	 */
-	udp_meta[0].tosidx = udp_flow_from_sock(c, ref, &udp_meta[0].s_in, now);
-	udp_mh_recv[0].msg_hdr.msg_namelen = sasize;
-	for (i = 0; i < n; ) {
-		flow_sidx_t batchsidx = udp_meta[i].tosidx;
-		uint8_t batchpif = pif_at_sidx(batchsidx);
-		int batchstart = i;
-
-		do {
-			if (pif_is_socket(batchpif)) {
-				udp_splice_prepare(udp_mh_recv, i);
-			} else if (batchpif == PIF_TAP) {
-				udp_tap_prepare(udp_mh_recv, i,
-						flowside_at_sidx(batchsidx),
-						false);
-			}
-
-			if (++i >= n)
-				break;
-
-			udp_meta[i].tosidx = udp_flow_from_sock(c, ref,
-								&udp_meta[i].s_in,
-								now);
-			udp_mh_recv[i].msg_hdr.msg_namelen = sasize;
-		} while (flow_sidx_eq(udp_meta[i].tosidx, batchsidx));
-
-		if (pif_is_socket(batchpif)) {
-			udp_splice_send(c, batchstart, i - batchstart,
-					batchsidx);
-		} else if (batchpif == PIF_TAP) {
-			tap_send_frames(c, &udp_l2_iov[batchstart][0],
-					UDP_NUM_IOVS, i - batchstart);
-		} else if (flow_sidx_valid(batchsidx)) {
-			flow_sidx_t fromsidx = flow_sidx_opposite(batchsidx);
-			struct udp_flow *uflow = udp_at_sidx(batchsidx);
+	union sockaddr_inany src;
+
+	while (udp_peek_addr(ref.fd, &src) == 0) {
+		flow_sidx_t tosidx = udp_flow_from_sock(c, ref, &src, now);
+		uint8_t topif = pif_at_sidx(tosidx);
+
+		if (udp_sock_recv(c, ref.fd, udp_mh_recv, 1) <= 0)
+			break;
+
+		if (pif_is_socket(topif)) {
+			udp_splice_prepare(udp_mh_recv, 0);
+			udp_splice_send(c, 0, 1, tosidx);
+		} else if (topif == PIF_TAP) {
+			udp_tap_prepare(udp_mh_recv, 0, flowside_at_sidx(tosidx),
+					false);
+			tap_send_frames(c, &udp_l2_iov[0][0], UDP_NUM_IOVS, 1);
+		} else if (flow_sidx_valid(tosidx)) {
+			flow_sidx_t fromsidx = flow_sidx_opposite(tosidx);
+			struct udp_flow *uflow = udp_at_sidx(tosidx);
 
 			flow_err(uflow,
 				 "No support for forwarding UDP from %s to %s",
 				 pif_name(pif_at_sidx(fromsidx)),
-				 pif_name(batchpif));
+				 pif_name(topif));
 		} else {
-			debug("Discarding %d datagrams without flow",
-			      i - batchstart);
+			debug("Discarding datagram without flow");
 		}
 	}
 }
@@ -802,8 +767,6 @@ static bool udp_buf_reply_sock_data(const struct ctx *c,
 			udp_splice_prepare(udp_mh_recv, i);
 		else if (topif == PIF_TAP)
 			udp_tap_prepare(udp_mh_recv, i, toside, false);
-		/* Restore sockaddr length clobbered by recvmsg() */
-		udp_mh_recv[i].msg_hdr.msg_namelen = sizeof(udp_meta[i].s_in);
 	}
 
 	if (pif_is_socket(topif)) {
-- 
2.49.0


  parent reply	other threads:[~2025-04-04 10:15 UTC|newest]

Thread overview: 13+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-04-04 10:15 [PATCH 00/12] Use connect()ed sockets for both sides of UDP flows David Gibson
2025-04-04 10:15 ` [PATCH 01/12] udp: Use connect()ed sockets for initiating side David Gibson
2025-04-04 10:15 ` [PATCH 02/12] udp: Make udp_sock_recv() take max number of frames as a parameter David Gibson
2025-04-04 10:15 ` [PATCH 03/12] udp: Polish udp_vu_sock_info() and remove from vu specific code David Gibson
2025-04-04 10:15 ` David Gibson [this message]
2025-04-04 10:15 ` [PATCH 05/12] udp: Parameterize number of datagrams handled by udp_*_reply_sock_data() David Gibson
2025-04-04 10:15 ` [PATCH 06/12] udp: Split spliced forwarding path from udp_buf_reply_sock_data() David Gibson
2025-04-04 10:15 ` [PATCH 07/12] udp: Merge vhost-user and "buf" listening socket paths David Gibson
2025-04-04 10:15 ` [PATCH 08/12] udp: Move UDP_MAX_FRAMES to udp.c David Gibson
2025-04-04 10:15 ` [PATCH 09/12] udp_flow: Take pif and port as explicit parameters to udp_flow_from_sock() David Gibson
2025-04-04 10:15 ` [PATCH 10/12] udp: Rework udp_listen_sock_data() into udp_sock_fwd() David Gibson
2025-04-04 10:15 ` [PATCH 11/12] udp: Fold udp_splice_prepare and udp_splice_send into udp_sock_to_sock David Gibson
2025-04-04 10:15 ` [PATCH 12/12] udp_flow: Don't discard packets that arrive between bind() and connect() David Gibson

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20250404101542.3729316-5-david@gibson.dropbear.id.au \
    --to=david@gibson.dropbear.id.au \
    --cc=passt-dev@passt.top \
    --cc=sbrivio@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this public inbox

	https://passt.top/passt

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for IMAP folder(s).