public inbox for passt-dev@passt.top
 help / color / mirror / code / Atom feed
* [PATCH 0/8] Don't use additional sockets for receiving "spliced" UDP communications
@ 2022-12-05  8:14 David Gibson
  2022-12-05  8:14 ` [PATCH 1/8] udp: Move sending pasta tap frames to the end of udp_sock_handler() David Gibson
                   ` (8 more replies)
  0 siblings, 9 replies; 28+ messages in thread
From: David Gibson @ 2022-12-05  8:14 UTC (permalink / raw)
  To: Stefano Brivio, passt-dev; +Cc: David Gibson

At present, the UDP "splice" and "tap" paths are quite separate.  We
have separate sockets to receive packets bound for the tap and splice
paths.  This leads to some code duplication, and extra open sockets.

This series partially unifies the two paths, allowing us to use a
single (host side) socket, bound to 0.0.0.0 or :: to receive packets
for both cases.

This is based on my earlier series with some fixes for the tap path.

David Gibson (8):
  udp: Move sending pasta tap frames to the end of udp_sock_handler()
  udp: Split sending to passt tap interface into separate function
  udp: Split receive from preparation and send in udp_sock_handler()
  udp: Receive multiple datagrams at once on the pasta sock->tap path
  udp: Pre-populate msg_names with local address
  udp: Unify udp_sock_handler_splice() with udp_sock_handler()
  udp: Decide whether to "splice" per datagram rather than per socket
  udp: Don't use separate sockets to listen for spliced packets

 udp.c  | 382 ++++++++++++++++++++++++++++++---------------------------
 udp.h  |   2 +-
 util.h |   7 ++
 3 files changed, 207 insertions(+), 184 deletions(-)

-- 
2.38.1


^ permalink raw reply	[flat|nested] 28+ messages in thread

* [PATCH 1/8] udp: Move sending pasta tap frames to the end of udp_sock_handler()
  2022-12-05  8:14 [PATCH 0/8] Don't use additional sockets for receiving "spliced" UDP communications David Gibson
@ 2022-12-05  8:14 ` David Gibson
  2022-12-05  8:14 ` [PATCH 2/8] udp: Split sending to passt tap interface into separate function David Gibson
                   ` (7 subsequent siblings)
  8 siblings, 0 replies; 28+ messages in thread
From: David Gibson @ 2022-12-05  8:14 UTC (permalink / raw)
  To: Stefano Brivio, passt-dev; +Cc: David Gibson

udp_sock_handler() has a surprising difference in flow between pasta and
passt mode: For pasta we send each frame to the tap interface as we prepare
it.  For passt, though, we prepare all the frames, then send them with a
single sendmmsg().

Alter the pasta path to also prepare all the frames, then send them at the
end.  We already have a suitable data structure for the passt case.  This
will make it easier to abstract out the tap backend difference in future.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
 udp.c | 61 ++++++++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 42 insertions(+), 19 deletions(-)

diff --git a/udp.c b/udp.c
index 4610a02..a27fe88 100644
--- a/udp.c
+++ b/udp.c
@@ -783,6 +783,35 @@ static size_t udp_update_hdr6(const struct ctx *c, int n, in_port_t dstport,
 	return buf_len;
 }
 
+/**
+ * udp_tap_send_pasta() - Send datagrams to the pasta tap interface
+ * @c:		Execution context
+ * @mmh:	Array of message headers to send
+ * @n:		Number of message headers to send
+ *
+ * #syscalls:pasta write
+ */
+static void udp_tap_send_pasta(const struct ctx *c, struct mmsghdr *mmh,
+			       unsigned int n)
+{
+	unsigned int i, j;
+
+	for (i = 0; i < n; i++) {
+		for (j = 0; j < mmh[i].msg_hdr.msg_iovlen; j++) {
+			struct iovec *iov = &mmh[i].msg_hdr.msg_iov[j];
+
+			/* We can't use writev() because the tap
+			 * character device relies on the write()
+			 * boundaries to discern frame boundaries
+			 */
+			if (write(c->fd_tap, iov->iov_base, iov->iov_len) < 0)
+				debug("tap write: %s", strerror(errno));
+			else
+				pcap(iov->iov_base, iov->iov_len);
+		}
+	}
+}
+
 /**
  * udp_sock_handler() - Handle new data from socket
  * @c:		Execution context
@@ -835,31 +864,25 @@ void udp_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events,
 		else
 			buf_len = udp_update_hdr4(c, i, dstport, now);
 
-		if (c->mode == MODE_PASTA) {
-			void *frame = tap_iov[i].iov_base;
+		tap_iov[i].iov_len = buf_len;
 
-			if (write(c->fd_tap, frame, buf_len) < 0)
-				debug("tap write: %s", strerror(errno));
-			pcap(frame, buf_len);
-		} else {
-			tap_iov[i].iov_len = buf_len;
-
-			/* With bigger messages, qemu closes the connection. */
-			if (msg_bufs && msg_len + buf_len > SHRT_MAX) {
-				tap_mmh[msg_i].msg_hdr.msg_iovlen = msg_bufs;
-				msg_i++;
-				tap_mmh[msg_i].msg_hdr.msg_iov = &tap_iov[i];
-				msg_len = msg_bufs = 0;
-			}
-
-			msg_len += buf_len;
-			msg_bufs++;
+		/* With bigger messages, qemu closes the connection. */
+		if (c->mode == MODE_PASST && msg_bufs &&
+		    msg_len + buf_len > SHRT_MAX) {
+			tap_mmh[msg_i].msg_hdr.msg_iovlen = msg_bufs;
+			msg_i++;
+			tap_mmh[msg_i].msg_hdr.msg_iov = &tap_iov[i];
+			msg_len = msg_bufs = 0;
 		}
+		msg_len += buf_len;
+		msg_bufs++;
 	}
 	tap_mmh[msg_i].msg_hdr.msg_iovlen = msg_bufs;
 
-	if (c->mode == MODE_PASTA)
+	if (c->mode == MODE_PASTA) {
+		udp_tap_send_pasta(c, tap_mmh, msg_i + 1);
 		return;
+	}
 
 	ret = sendmmsg(c->fd_tap, tap_mmh, msg_i + 1,
 		       MSG_NOSIGNAL | MSG_DONTWAIT);
-- 
@@ -783,6 +783,35 @@ static size_t udp_update_hdr6(const struct ctx *c, int n, in_port_t dstport,
 	return buf_len;
 }
 
+/**
+ * udp_tap_send_pasta() - Send datagrams to the pasta tap interface
+ * @c:		Execution context
+ * @mmh:	Array of message headers to send
+ * @n:		Number of message headers to send
+ *
+ * #syscalls:pasta write
+ */
+static void udp_tap_send_pasta(const struct ctx *c, struct mmsghdr *mmh,
+			       unsigned int n)
+{
+	unsigned int i, j;
+
+	for (i = 0; i < n; i++) {
+		for (j = 0; j < mmh[i].msg_hdr.msg_iovlen; j++) {
+			struct iovec *iov = &mmh[i].msg_hdr.msg_iov[j];
+
+			/* We can't use writev() because the tap
+			 * character device relies on the write()
+			 * boundaries to discern frame boundaries
+			 */
+			if (write(c->fd_tap, iov->iov_base, iov->iov_len) < 0)
+				debug("tap write: %s", strerror(errno));
+			else
+				pcap(iov->iov_base, iov->iov_len);
+		}
+	}
+}
+
 /**
  * udp_sock_handler() - Handle new data from socket
  * @c:		Execution context
@@ -835,31 +864,25 @@ void udp_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events,
 		else
 			buf_len = udp_update_hdr4(c, i, dstport, now);
 
-		if (c->mode == MODE_PASTA) {
-			void *frame = tap_iov[i].iov_base;
+		tap_iov[i].iov_len = buf_len;
 
-			if (write(c->fd_tap, frame, buf_len) < 0)
-				debug("tap write: %s", strerror(errno));
-			pcap(frame, buf_len);
-		} else {
-			tap_iov[i].iov_len = buf_len;
-
-			/* With bigger messages, qemu closes the connection. */
-			if (msg_bufs && msg_len + buf_len > SHRT_MAX) {
-				tap_mmh[msg_i].msg_hdr.msg_iovlen = msg_bufs;
-				msg_i++;
-				tap_mmh[msg_i].msg_hdr.msg_iov = &tap_iov[i];
-				msg_len = msg_bufs = 0;
-			}
-
-			msg_len += buf_len;
-			msg_bufs++;
+		/* With bigger messages, qemu closes the connection. */
+		if (c->mode == MODE_PASST && msg_bufs &&
+		    msg_len + buf_len > SHRT_MAX) {
+			tap_mmh[msg_i].msg_hdr.msg_iovlen = msg_bufs;
+			msg_i++;
+			tap_mmh[msg_i].msg_hdr.msg_iov = &tap_iov[i];
+			msg_len = msg_bufs = 0;
 		}
+		msg_len += buf_len;
+		msg_bufs++;
 	}
 	tap_mmh[msg_i].msg_hdr.msg_iovlen = msg_bufs;
 
-	if (c->mode == MODE_PASTA)
+	if (c->mode == MODE_PASTA) {
+		udp_tap_send_pasta(c, tap_mmh, msg_i + 1);
 		return;
+	}
 
 	ret = sendmmsg(c->fd_tap, tap_mmh, msg_i + 1,
 		       MSG_NOSIGNAL | MSG_DONTWAIT);
-- 
2.38.1


^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [PATCH 2/8] udp: Split sending to passt tap interface into separate function
  2022-12-05  8:14 [PATCH 0/8] Don't use additional sockets for receiving "spliced" UDP communications David Gibson
  2022-12-05  8:14 ` [PATCH 1/8] udp: Move sending pasta tap frames to the end of udp_sock_handler() David Gibson
@ 2022-12-05  8:14 ` David Gibson
  2022-12-05  8:14 ` [PATCH 3/8] udp: Split receive from preparation and send in udp_sock_handler() David Gibson
                   ` (6 subsequent siblings)
  8 siblings, 0 replies; 28+ messages in thread
From: David Gibson @ 2022-12-05  8:14 UTC (permalink / raw)
  To: Stefano Brivio, passt-dev; +Cc: David Gibson

The last part of udp_sock_handler() does the actual on-sending of frames
to the tap interface.  For pasta that's just a call to udp_tap_send_pasta()
but for passt, it's a moderately complex and open coded.

For symmetry, move the passt send path into its own function,
udp_tap_send_passt().  This will make it easier to abstract the tap
interface in future (e.g. when we want to add vhost-user).

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
 udp.c | 130 ++++++++++++++++++++++++++++++++--------------------------
 1 file changed, 72 insertions(+), 58 deletions(-)

diff --git a/udp.c b/udp.c
index a27fe88..7281bc3 100644
--- a/udp.c
+++ b/udp.c
@@ -812,6 +812,73 @@ static void udp_tap_send_pasta(const struct ctx *c, struct mmsghdr *mmh,
 	}
 }
 
+/**
+ * udp_tap_send_passt() - Send datagrams to the passt tap interface
+ * @c:		Execution context
+ * @mmh:	Array of message headers to send
+ * @n:		Number of message headers to send
+ *
+ * #syscalls:passt sendmmsg sendmsg
+ */
+static void udp_tap_send_passt(const struct ctx *c, struct mmsghdr *mmh, int n)
+{
+	struct msghdr *last_mh;
+	ssize_t missing = 0;
+	size_t msg_len = 0;
+	unsigned int i;
+	int ret;
+
+	ret = sendmmsg(c->fd_tap, mmh, n, MSG_NOSIGNAL | MSG_DONTWAIT);
+	if (ret <= 0)
+		return;
+
+	/* If we lose some messages to sendmmsg() here, fine, it's UDP. However,
+	 * the last message needs to be delivered completely, otherwise qemu
+	 * will fail to reassemble the next message and close the connection. Go
+	 * through headers from the last sent message, counting bytes, and, if
+	 * and as soon as we see more bytes than sendmmsg() sent, re-send the
+	 * rest with a blocking call.
+	 *
+	 * In pictures, given this example:
+	 *
+	 *				 	iov #0  iov #1  iov #2  iov #3
+	 * tap_mmh[ret - 1].msg_hdr:		....    ......  .....   ......
+	 * tap_mmh[ret - 1].msg_len:	7	....    ...
+	 *
+	 * when 'msglen' reaches:	10		      ^
+	 * and 'missing' below is:	3	           ---
+	 *
+	 * re-send everything from here:		   ^--  -----   ------
+	 */
+	last_mh = &mmh[ret - 1].msg_hdr;
+	for (i = 0; i < last_mh->msg_iovlen; i++) {
+		if (missing <= 0) {
+			msg_len += last_mh->msg_iov[i].iov_len;
+			missing = msg_len - mmh[ret - 1].msg_len;
+		}
+
+		if (missing > 0) {
+			uint8_t **iov_base;
+			int first_offset;
+
+			iov_base = (uint8_t **)&last_mh->msg_iov[i].iov_base;
+			first_offset = last_mh->msg_iov[i].iov_len - missing;
+			*iov_base += first_offset;
+			last_mh->msg_iov[i].iov_len = missing;
+
+			last_mh->msg_iov = &last_mh->msg_iov[i];
+
+			if (sendmsg(c->fd_tap, last_mh, MSG_NOSIGNAL) < 0)
+				debug("UDP: %li bytes to tap missing", missing);
+
+			*iov_base -= first_offset;
+			break;
+		}
+	}
+
+	pcapmm(mmh, ret);
+}
+
 /**
  * udp_sock_handler() - Handle new data from socket
  * @c:		Execution context
@@ -820,16 +887,14 @@ static void udp_tap_send_pasta(const struct ctx *c, struct mmsghdr *mmh,
  * @now:	Current timestamp
  *
  * #syscalls recvmmsg
- * #syscalls:passt sendmmsg sendmsg
  */
 void udp_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events,
 		      const struct timespec *now)
 {
 	in_port_t dstport = ref.r.p.udp.udp.port;
-	ssize_t n, msg_len = 0, missing = 0;
 	struct mmsghdr *tap_mmh, *sock_mmh;
-	int msg_bufs = 0, msg_i = 0, ret;
-	struct msghdr *last_mh;
+	int msg_bufs = 0, msg_i = 0;
+	ssize_t n, msg_len = 0;
 	struct iovec *tap_iov;
 	unsigned int i;
 
@@ -879,61 +944,10 @@ void udp_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events,
 	}
 	tap_mmh[msg_i].msg_hdr.msg_iovlen = msg_bufs;
 
-	if (c->mode == MODE_PASTA) {
+	if (c->mode == MODE_PASTA)
 		udp_tap_send_pasta(c, tap_mmh, msg_i + 1);
-		return;
-	}
-
-	ret = sendmmsg(c->fd_tap, tap_mmh, msg_i + 1,
-		       MSG_NOSIGNAL | MSG_DONTWAIT);
-	if (ret <= 0)
-		return;
-
-	/* If we lose some messages to sendmmsg() here, fine, it's UDP. However,
-	 * the last message needs to be delivered completely, otherwise qemu
-	 * will fail to reassemble the next message and close the connection. Go
-	 * through headers from the last sent message, counting bytes, and, if
-	 * and as soon as we see more bytes than sendmmsg() sent, re-send the
-	 * rest with a blocking call.
-	 *
-	 * In pictures, given this example:
-	 *
-	 *				 	iov #0  iov #1  iov #2  iov #3
-	 * tap_mmh[ret - 1].msg_hdr:		....    ......  .....   ......
-	 * tap_mmh[ret - 1].msg_len:	7	....    ...
-	 *
-	 * when 'msglen' reaches:	10		      ^
-	 * and 'missing' below is:	3	           ---
-	 *
-	 * re-send everything from here:		   ^--  -----   ------
-	 */
-	last_mh = &tap_mmh[ret - 1].msg_hdr;
-	for (i = 0, msg_len = 0; i < last_mh->msg_iovlen; i++) {
-		if (missing <= 0) {
-			msg_len += last_mh->msg_iov[i].iov_len;
-			missing = msg_len - tap_mmh[ret - 1].msg_len;
-		}
-
-		if (missing > 0) {
-			uint8_t **iov_base;
-			int first_offset;
-
-			iov_base = (uint8_t **)&last_mh->msg_iov[i].iov_base;
-			first_offset = last_mh->msg_iov[i].iov_len - missing;
-			*iov_base += first_offset;
-			last_mh->msg_iov[i].iov_len = missing;
-
-			last_mh->msg_iov = &last_mh->msg_iov[i];
-
-			if (sendmsg(c->fd_tap, last_mh, MSG_NOSIGNAL) < 0)
-				debug("UDP: %li bytes to tap missing", missing);
-
-			*iov_base -= first_offset;
-			break;
-		}
-	}
-
-	pcapmm(tap_mmh, ret);
+	else
+		udp_tap_send_passt(c, tap_mmh, msg_i + 1);
 }
 
 /**
-- 
@@ -812,6 +812,73 @@ static void udp_tap_send_pasta(const struct ctx *c, struct mmsghdr *mmh,
 	}
 }
 
+/**
+ * udp_tap_send_passt() - Send datagrams to the passt tap interface
+ * @c:		Execution context
+ * @mmh:	Array of message headers to send
+ * @n:		Number of message headers to send
+ *
+ * #syscalls:passt sendmmsg sendmsg
+ */
+static void udp_tap_send_passt(const struct ctx *c, struct mmsghdr *mmh, int n)
+{
+	struct msghdr *last_mh;
+	ssize_t missing = 0;
+	size_t msg_len = 0;
+	unsigned int i;
+	int ret;
+
+	ret = sendmmsg(c->fd_tap, mmh, n, MSG_NOSIGNAL | MSG_DONTWAIT);
+	if (ret <= 0)
+		return;
+
+	/* If we lose some messages to sendmmsg() here, fine, it's UDP. However,
+	 * the last message needs to be delivered completely, otherwise qemu
+	 * will fail to reassemble the next message and close the connection. Go
+	 * through headers from the last sent message, counting bytes, and, if
+	 * and as soon as we see more bytes than sendmmsg() sent, re-send the
+	 * rest with a blocking call.
+	 *
+	 * In pictures, given this example:
+	 *
+	 *				 	iov #0  iov #1  iov #2  iov #3
+	 * tap_mmh[ret - 1].msg_hdr:		....    ......  .....   ......
+	 * tap_mmh[ret - 1].msg_len:	7	....    ...
+	 *
+	 * when 'msglen' reaches:	10		      ^
+	 * and 'missing' below is:	3	           ---
+	 *
+	 * re-send everything from here:		   ^--  -----   ------
+	 */
+	last_mh = &mmh[ret - 1].msg_hdr;
+	for (i = 0; i < last_mh->msg_iovlen; i++) {
+		if (missing <= 0) {
+			msg_len += last_mh->msg_iov[i].iov_len;
+			missing = msg_len - mmh[ret - 1].msg_len;
+		}
+
+		if (missing > 0) {
+			uint8_t **iov_base;
+			int first_offset;
+
+			iov_base = (uint8_t **)&last_mh->msg_iov[i].iov_base;
+			first_offset = last_mh->msg_iov[i].iov_len - missing;
+			*iov_base += first_offset;
+			last_mh->msg_iov[i].iov_len = missing;
+
+			last_mh->msg_iov = &last_mh->msg_iov[i];
+
+			if (sendmsg(c->fd_tap, last_mh, MSG_NOSIGNAL) < 0)
+				debug("UDP: %li bytes to tap missing", missing);
+
+			*iov_base -= first_offset;
+			break;
+		}
+	}
+
+	pcapmm(mmh, ret);
+}
+
 /**
  * udp_sock_handler() - Handle new data from socket
  * @c:		Execution context
@@ -820,16 +887,14 @@ static void udp_tap_send_pasta(const struct ctx *c, struct mmsghdr *mmh,
  * @now:	Current timestamp
  *
  * #syscalls recvmmsg
- * #syscalls:passt sendmmsg sendmsg
  */
 void udp_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events,
 		      const struct timespec *now)
 {
 	in_port_t dstport = ref.r.p.udp.udp.port;
-	ssize_t n, msg_len = 0, missing = 0;
 	struct mmsghdr *tap_mmh, *sock_mmh;
-	int msg_bufs = 0, msg_i = 0, ret;
-	struct msghdr *last_mh;
+	int msg_bufs = 0, msg_i = 0;
+	ssize_t n, msg_len = 0;
 	struct iovec *tap_iov;
 	unsigned int i;
 
@@ -879,61 +944,10 @@ void udp_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events,
 	}
 	tap_mmh[msg_i].msg_hdr.msg_iovlen = msg_bufs;
 
-	if (c->mode == MODE_PASTA) {
+	if (c->mode == MODE_PASTA)
 		udp_tap_send_pasta(c, tap_mmh, msg_i + 1);
-		return;
-	}
-
-	ret = sendmmsg(c->fd_tap, tap_mmh, msg_i + 1,
-		       MSG_NOSIGNAL | MSG_DONTWAIT);
-	if (ret <= 0)
-		return;
-
-	/* If we lose some messages to sendmmsg() here, fine, it's UDP. However,
-	 * the last message needs to be delivered completely, otherwise qemu
-	 * will fail to reassemble the next message and close the connection. Go
-	 * through headers from the last sent message, counting bytes, and, if
-	 * and as soon as we see more bytes than sendmmsg() sent, re-send the
-	 * rest with a blocking call.
-	 *
-	 * In pictures, given this example:
-	 *
-	 *				 	iov #0  iov #1  iov #2  iov #3
-	 * tap_mmh[ret - 1].msg_hdr:		....    ......  .....   ......
-	 * tap_mmh[ret - 1].msg_len:	7	....    ...
-	 *
-	 * when 'msglen' reaches:	10		      ^
-	 * and 'missing' below is:	3	           ---
-	 *
-	 * re-send everything from here:		   ^--  -----   ------
-	 */
-	last_mh = &tap_mmh[ret - 1].msg_hdr;
-	for (i = 0, msg_len = 0; i < last_mh->msg_iovlen; i++) {
-		if (missing <= 0) {
-			msg_len += last_mh->msg_iov[i].iov_len;
-			missing = msg_len - tap_mmh[ret - 1].msg_len;
-		}
-
-		if (missing > 0) {
-			uint8_t **iov_base;
-			int first_offset;
-
-			iov_base = (uint8_t **)&last_mh->msg_iov[i].iov_base;
-			first_offset = last_mh->msg_iov[i].iov_len - missing;
-			*iov_base += first_offset;
-			last_mh->msg_iov[i].iov_len = missing;
-
-			last_mh->msg_iov = &last_mh->msg_iov[i];
-
-			if (sendmsg(c->fd_tap, last_mh, MSG_NOSIGNAL) < 0)
-				debug("UDP: %li bytes to tap missing", missing);
-
-			*iov_base -= first_offset;
-			break;
-		}
-	}
-
-	pcapmm(tap_mmh, ret);
+	else
+		udp_tap_send_passt(c, tap_mmh, msg_i + 1);
 }
 
 /**
-- 
2.38.1


^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [PATCH 3/8] udp: Split receive from preparation and send in udp_sock_handler()
  2022-12-05  8:14 [PATCH 0/8] Don't use additional sockets for receiving "spliced" UDP communications David Gibson
  2022-12-05  8:14 ` [PATCH 1/8] udp: Move sending pasta tap frames to the end of udp_sock_handler() David Gibson
  2022-12-05  8:14 ` [PATCH 2/8] udp: Split sending to passt tap interface into separate function David Gibson
@ 2022-12-05  8:14 ` David Gibson
  2022-12-05  8:14 ` [PATCH 4/8] udp: Receive multiple datagrams at once on the pasta sock->tap path David Gibson
                   ` (5 subsequent siblings)
  8 siblings, 0 replies; 28+ messages in thread
From: David Gibson @ 2022-12-05  8:14 UTC (permalink / raw)
  To: Stefano Brivio, passt-dev; +Cc: David Gibson

The receive part of udp_sock_handler() and udp_sock_handler_splice() is now
almost identical.  In preparation for merging that, split the receive part
of udp_sock_handler() from the part preparing and sending the frames for
sending on the tap interface.  The latter goes into a new udp_tap_send()
function.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
 udp.c | 79 +++++++++++++++++++++++++++++++++++++++--------------------
 1 file changed, 52 insertions(+), 27 deletions(-)

diff --git a/udp.c b/udp.c
index 7281bc3..64c9219 100644
--- a/udp.c
+++ b/udp.c
@@ -880,51 +880,39 @@ static void udp_tap_send_passt(const struct ctx *c, struct mmsghdr *mmh, int n)
 }
 
 /**
- * udp_sock_handler() - Handle new data from socket
+ * udp_tap_send() - Prepare UDP datagrams and send to tap interface
  * @c:		Execution context
- * @ref:	epoll reference
- * @events:	epoll events bitmap
+ * @start:	Index of first datagram in udp[46]_l2_buf pool
+ * @n:		Number of datagrams to send
+ * @dstport:	Destination port number
+ * @v6:		True if using IPv6
  * @now:	Current timestamp
  *
- * #syscalls recvmmsg
+ * Return: size of tap frame with headers
  */
-void udp_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events,
-		      const struct timespec *now)
+static void udp_tap_send(const struct ctx *c,
+			 unsigned int start, unsigned int n,
+			 in_port_t dstport, bool v6, const struct timespec *now)
 {
-	in_port_t dstport = ref.r.p.udp.udp.port;
-	struct mmsghdr *tap_mmh, *sock_mmh;
 	int msg_bufs = 0, msg_i = 0;
-	ssize_t n, msg_len = 0;
+	struct mmsghdr *tap_mmh;
 	struct iovec *tap_iov;
+	ssize_t msg_len = 0;
 	unsigned int i;
 
-	if (events == EPOLLERR)
-		return;
-
-	if (ref.r.p.udp.udp.splice) {
-		udp_sock_handler_splice(c, ref, events, now);
-		return;
-	}
-
-	if (ref.r.p.udp.udp.v6) {
+	if (v6) {
 		tap_mmh = udp6_l2_mh_tap;
-		sock_mmh = udp6_l2_mh_sock;
 		tap_iov = udp6_l2_iov_tap;
 	} else {
 		tap_mmh = udp4_l2_mh_tap;
-		sock_mmh = udp4_l2_mh_sock;
 		tap_iov = udp4_l2_iov_tap;
 	}
 
-	n = recvmmsg(ref.r.s, sock_mmh, UDP_TAP_FRAMES, 0, NULL);
-	if (n <= 0)
-		return;
-
-	tap_mmh[0].msg_hdr.msg_iov = &tap_iov[0];
-	for (i = 0; i < (unsigned)n; i++) {
+	tap_mmh[0].msg_hdr.msg_iov = &tap_iov[start];
+	for (i = start; i < start + n; i++) {
 		size_t buf_len;
 
-		if (ref.r.p.udp.udp.v6)
+		if (v6)
 			buf_len = udp_update_hdr6(c, i, dstport, now);
 		else
 			buf_len = udp_update_hdr4(c, i, dstport, now);
@@ -950,6 +938,43 @@ void udp_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events,
 		udp_tap_send_passt(c, tap_mmh, msg_i + 1);
 }
 
+/**
+ * udp_sock_handler() - Handle new data from socket
+ * @c:		Execution context
+ * @ref:	epoll reference
+ * @events:	epoll events bitmap
+ * @now:	Current timestamp
+ *
+ * #syscalls recvmmsg
+ */
+void udp_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events,
+		      const struct timespec *now)
+{
+	in_port_t dstport = ref.r.p.udp.udp.port;
+	bool v6 = ref.r.p.udp.udp.v6;
+	struct mmsghdr *sock_mmh;
+	ssize_t n;
+
+	if (events == EPOLLERR)
+		return;
+
+	if (ref.r.p.udp.udp.splice) {
+		udp_sock_handler_splice(c, ref, events, now);
+		return;
+	}
+
+	if (ref.r.p.udp.udp.v6)
+		sock_mmh = udp6_l2_mh_sock;
+	else
+		sock_mmh = udp4_l2_mh_sock;
+
+	n = recvmmsg(ref.r.s, sock_mmh, UDP_TAP_FRAMES, 0, NULL);
+	if (n <= 0)
+		return;
+
+	udp_tap_send(c, 0, n, dstport, v6, now);
+}
+
 /**
  * udp_tap_handler() - Handle packets from tap
  * @c:		Execution context
-- 
@@ -880,51 +880,39 @@ static void udp_tap_send_passt(const struct ctx *c, struct mmsghdr *mmh, int n)
 }
 
 /**
- * udp_sock_handler() - Handle new data from socket
+ * udp_tap_send() - Prepare UDP datagrams and send to tap interface
  * @c:		Execution context
- * @ref:	epoll reference
- * @events:	epoll events bitmap
+ * @start:	Index of first datagram in udp[46]_l2_buf pool
+ * @n:		Number of datagrams to send
+ * @dstport:	Destination port number
+ * @v6:		True if using IPv6
  * @now:	Current timestamp
  *
- * #syscalls recvmmsg
+ * Return: size of tap frame with headers
  */
-void udp_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events,
-		      const struct timespec *now)
+static void udp_tap_send(const struct ctx *c,
+			 unsigned int start, unsigned int n,
+			 in_port_t dstport, bool v6, const struct timespec *now)
 {
-	in_port_t dstport = ref.r.p.udp.udp.port;
-	struct mmsghdr *tap_mmh, *sock_mmh;
 	int msg_bufs = 0, msg_i = 0;
-	ssize_t n, msg_len = 0;
+	struct mmsghdr *tap_mmh;
 	struct iovec *tap_iov;
+	ssize_t msg_len = 0;
 	unsigned int i;
 
-	if (events == EPOLLERR)
-		return;
-
-	if (ref.r.p.udp.udp.splice) {
-		udp_sock_handler_splice(c, ref, events, now);
-		return;
-	}
-
-	if (ref.r.p.udp.udp.v6) {
+	if (v6) {
 		tap_mmh = udp6_l2_mh_tap;
-		sock_mmh = udp6_l2_mh_sock;
 		tap_iov = udp6_l2_iov_tap;
 	} else {
 		tap_mmh = udp4_l2_mh_tap;
-		sock_mmh = udp4_l2_mh_sock;
 		tap_iov = udp4_l2_iov_tap;
 	}
 
-	n = recvmmsg(ref.r.s, sock_mmh, UDP_TAP_FRAMES, 0, NULL);
-	if (n <= 0)
-		return;
-
-	tap_mmh[0].msg_hdr.msg_iov = &tap_iov[0];
-	for (i = 0; i < (unsigned)n; i++) {
+	tap_mmh[0].msg_hdr.msg_iov = &tap_iov[start];
+	for (i = start; i < start + n; i++) {
 		size_t buf_len;
 
-		if (ref.r.p.udp.udp.v6)
+		if (v6)
 			buf_len = udp_update_hdr6(c, i, dstport, now);
 		else
 			buf_len = udp_update_hdr4(c, i, dstport, now);
@@ -950,6 +938,43 @@ void udp_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events,
 		udp_tap_send_passt(c, tap_mmh, msg_i + 1);
 }
 
+/**
+ * udp_sock_handler() - Handle new data from socket
+ * @c:		Execution context
+ * @ref:	epoll reference
+ * @events:	epoll events bitmap
+ * @now:	Current timestamp
+ *
+ * #syscalls recvmmsg
+ */
+void udp_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events,
+		      const struct timespec *now)
+{
+	in_port_t dstport = ref.r.p.udp.udp.port;
+	bool v6 = ref.r.p.udp.udp.v6;
+	struct mmsghdr *sock_mmh;
+	ssize_t n;
+
+	if (events == EPOLLERR)
+		return;
+
+	if (ref.r.p.udp.udp.splice) {
+		udp_sock_handler_splice(c, ref, events, now);
+		return;
+	}
+
+	if (ref.r.p.udp.udp.v6)
+		sock_mmh = udp6_l2_mh_sock;
+	else
+		sock_mmh = udp4_l2_mh_sock;
+
+	n = recvmmsg(ref.r.s, sock_mmh, UDP_TAP_FRAMES, 0, NULL);
+	if (n <= 0)
+		return;
+
+	udp_tap_send(c, 0, n, dstport, v6, now);
+}
+
 /**
  * udp_tap_handler() - Handle packets from tap
  * @c:		Execution context
-- 
2.38.1


^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [PATCH 4/8] udp: Receive multiple datagrams at once on the pasta sock->tap path
  2022-12-05  8:14 [PATCH 0/8] Don't use additional sockets for receiving "spliced" UDP communications David Gibson
                   ` (2 preceding siblings ...)
  2022-12-05  8:14 ` [PATCH 3/8] udp: Split receive from preparation and send in udp_sock_handler() David Gibson
@ 2022-12-05  8:14 ` David Gibson
  2022-12-13 22:48   ` Stefano Brivio
  2022-12-05  8:14 ` [PATCH 5/8] udp: Pre-populate msg_names with local address David Gibson
                   ` (4 subsequent siblings)
  8 siblings, 1 reply; 28+ messages in thread
From: David Gibson @ 2022-12-05  8:14 UTC (permalink / raw)
  To: Stefano Brivio, passt-dev; +Cc: David Gibson

Usually udp_sock_handler() will receive and forward multiple (up to 32)
datagrams in udp_sock_handler(), then forward them all to the tap
interface.  For unclear reasons, though, when in pasta mode we will only
receive and forward a single datagram at a time.  Change it to receive
multiple datagrams at once, like the other paths.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
 udp.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/udp.c b/udp.c
index 64c9219..24fa984 100644
--- a/udp.c
+++ b/udp.c
@@ -119,7 +119,6 @@
 
 #define UDP_CONN_TIMEOUT	180 /* s, timeout for ephemeral or local bind */
 #define UDP_MAX_FRAMES		32  /* max # of frames to receive at once */
-#define UDP_TAP_FRAMES		(c->mode == MODE_PASST ? UDP_MAX_FRAMES : 1)
 
 /**
  * struct udp_tap_port - Port tracking based on tap-facing source port
@@ -968,7 +967,7 @@ void udp_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events,
 	else
 		sock_mmh = udp4_l2_mh_sock;
 
-	n = recvmmsg(ref.r.s, sock_mmh, UDP_TAP_FRAMES, 0, NULL);
+	n = recvmmsg(ref.r.s, sock_mmh, UDP_MAX_FRAMES, 0, NULL);
 	if (n <= 0)
 		return;
 
-- 
@@ -119,7 +119,6 @@
 
 #define UDP_CONN_TIMEOUT	180 /* s, timeout for ephemeral or local bind */
 #define UDP_MAX_FRAMES		32  /* max # of frames to receive at once */
-#define UDP_TAP_FRAMES		(c->mode == MODE_PASST ? UDP_MAX_FRAMES : 1)
 
 /**
  * struct udp_tap_port - Port tracking based on tap-facing source port
@@ -968,7 +967,7 @@ void udp_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events,
 	else
 		sock_mmh = udp4_l2_mh_sock;
 
-	n = recvmmsg(ref.r.s, sock_mmh, UDP_TAP_FRAMES, 0, NULL);
+	n = recvmmsg(ref.r.s, sock_mmh, UDP_MAX_FRAMES, 0, NULL);
 	if (n <= 0)
 		return;
 
-- 
2.38.1


^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [PATCH 5/8] udp: Pre-populate msg_names with local address
  2022-12-05  8:14 [PATCH 0/8] Don't use additional sockets for receiving "spliced" UDP communications David Gibson
                   ` (3 preceding siblings ...)
  2022-12-05  8:14 ` [PATCH 4/8] udp: Receive multiple datagrams at once on the pasta sock->tap path David Gibson
@ 2022-12-05  8:14 ` David Gibson
  2022-12-13 22:48   ` Stefano Brivio
  2022-12-05  8:14 ` [PATCH 6/8] udp: Unify udp_sock_handler_splice() with udp_sock_handler() David Gibson
                   ` (3 subsequent siblings)
  8 siblings, 1 reply; 28+ messages in thread
From: David Gibson @ 2022-12-05  8:14 UTC (permalink / raw)
  To: Stefano Brivio, passt-dev; +Cc: David Gibson

udp_splice_namebuf is now used only for spliced sending, and so it only
ever populated with the localhost address, either IPv4 or IPv6.  So,
replace the awkward initialization in udp_sock_handler_splice() with
statically initialized versions for IPv4 and IPv6.  We then just need to
update the port number in udp_sock_handler_splice().

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
 udp.c  | 40 ++++++++++++++++++----------------------
 util.h |  7 +++++++
 2 files changed, 25 insertions(+), 22 deletions(-)

diff --git a/udp.c b/udp.c
index 24fa984..7c601cc 100644
--- a/udp.c
+++ b/udp.c
@@ -232,11 +232,18 @@ static struct mmsghdr	udp4_l2_mh_tap		[UDP_MAX_FRAMES];
 static struct mmsghdr	udp6_l2_mh_tap		[UDP_MAX_FRAMES];
 
 /* recvmmsg()/sendmmsg() data for "spliced" connections */
-static struct sockaddr_storage udp_splice_namebuf;
-
 static struct iovec	udp4_iov_splice		[UDP_MAX_FRAMES];
 static struct iovec	udp6_iov_splice		[UDP_MAX_FRAMES];
 
+static struct sockaddr_in udp_localname4 = {
+	.sin_family = AF_INET,
+	.sin_addr = IN4ADDR_LOOPBACK_INIT,
+};
+static struct sockaddr_in6 udp_localname6 = {
+	.sin6_family = AF_INET6,
+	.sin6_addr = IN6ADDR_LOOPBACK_INIT,
+};
+
 static struct mmsghdr	udp4_mh_splice		[UDP_MAX_FRAMES];
 static struct mmsghdr	udp6_mh_splice		[UDP_MAX_FRAMES];
 
@@ -610,23 +617,10 @@ static void udp_sock_handler_splice(const struct ctx *c, union epoll_ref ref,
 	if (n <= 0)
 		return;
 
-	if (v6) {
-		*((struct sockaddr_in6 *)&udp_splice_namebuf) =
-		 ((struct sockaddr_in6) {
-			.sin6_family = AF_INET6,
-			.sin6_addr = IN6ADDR_LOOPBACK_INIT,
-			.sin6_port = htons(dst),
-			.sin6_scope_id = 0,
-		});
-	} else {
-		*((struct sockaddr_in *)&udp_splice_namebuf) =
-		 ((struct sockaddr_in) {
-			.sin_family = AF_INET,
-			.sin_addr = { .s_addr = htonl(INADDR_LOOPBACK) },
-			.sin_port = htons(dst),
-			.sin_zero = { 0 },
-		});
-	}
+	if (v6)
+		udp_localname6.sin6_port = htons(dst);
+	else
+		udp_localname4.sin_port = htons(dst);
 
 	for (i = 0; i < n; i += m) {
 		in_port_t src = sa_port(v6, mmh_recv[i].msg_hdr.msg_name);
@@ -1252,9 +1246,11 @@ static void udp_splice_iov_init(void)
 		struct msghdr *mh4 = &udp4_mh_splice[i].msg_hdr;
 		struct msghdr *mh6 = &udp6_mh_splice[i].msg_hdr;
 
-		mh4->msg_name = mh6->msg_name = &udp_splice_namebuf;
-		mh4->msg_namelen = sizeof(udp_splice_namebuf);
-		mh6->msg_namelen = sizeof(udp_splice_namebuf);
+		mh4->msg_name = &udp_localname4;
+		mh4->msg_namelen = sizeof(udp_localname4);
+
+		mh6->msg_name = &udp_localname6;
+		mh6->msg_namelen = sizeof(udp_localname6);
 
 		udp4_iov_splice[i].iov_base = udp4_l2_buf[i].data;
 		udp6_iov_splice[i].iov_base = udp6_l2_buf[i].data;
diff --git a/util.h b/util.h
index 02b55a9..3baec91 100644
--- a/util.h
+++ b/util.h
@@ -79,6 +79,13 @@
 	(IN_MULTICAST(ntohl((a)->s_addr)))
 #define IN4_ARE_ADDR_EQUAL(a, b) \
 	(((struct in_addr *)(a))->s_addr == ((struct in_addr *)b)->s_addr)
+#if __BYTE_ORDER == __BIG_ENDIAN
+#define IN4ADDR_LOOPBACK_INIT \
+	{ .s_addr	= INADDR_LOOPBACK }
+#else
+#define IN4ADDR_LOOPBACK_INIT \
+	{ .s_addr	= __bswap_constant_32(INADDR_LOOPBACK) }
+#endif
 
 #define NS_FN_STACK_SIZE	(RLIMIT_STACK_VAL * 1024 / 8)
 int do_clone(int (*fn)(void *), char *stack_area, size_t stack_size, int flags,
-- 
@@ -79,6 +79,13 @@
 	(IN_MULTICAST(ntohl((a)->s_addr)))
 #define IN4_ARE_ADDR_EQUAL(a, b) \
 	(((struct in_addr *)(a))->s_addr == ((struct in_addr *)b)->s_addr)
+#if __BYTE_ORDER == __BIG_ENDIAN
+#define IN4ADDR_LOOPBACK_INIT \
+	{ .s_addr	= INADDR_LOOPBACK }
+#else
+#define IN4ADDR_LOOPBACK_INIT \
+	{ .s_addr	= __bswap_constant_32(INADDR_LOOPBACK) }
+#endif
 
 #define NS_FN_STACK_SIZE	(RLIMIT_STACK_VAL * 1024 / 8)
 int do_clone(int (*fn)(void *), char *stack_area, size_t stack_size, int flags,
-- 
2.38.1


^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [PATCH 6/8] udp: Unify udp_sock_handler_splice() with udp_sock_handler()
  2022-12-05  8:14 [PATCH 0/8] Don't use additional sockets for receiving "spliced" UDP communications David Gibson
                   ` (4 preceding siblings ...)
  2022-12-05  8:14 ` [PATCH 5/8] udp: Pre-populate msg_names with local address David Gibson
@ 2022-12-05  8:14 ` David Gibson
  2022-12-13 22:48   ` Stefano Brivio
  2022-12-05  8:14 ` [PATCH 7/8] udp: Decide whether to "splice" per datagram rather than per socket David Gibson
                   ` (2 subsequent siblings)
  8 siblings, 1 reply; 28+ messages in thread
From: David Gibson @ 2022-12-05  8:14 UTC (permalink / raw)
  To: Stefano Brivio, passt-dev; +Cc: David Gibson

These two functions now have a very similar structure, and their first
part (calling recvmmsg()) is functionally identical.  So, merge the two
functions into one.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
 udp.c | 86 +++++++++++++++++++----------------------------------------
 1 file changed, 28 insertions(+), 58 deletions(-)

diff --git a/udp.c b/udp.c
index 7c601cc..6ccfe8c 100644
--- a/udp.c
+++ b/udp.c
@@ -590,52 +590,6 @@ static void udp_splice_sendfrom(const struct ctx *c, unsigned start, unsigned n,
 	sendmmsg(s, mmh_send + start, n, MSG_NOSIGNAL);
 }
 
-/**
- * udp_sock_handler_splice() - Handler for socket mapped to "spliced" connection
- * @c:		Execution context
- * @ref:	epoll reference
- * @events:	epoll events bitmap
- * @now:	Current timestamp
- */
-static void udp_sock_handler_splice(const struct ctx *c, union epoll_ref ref,
-				    uint32_t events, const struct timespec *now)
-{
-	in_port_t dst = ref.r.p.udp.udp.port;
-	int v6 = ref.r.p.udp.udp.v6, n, i, m;
-	struct mmsghdr *mmh_recv;
-
-	if (!(events & EPOLLIN))
-		return;
-
-	if (v6)
-		mmh_recv = udp6_l2_mh_sock;
-	else
-		mmh_recv = udp4_l2_mh_sock;
-
-	n = recvmmsg(ref.r.s, mmh_recv, UDP_MAX_FRAMES, 0, NULL);
-
-	if (n <= 0)
-		return;
-
-	if (v6)
-		udp_localname6.sin6_port = htons(dst);
-	else
-		udp_localname4.sin_port = htons(dst);
-
-	for (i = 0; i < n; i += m) {
-		in_port_t src = sa_port(v6, mmh_recv[i].msg_hdr.msg_name);
-
-		for (m = 1; i + m < n; m++) {
-			void *mname = mmh_recv[i + m].msg_hdr.msg_name;
-			if (sa_port(v6, mname) != src)
-				break;
-		}
-
-		udp_splice_sendfrom(c, i, m, src, dst, v6, ref.r.p.udp.udp.ns,
-				    ref.r.p.udp.udp.orig, now);
-	}
-}
-
 /**
  * udp_update_hdr4() - Update headers for one IPv4 datagram
  * @c:		Execution context
@@ -945,27 +899,43 @@ void udp_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events,
 {
 	in_port_t dstport = ref.r.p.udp.udp.port;
 	bool v6 = ref.r.p.udp.udp.v6;
-	struct mmsghdr *sock_mmh;
+	struct mmsghdr *mmh_recv;
+	unsigned int i, m;
 	ssize_t n;
 
-	if (events == EPOLLERR)
+	if (!(events & EPOLLIN))
 		return;
 
-	if (ref.r.p.udp.udp.splice) {
-		udp_sock_handler_splice(c, ref, events, now);
-		return;
+	if (v6) {
+		mmh_recv = udp6_l2_mh_sock;
+		udp_localname6.sin6_port = htons(dstport);
+	} else {
+		mmh_recv = udp4_l2_mh_sock;
+		udp_localname4.sin_port = htons(dstport);
 	}
 
-	if (ref.r.p.udp.udp.v6)
-		sock_mmh = udp6_l2_mh_sock;
-	else
-		sock_mmh = udp4_l2_mh_sock;
-
-	n = recvmmsg(ref.r.s, sock_mmh, UDP_MAX_FRAMES, 0, NULL);
+	n = recvmmsg(ref.r.s, mmh_recv, UDP_MAX_FRAMES, 0, NULL);
 	if (n <= 0)
 		return;
 
-	udp_tap_send(c, 0, n, dstport, v6, now);
+	if (!ref.r.p.udp.udp.splice) {
+		udp_tap_send(c, 0, n, dstport, v6, now);
+		return;
+	}
+
+	for (i = 0; i < n; i += m) {
+		in_port_t src = sa_port(v6, mmh_recv[i].msg_hdr.msg_name);
+
+		for (m = 1; i + m < n; m++) {
+			void *mname = mmh_recv[i + m].msg_hdr.msg_name;
+			if (sa_port(v6, mname) != src)
+				break;
+		}
+
+		udp_splice_sendfrom(c, i, m, src, dstport, v6,
+				    ref.r.p.udp.udp.ns, ref.r.p.udp.udp.orig,
+				    now);
+	}
 }
 
 /**
-- 
@@ -590,52 +590,6 @@ static void udp_splice_sendfrom(const struct ctx *c, unsigned start, unsigned n,
 	sendmmsg(s, mmh_send + start, n, MSG_NOSIGNAL);
 }
 
-/**
- * udp_sock_handler_splice() - Handler for socket mapped to "spliced" connection
- * @c:		Execution context
- * @ref:	epoll reference
- * @events:	epoll events bitmap
- * @now:	Current timestamp
- */
-static void udp_sock_handler_splice(const struct ctx *c, union epoll_ref ref,
-				    uint32_t events, const struct timespec *now)
-{
-	in_port_t dst = ref.r.p.udp.udp.port;
-	int v6 = ref.r.p.udp.udp.v6, n, i, m;
-	struct mmsghdr *mmh_recv;
-
-	if (!(events & EPOLLIN))
-		return;
-
-	if (v6)
-		mmh_recv = udp6_l2_mh_sock;
-	else
-		mmh_recv = udp4_l2_mh_sock;
-
-	n = recvmmsg(ref.r.s, mmh_recv, UDP_MAX_FRAMES, 0, NULL);
-
-	if (n <= 0)
-		return;
-
-	if (v6)
-		udp_localname6.sin6_port = htons(dst);
-	else
-		udp_localname4.sin_port = htons(dst);
-
-	for (i = 0; i < n; i += m) {
-		in_port_t src = sa_port(v6, mmh_recv[i].msg_hdr.msg_name);
-
-		for (m = 1; i + m < n; m++) {
-			void *mname = mmh_recv[i + m].msg_hdr.msg_name;
-			if (sa_port(v6, mname) != src)
-				break;
-		}
-
-		udp_splice_sendfrom(c, i, m, src, dst, v6, ref.r.p.udp.udp.ns,
-				    ref.r.p.udp.udp.orig, now);
-	}
-}
-
 /**
  * udp_update_hdr4() - Update headers for one IPv4 datagram
  * @c:		Execution context
@@ -945,27 +899,43 @@ void udp_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events,
 {
 	in_port_t dstport = ref.r.p.udp.udp.port;
 	bool v6 = ref.r.p.udp.udp.v6;
-	struct mmsghdr *sock_mmh;
+	struct mmsghdr *mmh_recv;
+	unsigned int i, m;
 	ssize_t n;
 
-	if (events == EPOLLERR)
+	if (!(events & EPOLLIN))
 		return;
 
-	if (ref.r.p.udp.udp.splice) {
-		udp_sock_handler_splice(c, ref, events, now);
-		return;
+	if (v6) {
+		mmh_recv = udp6_l2_mh_sock;
+		udp_localname6.sin6_port = htons(dstport);
+	} else {
+		mmh_recv = udp4_l2_mh_sock;
+		udp_localname4.sin_port = htons(dstport);
 	}
 
-	if (ref.r.p.udp.udp.v6)
-		sock_mmh = udp6_l2_mh_sock;
-	else
-		sock_mmh = udp4_l2_mh_sock;
-
-	n = recvmmsg(ref.r.s, sock_mmh, UDP_MAX_FRAMES, 0, NULL);
+	n = recvmmsg(ref.r.s, mmh_recv, UDP_MAX_FRAMES, 0, NULL);
 	if (n <= 0)
 		return;
 
-	udp_tap_send(c, 0, n, dstport, v6, now);
+	if (!ref.r.p.udp.udp.splice) {
+		udp_tap_send(c, 0, n, dstport, v6, now);
+		return;
+	}
+
+	for (i = 0; i < n; i += m) {
+		in_port_t src = sa_port(v6, mmh_recv[i].msg_hdr.msg_name);
+
+		for (m = 1; i + m < n; m++) {
+			void *mname = mmh_recv[i + m].msg_hdr.msg_name;
+			if (sa_port(v6, mname) != src)
+				break;
+		}
+
+		udp_splice_sendfrom(c, i, m, src, dstport, v6,
+				    ref.r.p.udp.udp.ns, ref.r.p.udp.udp.orig,
+				    now);
+	}
 }
 
 /**
-- 
2.38.1


^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [PATCH 7/8] udp: Decide whether to "splice" per datagram rather than per socket
  2022-12-05  8:14 [PATCH 0/8] Don't use additional sockets for receiving "spliced" UDP communications David Gibson
                   ` (5 preceding siblings ...)
  2022-12-05  8:14 ` [PATCH 6/8] udp: Unify udp_sock_handler_splice() with udp_sock_handler() David Gibson
@ 2022-12-05  8:14 ` David Gibson
  2022-12-13 22:49   ` Stefano Brivio
  2022-12-05  8:14 ` [PATCH 8/8] udp: Don't use separate sockets to listen for spliced packets David Gibson
  2022-12-06  6:45 ` [PATCH 0/8] Don't use additional sockets for receiving "spliced" UDP communications Stefano Brivio
  8 siblings, 1 reply; 28+ messages in thread
From: David Gibson @ 2022-12-05  8:14 UTC (permalink / raw)
  To: Stefano Brivio, passt-dev; +Cc: David Gibson

Currently we have special sockets for receiving datagrams from locahost
which can use the optimized "splice" path rather than going across the tap
interface.

We want to loosen this so that sockets can receive sockets that will be
forwarded by both the spliced and non-spliced paths.  To do this, we alter
the meaning of the @splice bit in the reference to mean that packets
receieved on this socket *can* be spliced, not that they *will* be spliced.
They'll only actually be spliced if they come from 127.0.0.1 or ::1.

We can't (for now) remove the splice bit entirely, unlike with TCP.  Our
gateway mapping means that if the ns initiates communication to the gw
address, we'll translate that to target 127.0.0.1 on the host side.  Reply
packets will therefore have source address 127.0.0.1 when received on the
host, but these need to go via the tap path where that will be translated
back to the gateway address.  We need the @splice bit to distinguish that
case from packets going from localhost to a port mapped explicitly with
-u which should be spliced.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
 udp.c | 54 +++++++++++++++++++++++++++++++++++-------------------
 udp.h |  2 +-
 2 files changed, 36 insertions(+), 20 deletions(-)

diff --git a/udp.c b/udp.c
index 6ccfe8c..011a157 100644
--- a/udp.c
+++ b/udp.c
@@ -513,16 +513,27 @@ static int udp_splice_new_ns(void *arg)
 }
 
 /**
- * sa_port() - Determine port from a sockaddr_in or sockaddr_in6
+ * udp_mmh_splice_port() - Is source address of message suitable for splicing?
  * @v6:		Is @sa a sockaddr_in6 (otherwise sockaddr_in)?
- * @sa:		Pointer to either sockaddr_in or sockaddr_in6
+ * @mmh:	mmsghdr of incoming message
+ *
+ * Return: if @sa refers to localhost (127.0.0.1 or ::1) the port from
+ *         @sa, otherwise 0.
+ *
+ * NOTE: this relies on the fact that it's not valid to use UDP port 0
  */
-static in_port_t sa_port(bool v6, const void *sa)
+static in_port_t udp_mmh_splice_port(bool v6, const struct mmsghdr *mmh)
 {
-	const struct sockaddr_in6 *sa6 = sa;
-	const struct sockaddr_in *sa4 = sa;
+	const struct sockaddr_in6 *sa6 = mmh->msg_hdr.msg_name;
+	const struct sockaddr_in *sa4 = mmh->msg_hdr.msg_name;;
+
+	if (v6 && IN6_IS_ADDR_LOOPBACK(&sa6->sin6_addr))
+		return ntohs(sa6->sin6_port);
 
-	return v6 ? ntohs(sa6->sin6_port) : ntohs(sa4->sin_port);
+	if (ntohl(sa4->sin_addr.s_addr) == INADDR_LOOPBACK)
+		return ntohs(sa4->sin_port);
+
+	return 0;
 }
 
 /**
@@ -918,23 +929,28 @@ void udp_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events,
 	if (n <= 0)
 		return;
 
-	if (!ref.r.p.udp.udp.splice) {
-		udp_tap_send(c, 0, n, dstport, v6, now);
-		return;
-	}
-
 	for (i = 0; i < n; i += m) {
-		in_port_t src = sa_port(v6, mmh_recv[i].msg_hdr.msg_name);
+		in_port_t splicefrom = 0;
+		m = n;
+
+		if (ref.r.p.udp.udp.splice) {
+			splicefrom = udp_mmh_splice_port(v6, mmh_recv + i);
 
-		for (m = 1; i + m < n; m++) {
-			void *mname = mmh_recv[i + m].msg_hdr.msg_name;
-			if (sa_port(v6, mname) != src)
-				break;
+			for (m = 1; i + m < n; m++) {
+				in_port_t p;
+
+				p = udp_mmh_splice_port(v6, mmh_recv + i + m);
+				if (p != splicefrom)
+					break;
+			}
 		}
 
-		udp_splice_sendfrom(c, i, m, src, dstport, v6,
-				    ref.r.p.udp.udp.ns, ref.r.p.udp.udp.orig,
-				    now);
+		if (splicefrom)
+			udp_splice_sendfrom(c, i, m, splicefrom, dstport,
+					    v6, ref.r.p.udp.udp.ns,
+					    ref.r.p.udp.udp.orig, now);
+		else
+			udp_tap_send(c, i, m, dstport, v6, now);
 	}
 }
 
diff --git a/udp.h b/udp.h
index 053991e..2a03335 100644
--- a/udp.h
+++ b/udp.h
@@ -22,7 +22,7 @@ void udp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s,
 /**
  * union udp_epoll_ref - epoll reference portion for TCP connections
  * @bound:		Set if this file descriptor is a bound socket
- * @splice:		Set if descriptor is associated to "spliced" connection
+ * @splice:		Set if descriptor packets to be "spliced"
  * @orig:		Set if a spliced socket which can originate "connections"
  * @ns:			Set if this is a socket in the pasta network namespace
  * @v6:			Set for IPv6 sockets or connections
-- 
@@ -22,7 +22,7 @@ void udp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s,
 /**
  * union udp_epoll_ref - epoll reference portion for TCP connections
  * @bound:		Set if this file descriptor is a bound socket
- * @splice:		Set if descriptor is associated to "spliced" connection
+ * @splice:		Set if descriptor packets to be "spliced"
  * @orig:		Set if a spliced socket which can originate "connections"
  * @ns:			Set if this is a socket in the pasta network namespace
  * @v6:			Set for IPv6 sockets or connections
-- 
2.38.1


^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [PATCH 8/8] udp: Don't use separate sockets to listen for spliced packets
  2022-12-05  8:14 [PATCH 0/8] Don't use additional sockets for receiving "spliced" UDP communications David Gibson
                   ` (6 preceding siblings ...)
  2022-12-05  8:14 ` [PATCH 7/8] udp: Decide whether to "splice" per datagram rather than per socket David Gibson
@ 2022-12-05  8:14 ` David Gibson
  2022-12-06  6:45 ` [PATCH 0/8] Don't use additional sockets for receiving "spliced" UDP communications Stefano Brivio
  8 siblings, 0 replies; 28+ messages in thread
From: David Gibson @ 2022-12-05  8:14 UTC (permalink / raw)
  To: Stefano Brivio, passt-dev; +Cc: David Gibson

Currently, when ports are forwarded inbound in pasta mode, we open two
sockets for incoming traffic: one listens on the public IP address and will
forward packets to the tuntap interface.  The other listens on localhost
and forwards via "splicing" (resending directly via sockets in the ns).

Now that we've improved the logic about whether we "splice" any individual
packet, we don't need this.  Instead we can have a single socket bound to
0.0.0.0 or ::, marked as able to splice and udp_sock_handler() will deal
with each packet as appropriate.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
 udp.c | 53 +++++++++++++----------------------------------------
 1 file changed, 13 insertions(+), 40 deletions(-)

diff --git a/udp.c b/udp.c
index 011a157..f7b9bdc 100644
--- a/udp.c
+++ b/udp.c
@@ -1118,7 +1118,6 @@ void udp_sock_init(const struct ctx *c, int ns, sa_family_t af,
 		   const void *addr, const char *ifname, in_port_t port)
 {
 	union udp_epoll_ref uref = { .u32 = 0 };
-	const void *bind_addr;
 	int s;
 
 	if (ns) {
@@ -1130,67 +1129,41 @@ void udp_sock_init(const struct ctx *c, int ns, sa_family_t af,
 	}
 
 	if ((af == AF_INET || af == AF_UNSPEC) && c->ifi4) {
-		if (!addr && c->mode == MODE_PASTA)
-			bind_addr = &c->ip4.addr;
-		else
-			bind_addr = addr;
-
 		uref.udp.v6 = 0;
+		uref.udp.splice = (c->mode == MODE_PASTA);
+		uref.udp.orig = true;
 
 		if (!ns) {
-			uref.udp.splice = 0;
-			s = sock_l4(c, AF_INET, IPPROTO_UDP, bind_addr, ifname,
+			s = sock_l4(c, AF_INET, IPPROTO_UDP, addr, ifname,
 				    port, uref.u32);
 
 			udp_tap_map[V4][uref.udp.port].sock = s;
-
-			if (c->mode == MODE_PASTA) {
-				bind_addr = &(uint32_t){ htonl(INADDR_LOOPBACK) };
-				uref.udp.splice = uref.udp.orig = true;
-
-				s = sock_l4(c, AF_INET, IPPROTO_UDP, bind_addr,
-					    ifname, port, uref.u32);
-				udp_splice_init[V4][port].sock = s;
-			}
+			udp_splice_init[V4][port].sock = s;
 		} else {
-			uref.udp.splice = uref.udp.orig = uref.udp.ns = true;
-
-			bind_addr = &(uint32_t){ htonl(INADDR_LOOPBACK) };
+			struct in_addr loopback = { htonl(INADDR_LOOPBACK) };
+			uref.udp.ns = true;
 
-			s = sock_l4(c, AF_INET, IPPROTO_UDP, bind_addr,
+			s = sock_l4(c, AF_INET, IPPROTO_UDP, &loopback,
 				    ifname, port, uref.u32);
 			udp_splice_ns[V4][port].sock = s;
 		}
 	}
 
 	if ((af == AF_INET6 || af == AF_UNSPEC) && c->ifi6) {
-		if (!addr && c->mode == MODE_PASTA)
-			bind_addr = &c->ip6.addr;
-		else
-			bind_addr = addr;
-
 		uref.udp.v6 = 1;
+		uref.udp.splice = (c->mode == MODE_PASTA);
+		uref.udp.orig = true;
 
 		if (!ns) {
-			uref.udp.splice = 0;
-			s = sock_l4(c, AF_INET6, IPPROTO_UDP, bind_addr, ifname,
+			s = sock_l4(c, AF_INET6, IPPROTO_UDP, addr, ifname,
 				    port, uref.u32);
 
 			udp_tap_map[V6][uref.udp.port].sock = s;
-
-			if (c->mode == MODE_PASTA) {
-				bind_addr = &in6addr_loopback;
-				uref.udp.splice = uref.udp.orig = true;
-
-				s = sock_l4(c, AF_INET6, IPPROTO_UDP, bind_addr,
-					    ifname, port, uref.u32);
-				udp_splice_init[V6][port].sock = s;
-			}
+			udp_splice_init[V6][port].sock = s;
 		} else {
-			bind_addr = &in6addr_loopback;
-			uref.udp.splice = uref.udp.orig = uref.udp.ns = true;
+			uref.udp.ns = true;
 
-			s = sock_l4(c, AF_INET6, IPPROTO_UDP, bind_addr,
+			s = sock_l4(c, AF_INET6, IPPROTO_UDP, &in6addr_loopback,
 				    ifname, port, uref.u32);
 			udp_splice_ns[V6][port].sock = s;
 		}
-- 
@@ -1118,7 +1118,6 @@ void udp_sock_init(const struct ctx *c, int ns, sa_family_t af,
 		   const void *addr, const char *ifname, in_port_t port)
 {
 	union udp_epoll_ref uref = { .u32 = 0 };
-	const void *bind_addr;
 	int s;
 
 	if (ns) {
@@ -1130,67 +1129,41 @@ void udp_sock_init(const struct ctx *c, int ns, sa_family_t af,
 	}
 
 	if ((af == AF_INET || af == AF_UNSPEC) && c->ifi4) {
-		if (!addr && c->mode == MODE_PASTA)
-			bind_addr = &c->ip4.addr;
-		else
-			bind_addr = addr;
-
 		uref.udp.v6 = 0;
+		uref.udp.splice = (c->mode == MODE_PASTA);
+		uref.udp.orig = true;
 
 		if (!ns) {
-			uref.udp.splice = 0;
-			s = sock_l4(c, AF_INET, IPPROTO_UDP, bind_addr, ifname,
+			s = sock_l4(c, AF_INET, IPPROTO_UDP, addr, ifname,
 				    port, uref.u32);
 
 			udp_tap_map[V4][uref.udp.port].sock = s;
-
-			if (c->mode == MODE_PASTA) {
-				bind_addr = &(uint32_t){ htonl(INADDR_LOOPBACK) };
-				uref.udp.splice = uref.udp.orig = true;
-
-				s = sock_l4(c, AF_INET, IPPROTO_UDP, bind_addr,
-					    ifname, port, uref.u32);
-				udp_splice_init[V4][port].sock = s;
-			}
+			udp_splice_init[V4][port].sock = s;
 		} else {
-			uref.udp.splice = uref.udp.orig = uref.udp.ns = true;
-
-			bind_addr = &(uint32_t){ htonl(INADDR_LOOPBACK) };
+			struct in_addr loopback = { htonl(INADDR_LOOPBACK) };
+			uref.udp.ns = true;
 
-			s = sock_l4(c, AF_INET, IPPROTO_UDP, bind_addr,
+			s = sock_l4(c, AF_INET, IPPROTO_UDP, &loopback,
 				    ifname, port, uref.u32);
 			udp_splice_ns[V4][port].sock = s;
 		}
 	}
 
 	if ((af == AF_INET6 || af == AF_UNSPEC) && c->ifi6) {
-		if (!addr && c->mode == MODE_PASTA)
-			bind_addr = &c->ip6.addr;
-		else
-			bind_addr = addr;
-
 		uref.udp.v6 = 1;
+		uref.udp.splice = (c->mode == MODE_PASTA);
+		uref.udp.orig = true;
 
 		if (!ns) {
-			uref.udp.splice = 0;
-			s = sock_l4(c, AF_INET6, IPPROTO_UDP, bind_addr, ifname,
+			s = sock_l4(c, AF_INET6, IPPROTO_UDP, addr, ifname,
 				    port, uref.u32);
 
 			udp_tap_map[V6][uref.udp.port].sock = s;
-
-			if (c->mode == MODE_PASTA) {
-				bind_addr = &in6addr_loopback;
-				uref.udp.splice = uref.udp.orig = true;
-
-				s = sock_l4(c, AF_INET6, IPPROTO_UDP, bind_addr,
-					    ifname, port, uref.u32);
-				udp_splice_init[V6][port].sock = s;
-			}
+			udp_splice_init[V6][port].sock = s;
 		} else {
-			bind_addr = &in6addr_loopback;
-			uref.udp.splice = uref.udp.orig = uref.udp.ns = true;
+			uref.udp.ns = true;
 
-			s = sock_l4(c, AF_INET6, IPPROTO_UDP, bind_addr,
+			s = sock_l4(c, AF_INET6, IPPROTO_UDP, &in6addr_loopback,
 				    ifname, port, uref.u32);
 			udp_splice_ns[V6][port].sock = s;
 		}
-- 
2.38.1


^ permalink raw reply related	[flat|nested] 28+ messages in thread

* Re: [PATCH 0/8] Don't use additional sockets for receiving "spliced" UDP communications
  2022-12-05  8:14 [PATCH 0/8] Don't use additional sockets for receiving "spliced" UDP communications David Gibson
                   ` (7 preceding siblings ...)
  2022-12-05  8:14 ` [PATCH 8/8] udp: Don't use separate sockets to listen for spliced packets David Gibson
@ 2022-12-06  6:45 ` Stefano Brivio
  2022-12-06  6:46   ` Stefano Brivio
  8 siblings, 1 reply; 28+ messages in thread
From: Stefano Brivio @ 2022-12-06  6:45 UTC (permalink / raw)
  To: David Gibson; +Cc: passt-dev

On Mon,  5 Dec 2022 19:14:17 +1100
David Gibson <david@gibson.dropbear.id.au> wrote:

> At present, the UDP "splice" and "tap" paths are quite separate.  We
> have separate sockets to receive packets bound for the tap and splice
> paths.  This leads to some code duplication, and extra open sockets.
> 
> This series partially unifies the two paths, allowing us to use a
> single (host side) socket, bound to 0.0.0.0 or :: to receive packets
> for both cases.
> 
> This is based on my earlier series with some fixes for the tap path.

Applied and pushed too.

-- 
Stefano


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 0/8] Don't use additional sockets for receiving "spliced" UDP communications
  2022-12-06  6:45 ` [PATCH 0/8] Don't use additional sockets for receiving "spliced" UDP communications Stefano Brivio
@ 2022-12-06  6:46   ` Stefano Brivio
  0 siblings, 0 replies; 28+ messages in thread
From: Stefano Brivio @ 2022-12-06  6:46 UTC (permalink / raw)
  To: David Gibson; +Cc: passt-dev

On Tue, 6 Dec 2022 07:45:42 +0100
Stefano Brivio <sbrivio@redhat.com> wrote:

> On Mon,  5 Dec 2022 19:14:17 +1100
> David Gibson <david@gibson.dropbear.id.au> wrote:
> 
> > At present, the UDP "splice" and "tap" paths are quite separate.  We
> > have separate sockets to receive packets bound for the tap and splice
> > paths.  This leads to some code duplication, and extra open sockets.
> > 
> > This series partially unifies the two paths, allowing us to use a
> > single (host side) socket, bound to 0.0.0.0 or :: to receive packets
> > for both cases.
> > 
> > This is based on my earlier series with some fixes for the tap path.  
> 
> Applied and pushed too.

Oops, sorry, not this one -- I'm still reviewing it.

-- 
Stefano


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 4/8] udp: Receive multiple datagrams at once on the pasta sock->tap path
  2022-12-05  8:14 ` [PATCH 4/8] udp: Receive multiple datagrams at once on the pasta sock->tap path David Gibson
@ 2022-12-13 22:48   ` Stefano Brivio
  2022-12-14  1:42     ` David Gibson
  0 siblings, 1 reply; 28+ messages in thread
From: Stefano Brivio @ 2022-12-13 22:48 UTC (permalink / raw)
  To: David Gibson; +Cc: passt-dev

Sorry for the long delay here,

On Mon,  5 Dec 2022 19:14:21 +1100
David Gibson <david@gibson.dropbear.id.au> wrote:

> Usually udp_sock_handler() will receive and forward multiple (up to 32)
> datagrams in udp_sock_handler(), then forward them all to the tap
> interface.  For unclear reasons, though, when in pasta mode we will only
> receive and forward a single datagram at a time.  Change it to receive
> multiple datagrams at once, like the other paths.

This is explained in the commit message of 6c931118643c ("tcp, udp:
Receive batching doesn't pay off when writing single frames to tap").

I think it's worth re-checking the throughput now as this path is a bit
different, but unfortunately I didn't include this in the "perf" tests :(
because at the time I introduced those I wasn't sure it even made sense to
have traffic from the same host being directed to the tap device.

The iperf3 runs were I observed this are actually the ones from the Podman
demo. Ideally that case should be also checked in the perf/pasta_udp tests.

How fundamental is this for the rest of the series? I couldn't find any
actual dependency on this but I might be missing something.

-- 
Stefano


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 5/8] udp: Pre-populate msg_names with local address
  2022-12-05  8:14 ` [PATCH 5/8] udp: Pre-populate msg_names with local address David Gibson
@ 2022-12-13 22:48   ` Stefano Brivio
  2022-12-14  1:22     ` David Gibson
  0 siblings, 1 reply; 28+ messages in thread
From: Stefano Brivio @ 2022-12-13 22:48 UTC (permalink / raw)
  To: David Gibson; +Cc: passt-dev

On Mon,  5 Dec 2022 19:14:22 +1100
David Gibson <david@gibson.dropbear.id.au> wrote:

> udp_splice_namebuf is now used only for spliced sending, and so it only
> ever populated with the localhost address, either IPv4 or IPv6.  So,
> replace the awkward initialization in udp_sock_handler_splice() with
> statically initialized versions for IPv4 and IPv6.  We then just need to
> update the port number in udp_sock_handler_splice().
> 
> Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
> ---
>  udp.c  | 40 ++++++++++++++++++----------------------
>  util.h |  7 +++++++
>  2 files changed, 25 insertions(+), 22 deletions(-)
> 
> diff --git a/udp.c b/udp.c
> index 24fa984..7c601cc 100644
> --- a/udp.c
> +++ b/udp.c
> @@ -232,11 +232,18 @@ static struct mmsghdr	udp4_l2_mh_tap		[UDP_MAX_FRAMES];
>  static struct mmsghdr	udp6_l2_mh_tap		[UDP_MAX_FRAMES];
>  
>  /* recvmmsg()/sendmmsg() data for "spliced" connections */
> -static struct sockaddr_storage udp_splice_namebuf;
> -
>  static struct iovec	udp4_iov_splice		[UDP_MAX_FRAMES];
>  static struct iovec	udp6_iov_splice		[UDP_MAX_FRAMES];
>  
> +static struct sockaddr_in udp_localname4 = {
> +	.sin_family = AF_INET,
> +	.sin_addr = IN4ADDR_LOOPBACK_INIT,
> +};
> +static struct sockaddr_in6 udp_localname6 = {
> +	.sin6_family = AF_INET6,
> +	.sin6_addr = IN6ADDR_LOOPBACK_INIT,
> +};

Nit, not a strong preference and not worth re-spinning just for this: I
think udp4_localname and udp6_localname would be more consistent with
everything else here, hence easier to type without double checking.

-- 
Stefano


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 6/8] udp: Unify udp_sock_handler_splice() with udp_sock_handler()
  2022-12-05  8:14 ` [PATCH 6/8] udp: Unify udp_sock_handler_splice() with udp_sock_handler() David Gibson
@ 2022-12-13 22:48   ` Stefano Brivio
  2022-12-14  1:19     ` David Gibson
  0 siblings, 1 reply; 28+ messages in thread
From: Stefano Brivio @ 2022-12-13 22:48 UTC (permalink / raw)
  To: David Gibson; +Cc: passt-dev

On Mon,  5 Dec 2022 19:14:23 +1100
David Gibson <david@gibson.dropbear.id.au> wrote:

> These two functions now have a very similar structure, and their first
> part (calling recvmmsg()) is functionally identical.  So, merge the two
> functions into one.
> 
> Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
> ---
>  udp.c | 86 +++++++++++++++++++----------------------------------------
>  1 file changed, 28 insertions(+), 58 deletions(-)
> 
> diff --git a/udp.c b/udp.c
> index 7c601cc..6ccfe8c 100644
> --- a/udp.c
> +++ b/udp.c
> @@ -590,52 +590,6 @@ static void udp_splice_sendfrom(const struct ctx *c, unsigned start, unsigned n,
>  	sendmmsg(s, mmh_send + start, n, MSG_NOSIGNAL);
>  }
>  
> -/**
> - * udp_sock_handler_splice() - Handler for socket mapped to "spliced" connection
> - * @c:		Execution context
> - * @ref:	epoll reference
> - * @events:	epoll events bitmap
> - * @now:	Current timestamp
> - */
> -static void udp_sock_handler_splice(const struct ctx *c, union epoll_ref ref,
> -				    uint32_t events, const struct timespec *now)
> -{
> -	in_port_t dst = ref.r.p.udp.udp.port;
> -	int v6 = ref.r.p.udp.udp.v6, n, i, m;
> -	struct mmsghdr *mmh_recv;
> -
> -	if (!(events & EPOLLIN))
> -		return;
> -
> -	if (v6)
> -		mmh_recv = udp6_l2_mh_sock;
> -	else
> -		mmh_recv = udp4_l2_mh_sock;
> -
> -	n = recvmmsg(ref.r.s, mmh_recv, UDP_MAX_FRAMES, 0, NULL);
> -
> -	if (n <= 0)
> -		return;
> -
> -	if (v6)
> -		udp_localname6.sin6_port = htons(dst);
> -	else
> -		udp_localname4.sin_port = htons(dst);
> -
> -	for (i = 0; i < n; i += m) {
> -		in_port_t src = sa_port(v6, mmh_recv[i].msg_hdr.msg_name);
> -
> -		for (m = 1; i + m < n; m++) {
> -			void *mname = mmh_recv[i + m].msg_hdr.msg_name;
> -			if (sa_port(v6, mname) != src)
> -				break;
> -		}
> -
> -		udp_splice_sendfrom(c, i, m, src, dst, v6, ref.r.p.udp.udp.ns,
> -				    ref.r.p.udp.udp.orig, now);
> -	}
> -}
> -
>  /**
>   * udp_update_hdr4() - Update headers for one IPv4 datagram
>   * @c:		Execution context
> @@ -945,27 +899,43 @@ void udp_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events,
>  {
>  	in_port_t dstport = ref.r.p.udp.udp.port;
>  	bool v6 = ref.r.p.udp.udp.v6;
> -	struct mmsghdr *sock_mmh;
> +	struct mmsghdr *mmh_recv;
> +	unsigned int i, m;
>  	ssize_t n;
>  
> -	if (events == EPOLLERR)
> +	if (!(events & EPOLLIN))

Pre-existing, unrelated issue, but this reminds me: we don't handle
socket errors here, and while udp_timer_one() will drop any sockets we
created, eventually, it would probably be better to act right away.

Not that I have in mind a valid example of an error on UDP sockets,
except perhaps if the interface goes down (but we'll handle that
separately).

-- 
Stefano


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 7/8] udp: Decide whether to "splice" per datagram rather than per socket
  2022-12-05  8:14 ` [PATCH 7/8] udp: Decide whether to "splice" per datagram rather than per socket David Gibson
@ 2022-12-13 22:49   ` Stefano Brivio
  2022-12-14  1:47     ` David Gibson
  0 siblings, 1 reply; 28+ messages in thread
From: Stefano Brivio @ 2022-12-13 22:49 UTC (permalink / raw)
  To: David Gibson; +Cc: passt-dev

On Mon,  5 Dec 2022 19:14:24 +1100
David Gibson <david@gibson.dropbear.id.au> wrote:

> Currently we have special sockets for receiving datagrams from locahost
> which can use the optimized "splice" path rather than going across the tap
> interface.
> 
> We want to loosen this so that sockets can receive sockets that will be
> forwarded by both the spliced and non-spliced paths.  To do this, we alter
> the meaning of the @splice bit in the reference to mean that packets
> receieved on this socket *can* be spliced, not that they *will* be spliced.
> They'll only actually be spliced if they come from 127.0.0.1 or ::1.
> 
> We can't (for now) remove the splice bit entirely, unlike with TCP.  Our
> gateway mapping means that if the ns initiates communication to the gw
> address, we'll translate that to target 127.0.0.1 on the host side.  Reply
> packets will therefore have source address 127.0.0.1 when received on the
> host, but these need to go via the tap path where that will be translated
> back to the gateway address.  We need the @splice bit to distinguish that
> case from packets going from localhost to a port mapped explicitly with
> -u which should be spliced.
> 
> Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
> ---
>  udp.c | 54 +++++++++++++++++++++++++++++++++++-------------------
>  udp.h |  2 +-
>  2 files changed, 36 insertions(+), 20 deletions(-)
> 
> diff --git a/udp.c b/udp.c
> index 6ccfe8c..011a157 100644
> --- a/udp.c
> +++ b/udp.c
> @@ -513,16 +513,27 @@ static int udp_splice_new_ns(void *arg)
>  }
>  
>  /**
> - * sa_port() - Determine port from a sockaddr_in or sockaddr_in6
> + * udp_mmh_splice_port() - Is source address of message suitable for splicing?
>   * @v6:		Is @sa a sockaddr_in6 (otherwise sockaddr_in)?
> - * @sa:		Pointer to either sockaddr_in or sockaddr_in6
> + * @mmh:	mmsghdr of incoming message
> + *
> + * Return: if @sa refers to localhost (127.0.0.1 or ::1) the port from
> + *         @sa, otherwise 0.
> + *
> + * NOTE: this relies on the fact that it's not valid to use UDP port 0

The port is reserved by IANA indeed, but... it can actually be used. On
Linux, you can bind() it and you can connect() to it. As far as I can
tell from the new version of udp_sock_handler() we would actually
misdirect packets in that case.

How bad would it be to use an int here?

By the way, I think the comment should also mention that the port is
returned in host order.

-- 
Stefano


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 6/8] udp: Unify udp_sock_handler_splice() with udp_sock_handler()
  2022-12-13 22:48   ` Stefano Brivio
@ 2022-12-14  1:19     ` David Gibson
  2022-12-14 10:35       ` Stefano Brivio
  0 siblings, 1 reply; 28+ messages in thread
From: David Gibson @ 2022-12-14  1:19 UTC (permalink / raw)
  To: Stefano Brivio; +Cc: passt-dev

[-- Attachment #1: Type: text/plain, Size: 3321 bytes --]

On Tue, Dec 13, 2022 at 11:48:58PM +0100, Stefano Brivio wrote:
> On Mon,  5 Dec 2022 19:14:23 +1100
> David Gibson <david@gibson.dropbear.id.au> wrote:
> 
> > These two functions now have a very similar structure, and their first
> > part (calling recvmmsg()) is functionally identical.  So, merge the two
> > functions into one.
> > 
> > Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
> > ---
> >  udp.c | 86 +++++++++++++++++++----------------------------------------
> >  1 file changed, 28 insertions(+), 58 deletions(-)
> > 
> > diff --git a/udp.c b/udp.c
> > index 7c601cc..6ccfe8c 100644
> > --- a/udp.c
> > +++ b/udp.c
> > @@ -590,52 +590,6 @@ static void udp_splice_sendfrom(const struct ctx *c, unsigned start, unsigned n,
> >  	sendmmsg(s, mmh_send + start, n, MSG_NOSIGNAL);
> >  }
> >  
> > -/**
> > - * udp_sock_handler_splice() - Handler for socket mapped to "spliced" connection
> > - * @c:		Execution context
> > - * @ref:	epoll reference
> > - * @events:	epoll events bitmap
> > - * @now:	Current timestamp
> > - */
> > -static void udp_sock_handler_splice(const struct ctx *c, union epoll_ref ref,
> > -				    uint32_t events, const struct timespec *now)
> > -{
> > -	in_port_t dst = ref.r.p.udp.udp.port;
> > -	int v6 = ref.r.p.udp.udp.v6, n, i, m;
> > -	struct mmsghdr *mmh_recv;
> > -
> > -	if (!(events & EPOLLIN))
> > -		return;
> > -
> > -	if (v6)
> > -		mmh_recv = udp6_l2_mh_sock;
> > -	else
> > -		mmh_recv = udp4_l2_mh_sock;
> > -
> > -	n = recvmmsg(ref.r.s, mmh_recv, UDP_MAX_FRAMES, 0, NULL);
> > -
> > -	if (n <= 0)
> > -		return;
> > -
> > -	if (v6)
> > -		udp_localname6.sin6_port = htons(dst);
> > -	else
> > -		udp_localname4.sin_port = htons(dst);
> > -
> > -	for (i = 0; i < n; i += m) {
> > -		in_port_t src = sa_port(v6, mmh_recv[i].msg_hdr.msg_name);
> > -
> > -		for (m = 1; i + m < n; m++) {
> > -			void *mname = mmh_recv[i + m].msg_hdr.msg_name;
> > -			if (sa_port(v6, mname) != src)
> > -				break;
> > -		}
> > -
> > -		udp_splice_sendfrom(c, i, m, src, dst, v6, ref.r.p.udp.udp.ns,
> > -				    ref.r.p.udp.udp.orig, now);
> > -	}
> > -}
> > -
> >  /**
> >   * udp_update_hdr4() - Update headers for one IPv4 datagram
> >   * @c:		Execution context
> > @@ -945,27 +899,43 @@ void udp_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events,
> >  {
> >  	in_port_t dstport = ref.r.p.udp.udp.port;
> >  	bool v6 = ref.r.p.udp.udp.v6;
> > -	struct mmsghdr *sock_mmh;
> > +	struct mmsghdr *mmh_recv;
> > +	unsigned int i, m;
> >  	ssize_t n;
> >  
> > -	if (events == EPOLLERR)
> > +	if (!(events & EPOLLIN))
> 
> Pre-existing, unrelated issue, but this reminds me: we don't handle
> socket errors here, and while udp_timer_one() will drop any sockets we
> created, eventually, it would probably be better to act right away.

Ok... I'm not sure what, if anything, you would like me to do about
it, however.

> Not that I have in mind a valid example of an error on UDP sockets,
> except perhaps if the interface goes down (but we'll handle that
> separately).

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 5/8] udp: Pre-populate msg_names with local address
  2022-12-13 22:48   ` Stefano Brivio
@ 2022-12-14  1:22     ` David Gibson
  0 siblings, 0 replies; 28+ messages in thread
From: David Gibson @ 2022-12-14  1:22 UTC (permalink / raw)
  To: Stefano Brivio; +Cc: passt-dev

[-- Attachment #1: Type: text/plain, Size: 2019 bytes --]

On Tue, Dec 13, 2022 at 11:48:52PM +0100, Stefano Brivio wrote:
> On Mon,  5 Dec 2022 19:14:22 +1100
> David Gibson <david@gibson.dropbear.id.au> wrote:
> 
> > udp_splice_namebuf is now used only for spliced sending, and so it only
> > ever populated with the localhost address, either IPv4 or IPv6.  So,
> > replace the awkward initialization in udp_sock_handler_splice() with
> > statically initialized versions for IPv4 and IPv6.  We then just need to
> > update the port number in udp_sock_handler_splice().
> > 
> > Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
> > ---
> >  udp.c  | 40 ++++++++++++++++++----------------------
> >  util.h |  7 +++++++
> >  2 files changed, 25 insertions(+), 22 deletions(-)
> > 
> > diff --git a/udp.c b/udp.c
> > index 24fa984..7c601cc 100644
> > --- a/udp.c
> > +++ b/udp.c
> > @@ -232,11 +232,18 @@ static struct mmsghdr	udp4_l2_mh_tap		[UDP_MAX_FRAMES];
> >  static struct mmsghdr	udp6_l2_mh_tap		[UDP_MAX_FRAMES];
> >  
> >  /* recvmmsg()/sendmmsg() data for "spliced" connections */
> > -static struct sockaddr_storage udp_splice_namebuf;
> > -
> >  static struct iovec	udp4_iov_splice		[UDP_MAX_FRAMES];
> >  static struct iovec	udp6_iov_splice		[UDP_MAX_FRAMES];
> >  
> > +static struct sockaddr_in udp_localname4 = {
> > +	.sin_family = AF_INET,
> > +	.sin_addr = IN4ADDR_LOOPBACK_INIT,
> > +};
> > +static struct sockaddr_in6 udp_localname6 = {
> > +	.sin6_family = AF_INET6,
> > +	.sin6_addr = IN6ADDR_LOOPBACK_INIT,
> > +};
> 
> Nit, not a strong preference and not worth re-spinning just for this: I
> think udp4_localname and udp6_localname would be more consistent with
> everything else here, hence easier to type without double checking.

Good point.  I'll change it only if I need a respin for other reasons.

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 4/8] udp: Receive multiple datagrams at once on the pasta sock->tap path
  2022-12-13 22:48   ` Stefano Brivio
@ 2022-12-14  1:42     ` David Gibson
  2022-12-14 10:35       ` Stefano Brivio
  0 siblings, 1 reply; 28+ messages in thread
From: David Gibson @ 2022-12-14  1:42 UTC (permalink / raw)
  To: Stefano Brivio; +Cc: passt-dev

[-- Attachment #1: Type: text/plain, Size: 1791 bytes --]

On Tue, Dec 13, 2022 at 11:48:47PM +0100, Stefano Brivio wrote:
> Sorry for the long delay here,
> 
> On Mon,  5 Dec 2022 19:14:21 +1100
> David Gibson <david@gibson.dropbear.id.au> wrote:
> 
> > Usually udp_sock_handler() will receive and forward multiple (up to 32)
> > datagrams in udp_sock_handler(), then forward them all to the tap
> > interface.  For unclear reasons, though, when in pasta mode we will only
> > receive and forward a single datagram at a time.  Change it to receive
> > multiple datagrams at once, like the other paths.
> 
> This is explained in the commit message of 6c931118643c ("tcp, udp:
> Receive batching doesn't pay off when writing single frames to tap").
> 
> I think it's worth re-checking the throughput now as this path is a bit
> different, but unfortunately I didn't include this in the "perf" tests :(
> because at the time I introduced those I wasn't sure it even made sense to
> have traffic from the same host being directed to the tap device.
> 
> The iperf3 runs were I observed this are actually the ones from the Podman
> demo. Ideally that case should be also checked in the perf/pasta_udp tests.

Hm, ok.

> How fundamental is this for the rest of the series? I couldn't find any
> actual dependency on this but I might be missing something.

So the issue is that prior to this change in pasta we receive multiple
frames at once on the splice path, but one frame at a time on the tap
path.  By the end of this series we can't do that any more, because we
don't know before the recvmmsg() which one we'll be doing.

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 7/8] udp: Decide whether to "splice" per datagram rather than per socket
  2022-12-13 22:49   ` Stefano Brivio
@ 2022-12-14  1:47     ` David Gibson
  2022-12-14 10:35       ` Stefano Brivio
  0 siblings, 1 reply; 28+ messages in thread
From: David Gibson @ 2022-12-14  1:47 UTC (permalink / raw)
  To: Stefano Brivio; +Cc: passt-dev

[-- Attachment #1: Type: text/plain, Size: 3466 bytes --]

On Tue, Dec 13, 2022 at 11:49:18PM +0100, Stefano Brivio wrote:
> On Mon,  5 Dec 2022 19:14:24 +1100
> David Gibson <david@gibson.dropbear.id.au> wrote:
> 
> > Currently we have special sockets for receiving datagrams from locahost
> > which can use the optimized "splice" path rather than going across the tap
> > interface.
> > 
> > We want to loosen this so that sockets can receive sockets that will be
> > forwarded by both the spliced and non-spliced paths.  To do this, we alter
> > the meaning of the @splice bit in the reference to mean that packets
> > receieved on this socket *can* be spliced, not that they *will* be spliced.
> > They'll only actually be spliced if they come from 127.0.0.1 or ::1.
> > 
> > We can't (for now) remove the splice bit entirely, unlike with TCP.  Our
> > gateway mapping means that if the ns initiates communication to the gw
> > address, we'll translate that to target 127.0.0.1 on the host side.  Reply
> > packets will therefore have source address 127.0.0.1 when received on the
> > host, but these need to go via the tap path where that will be translated
> > back to the gateway address.  We need the @splice bit to distinguish that
> > case from packets going from localhost to a port mapped explicitly with
> > -u which should be spliced.
> > 
> > Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
> > ---
> >  udp.c | 54 +++++++++++++++++++++++++++++++++++-------------------
> >  udp.h |  2 +-
> >  2 files changed, 36 insertions(+), 20 deletions(-)
> > 
> > diff --git a/udp.c b/udp.c
> > index 6ccfe8c..011a157 100644
> > --- a/udp.c
> > +++ b/udp.c
> > @@ -513,16 +513,27 @@ static int udp_splice_new_ns(void *arg)
> >  }
> >  
> >  /**
> > - * sa_port() - Determine port from a sockaddr_in or sockaddr_in6
> > + * udp_mmh_splice_port() - Is source address of message suitable for splicing?
> >   * @v6:		Is @sa a sockaddr_in6 (otherwise sockaddr_in)?
> > - * @sa:		Pointer to either sockaddr_in or sockaddr_in6
> > + * @mmh:	mmsghdr of incoming message
> > + *
> > + * Return: if @sa refers to localhost (127.0.0.1 or ::1) the port from
> > + *         @sa, otherwise 0.
> > + *
> > + * NOTE: this relies on the fact that it's not valid to use UDP port 0
> 
> The port is reserved by IANA indeed, but... it can actually be used. On
> Linux, you can bind() it and you can connect() to it. As far as I can
> tell from the new version of udp_sock_handler() we would actually
> misdirect packets in that case.

Hm, ok.  Given the IANA reservation, I think it would be acceptable to
simply drop such packets - but if we were to make that choice we
should do so explicitly, rather than misdirecting them.

> How bad would it be to use an int here?

Pretty straightforward.  Just means we have to use the somewhat
abtruse "if (port <= USHRT_MAX)" or "if (port >= 0)" or something
instead of just "if (port)".  Should I go ahead and make that change?

> By the way, I think the comment should also mention that the port is
> returned in host order.

Ok, easily done.  Generally I try to keep the endianness associated
with the type, rather than attempting to document it for each variable
(or even worse, each point in time for each variable).

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 6/8] udp: Unify udp_sock_handler_splice() with udp_sock_handler()
  2022-12-14  1:19     ` David Gibson
@ 2022-12-14 10:35       ` Stefano Brivio
  0 siblings, 0 replies; 28+ messages in thread
From: Stefano Brivio @ 2022-12-14 10:35 UTC (permalink / raw)
  To: David Gibson; +Cc: passt-dev

On Wed, 14 Dec 2022 12:19:14 +1100
David Gibson <david@gibson.dropbear.id.au> wrote:

> On Tue, Dec 13, 2022 at 11:48:58PM +0100, Stefano Brivio wrote:
> > On Mon,  5 Dec 2022 19:14:23 +1100
> > David Gibson <david@gibson.dropbear.id.au> wrote:
> >   
> > > These two functions now have a very similar structure, and their first
> > > part (calling recvmmsg()) is functionally identical.  So, merge the two
> > > functions into one.
> > > 
> > > Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
> > > ---
> > >  udp.c | 86 +++++++++++++++++++----------------------------------------
> > >  1 file changed, 28 insertions(+), 58 deletions(-)
> > > 
> > > diff --git a/udp.c b/udp.c
> > > index 7c601cc..6ccfe8c 100644
> > > --- a/udp.c
> > > +++ b/udp.c
> > > @@ -590,52 +590,6 @@ static void udp_splice_sendfrom(const struct ctx *c, unsigned start, unsigned n,
> > >  	sendmmsg(s, mmh_send + start, n, MSG_NOSIGNAL);
> > >  }
> > >  
> > > -/**
> > > - * udp_sock_handler_splice() - Handler for socket mapped to "spliced" connection
> > > - * @c:		Execution context
> > > - * @ref:	epoll reference
> > > - * @events:	epoll events bitmap
> > > - * @now:	Current timestamp
> > > - */
> > > -static void udp_sock_handler_splice(const struct ctx *c, union epoll_ref ref,
> > > -				    uint32_t events, const struct timespec *now)
> > > -{
> > > -	in_port_t dst = ref.r.p.udp.udp.port;
> > > -	int v6 = ref.r.p.udp.udp.v6, n, i, m;
> > > -	struct mmsghdr *mmh_recv;
> > > -
> > > -	if (!(events & EPOLLIN))
> > > -		return;
> > > -
> > > -	if (v6)
> > > -		mmh_recv = udp6_l2_mh_sock;
> > > -	else
> > > -		mmh_recv = udp4_l2_mh_sock;
> > > -
> > > -	n = recvmmsg(ref.r.s, mmh_recv, UDP_MAX_FRAMES, 0, NULL);
> > > -
> > > -	if (n <= 0)
> > > -		return;
> > > -
> > > -	if (v6)
> > > -		udp_localname6.sin6_port = htons(dst);
> > > -	else
> > > -		udp_localname4.sin_port = htons(dst);
> > > -
> > > -	for (i = 0; i < n; i += m) {
> > > -		in_port_t src = sa_port(v6, mmh_recv[i].msg_hdr.msg_name);
> > > -
> > > -		for (m = 1; i + m < n; m++) {
> > > -			void *mname = mmh_recv[i + m].msg_hdr.msg_name;
> > > -			if (sa_port(v6, mname) != src)
> > > -				break;
> > > -		}
> > > -
> > > -		udp_splice_sendfrom(c, i, m, src, dst, v6, ref.r.p.udp.udp.ns,
> > > -				    ref.r.p.udp.udp.orig, now);
> > > -	}
> > > -}
> > > -
> > >  /**
> > >   * udp_update_hdr4() - Update headers for one IPv4 datagram
> > >   * @c:		Execution context
> > > @@ -945,27 +899,43 @@ void udp_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events,
> > >  {
> > >  	in_port_t dstport = ref.r.p.udp.udp.port;
> > >  	bool v6 = ref.r.p.udp.udp.v6;
> > > -	struct mmsghdr *sock_mmh;
> > > +	struct mmsghdr *mmh_recv;
> > > +	unsigned int i, m;
> > >  	ssize_t n;
> > >  
> > > -	if (events == EPOLLERR)
> > > +	if (!(events & EPOLLIN))  
> > 
> > Pre-existing, unrelated issue, but this reminds me: we don't handle
> > socket errors here, and while udp_timer_one() will drop any sockets we
> > created, eventually, it would probably be better to act right away.  
> 
> Ok... I'm not sure what, if anything, you would like me to do about
> it, however.

No no, sorry, nothing, I wanted to share in case you happen to touch
this again soon, or if you had thoughts about it. I can fix this
separately once you're done with changes for UDP.

-- 
Stefano


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 4/8] udp: Receive multiple datagrams at once on the pasta sock->tap path
  2022-12-14  1:42     ` David Gibson
@ 2022-12-14 10:35       ` Stefano Brivio
  2022-12-20 10:42         ` Stefano Brivio
  0 siblings, 1 reply; 28+ messages in thread
From: Stefano Brivio @ 2022-12-14 10:35 UTC (permalink / raw)
  To: David Gibson; +Cc: passt-dev

On Wed, 14 Dec 2022 12:42:14 +1100
David Gibson <david@gibson.dropbear.id.au> wrote:

> On Tue, Dec 13, 2022 at 11:48:47PM +0100, Stefano Brivio wrote:
> > Sorry for the long delay here,
> > 
> > On Mon,  5 Dec 2022 19:14:21 +1100
> > David Gibson <david@gibson.dropbear.id.au> wrote:
> >   
> > > Usually udp_sock_handler() will receive and forward multiple (up to 32)
> > > datagrams in udp_sock_handler(), then forward them all to the tap
> > > interface.  For unclear reasons, though, when in pasta mode we will only
> > > receive and forward a single datagram at a time.  Change it to receive
> > > multiple datagrams at once, like the other paths.  
> > 
> > This is explained in the commit message of 6c931118643c ("tcp, udp:
> > Receive batching doesn't pay off when writing single frames to tap").
> > 
> > I think it's worth re-checking the throughput now as this path is a bit
> > different, but unfortunately I didn't include this in the "perf" tests :(
> > because at the time I introduced those I wasn't sure it even made sense to
> > have traffic from the same host being directed to the tap device.
> > 
> > The iperf3 runs were I observed this are actually the ones from the Podman
> > demo. Ideally that case should be also checked in the perf/pasta_udp tests.  
> 
> Hm, ok.
> 
> > How fundamental is this for the rest of the series? I couldn't find any
> > actual dependency on this but I might be missing something.  
> 
> So the issue is that prior to this change in pasta we receive multiple
> frames at once on the splice path, but one frame at a time on the tap
> path.  By the end of this series we can't do that any more, because we
> don't know before the recvmmsg() which one we'll be doing.

Oh, right, I see. Then let me add this path to the perf/pasta_udp test
and check how relevant this is now, I'll get back to you in a bit.

-- 
Stefano


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 7/8] udp: Decide whether to "splice" per datagram rather than per socket
  2022-12-14  1:47     ` David Gibson
@ 2022-12-14 10:35       ` Stefano Brivio
  2022-12-15  0:33         ` David Gibson
  0 siblings, 1 reply; 28+ messages in thread
From: Stefano Brivio @ 2022-12-14 10:35 UTC (permalink / raw)
  To: David Gibson; +Cc: passt-dev

On Wed, 14 Dec 2022 12:47:25 +1100
David Gibson <david@gibson.dropbear.id.au> wrote:

> On Tue, Dec 13, 2022 at 11:49:18PM +0100, Stefano Brivio wrote:
> > On Mon,  5 Dec 2022 19:14:24 +1100
> > David Gibson <david@gibson.dropbear.id.au> wrote:
> >   
> > > Currently we have special sockets for receiving datagrams from locahost
> > > which can use the optimized "splice" path rather than going across the tap
> > > interface.
> > > 
> > > We want to loosen this so that sockets can receive sockets that will be
> > > forwarded by both the spliced and non-spliced paths.  To do this, we alter
> > > the meaning of the @splice bit in the reference to mean that packets
> > > receieved on this socket *can* be spliced, not that they *will* be spliced.
> > > They'll only actually be spliced if they come from 127.0.0.1 or ::1.
> > > 
> > > We can't (for now) remove the splice bit entirely, unlike with TCP.  Our
> > > gateway mapping means that if the ns initiates communication to the gw
> > > address, we'll translate that to target 127.0.0.1 on the host side.  Reply
> > > packets will therefore have source address 127.0.0.1 when received on the
> > > host, but these need to go via the tap path where that will be translated
> > > back to the gateway address.  We need the @splice bit to distinguish that
> > > case from packets going from localhost to a port mapped explicitly with
> > > -u which should be spliced.
> > > 
> > > Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
> > > ---
> > >  udp.c | 54 +++++++++++++++++++++++++++++++++++-------------------
> > >  udp.h |  2 +-
> > >  2 files changed, 36 insertions(+), 20 deletions(-)
> > > 
> > > diff --git a/udp.c b/udp.c
> > > index 6ccfe8c..011a157 100644
> > > --- a/udp.c
> > > +++ b/udp.c
> > > @@ -513,16 +513,27 @@ static int udp_splice_new_ns(void *arg)
> > >  }
> > >  
> > >  /**
> > > - * sa_port() - Determine port from a sockaddr_in or sockaddr_in6
> > > + * udp_mmh_splice_port() - Is source address of message suitable for splicing?
> > >   * @v6:		Is @sa a sockaddr_in6 (otherwise sockaddr_in)?
> > > - * @sa:		Pointer to either sockaddr_in or sockaddr_in6
> > > + * @mmh:	mmsghdr of incoming message
> > > + *
> > > + * Return: if @sa refers to localhost (127.0.0.1 or ::1) the port from
> > > + *         @sa, otherwise 0.
> > > + *
> > > + * NOTE: this relies on the fact that it's not valid to use UDP port 0  
> > 
> > The port is reserved by IANA indeed, but... it can actually be used. On
> > Linux, you can bind() it and you can connect() to it. As far as I can
> > tell from the new version of udp_sock_handler() we would actually
> > misdirect packets in that case.  
> 
> Hm, ok.  Given the IANA reservation, I think it would be acceptable to
> simply drop such packets - but if we were to make that choice we
> should do so explicitly, rather than misdirecting them.

Acceptable, sure, but... I don't know, it somehow doesn't look
desirable to me. The kernel doesn't enforce this, so I guess we
shouldn't either.

> > How bad would it be to use an int here?  
> 
> Pretty straightforward.  Just means we have to use the somewhat
> abtruse "if (port <= USHRT_MAX)" or "if (port >= 0)" or something
> instead of just "if (port)".  Should I go ahead and make that change?

Eh, I don't like it either, but... I guess it's better than the
alternative, so yes, thanks. Or pass port as a pointer, set on return.
I'm fine with both.

> > By the way, I think the comment should also mention that the port is
> > returned in host order.  
> 
> Ok, easily done.  Generally I try to keep the endianness associated
> with the type, rather than attempting to document it for each variable
> (or even worse, each point in time for each variable).

Yes, I see, and it's a more valid approach than mine, but still mine
comes almost for free.

By the way, I got distracted by this and I forgot about two things:

> +static in_port_t udp_mmh_splice_port(bool v6, const struct mmsghdr *mmh)
>  {
> -	const struct sockaddr_in6 *sa6 = sa;
> -	const struct sockaddr_in *sa4 = sa;
> +	const struct sockaddr_in6 *sa6 = mmh->msg_hdr.msg_name;
> +	const struct sockaddr_in *sa4 = mmh->msg_hdr.msg_name;;

Stray semicolon here.

> +
> +	if (v6 && IN6_IS_ADDR_LOOPBACK(&sa6->sin6_addr))
> +		return ntohs(sa6->sin6_port);
>  
> -	return v6 ? ntohs(sa6->sin6_port) : ntohs(sa4->sin_port);
> +	if (ntohl(sa4->sin_addr.s_addr) == INADDR_LOOPBACK)
> +		return ntohs(sa4->sin_port);

If it's IPv6, but not a loopback address, we'll check if
sa4->sin_addr.s_addr == INADDR_LOOPBACK -- which might actually be true for
an IPv6, non-loopback address.

Also, I think we can happily "splice" for any loopback address, not
just 127.0.0.1. What about something like:

	if (v6 && IN6_IS_ADDR_LOOPBACK(&sa6->sin6_addr))
		return ntohs(sa6->sin6_port);

	if (!v4 && IN4_IS_ADDR_LOOPBACK(&sa4->sin_addr))
		return ntohs(sa4->sin_port);

	return -1;

?

-- 
Stefano


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 7/8] udp: Decide whether to "splice" per datagram rather than per socket
  2022-12-14 10:35       ` Stefano Brivio
@ 2022-12-15  0:33         ` David Gibson
  0 siblings, 0 replies; 28+ messages in thread
From: David Gibson @ 2022-12-15  0:33 UTC (permalink / raw)
  To: Stefano Brivio; +Cc: passt-dev

[-- Attachment #1: Type: text/plain, Size: 6209 bytes --]

On Wed, Dec 14, 2022 at 11:35:54AM +0100, Stefano Brivio wrote:
> On Wed, 14 Dec 2022 12:47:25 +1100
> David Gibson <david@gibson.dropbear.id.au> wrote:
> 
> > On Tue, Dec 13, 2022 at 11:49:18PM +0100, Stefano Brivio wrote:
> > > On Mon,  5 Dec 2022 19:14:24 +1100
> > > David Gibson <david@gibson.dropbear.id.au> wrote:
> > >   
> > > > Currently we have special sockets for receiving datagrams from locahost
> > > > which can use the optimized "splice" path rather than going across the tap
> > > > interface.
> > > > 
> > > > We want to loosen this so that sockets can receive sockets that will be
> > > > forwarded by both the spliced and non-spliced paths.  To do this, we alter
> > > > the meaning of the @splice bit in the reference to mean that packets
> > > > receieved on this socket *can* be spliced, not that they *will* be spliced.
> > > > They'll only actually be spliced if they come from 127.0.0.1 or ::1.
> > > > 
> > > > We can't (for now) remove the splice bit entirely, unlike with TCP.  Our
> > > > gateway mapping means that if the ns initiates communication to the gw
> > > > address, we'll translate that to target 127.0.0.1 on the host side.  Reply
> > > > packets will therefore have source address 127.0.0.1 when received on the
> > > > host, but these need to go via the tap path where that will be translated
> > > > back to the gateway address.  We need the @splice bit to distinguish that
> > > > case from packets going from localhost to a port mapped explicitly with
> > > > -u which should be spliced.
> > > > 
> > > > Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
> > > > ---
> > > >  udp.c | 54 +++++++++++++++++++++++++++++++++++-------------------
> > > >  udp.h |  2 +-
> > > >  2 files changed, 36 insertions(+), 20 deletions(-)
> > > > 
> > > > diff --git a/udp.c b/udp.c
> > > > index 6ccfe8c..011a157 100644
> > > > --- a/udp.c
> > > > +++ b/udp.c
> > > > @@ -513,16 +513,27 @@ static int udp_splice_new_ns(void *arg)
> > > >  }
> > > >  
> > > >  /**
> > > > - * sa_port() - Determine port from a sockaddr_in or sockaddr_in6
> > > > + * udp_mmh_splice_port() - Is source address of message suitable for splicing?
> > > >   * @v6:		Is @sa a sockaddr_in6 (otherwise sockaddr_in)?
> > > > - * @sa:		Pointer to either sockaddr_in or sockaddr_in6
> > > > + * @mmh:	mmsghdr of incoming message
> > > > + *
> > > > + * Return: if @sa refers to localhost (127.0.0.1 or ::1) the port from
> > > > + *         @sa, otherwise 0.
> > > > + *
> > > > + * NOTE: this relies on the fact that it's not valid to use UDP port 0  
> > > 
> > > The port is reserved by IANA indeed, but... it can actually be used. On
> > > Linux, you can bind() it and you can connect() to it. As far as I can
> > > tell from the new version of udp_sock_handler() we would actually
> > > misdirect packets in that case.  
> > 
> > Hm, ok.  Given the IANA reservation, I think it would be acceptable to
> > simply drop such packets - but if we were to make that choice we
> > should do so explicitly, rather than misdirecting them.
> 
> Acceptable, sure, but... I don't know, it somehow doesn't look
> desirable to me. The kernel doesn't enforce this, so I guess we
> shouldn't either.
> 
> > > How bad would it be to use an int here?  
> > 
> > Pretty straightforward.  Just means we have to use the somewhat
> > abtruse "if (port <= USHRT_MAX)" or "if (port >= 0)" or something
> > instead of just "if (port)".  Should I go ahead and make that change?
> 
> Eh, I don't like it either, but... I guess it's better than the
> alternative, so yes, thanks. Or pass port as a pointer, set on return.
> I'm fine with both.

I think the int is less ugly than the pointer.  Ok, I'll make that change.

> > > By the way, I think the comment should also mention that the port is
> > > returned in host order.  
> > 
> > Ok, easily done.  Generally I try to keep the endianness associated
> > with the type, rather than attempting to document it for each variable
> > (or even worse, each point in time for each variable).
> 
> Yes, I see, and it's a more valid approach than mine, but still mine
> comes almost for free.

Right.  Actually rereading the function in question, it specifically
says "port from @sa" and in the sockaddr, of course, it's network
endian, so it could particularly do with clarification in this case.

> By the way, I got distracted by this and I forgot about two things:
> 
> > +static in_port_t udp_mmh_splice_port(bool v6, const struct mmsghdr *mmh)
> >  {
> > -	const struct sockaddr_in6 *sa6 = sa;
> > -	const struct sockaddr_in *sa4 = sa;
> > +	const struct sockaddr_in6 *sa6 = mmh->msg_hdr.msg_name;
> > +	const struct sockaddr_in *sa4 = mmh->msg_hdr.msg_name;;
> 
> Stray semicolon here.

Fixed.

> > +
> > +	if (v6 && IN6_IS_ADDR_LOOPBACK(&sa6->sin6_addr))
> > +		return ntohs(sa6->sin6_port);
> >  
> > -	return v6 ? ntohs(sa6->sin6_port) : ntohs(sa4->sin_port);
> > +	if (ntohl(sa4->sin_addr.s_addr) == INADDR_LOOPBACK)
> > +		return ntohs(sa4->sin_port);
> 
> If it's IPv6, but not a loopback address, we'll check if
> sa4->sin_addr.s_addr == INADDR_LOOPBACK -- which might actually be true for
> an IPv6, non-loopback address.

Oops, yes, that's definitely wrong, and wrong enough to respin.

> Also, I think we can happily "splice" for any loopback address, not
> just 127.0.0.1. What about something like:

Well.. yes and no.  Yes, we can deliver packets in this way, but we'll
lost track of the original from address, so reply packets will be
misdirected.

Hrm.. ISTR you mentioned some cases that worked despite that, so I'll
make the change, and hope to fix it up better when I get to the NAT /
splice semantics rework.

> 	if (v6 && IN6_IS_ADDR_LOOPBACK(&sa6->sin6_addr))
> 		return ntohs(sa6->sin6_port);
> 
> 	if (!v4 && IN4_IS_ADDR_LOOPBACK(&sa4->sin_addr))
> 		return ntohs(sa4->sin_port);
> 
> 	return -1;
> 
> ?
> 

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 4/8] udp: Receive multiple datagrams at once on the pasta sock->tap path
  2022-12-14 10:35       ` Stefano Brivio
@ 2022-12-20 10:42         ` Stefano Brivio
  2022-12-21  6:00           ` David Gibson
  0 siblings, 1 reply; 28+ messages in thread
From: Stefano Brivio @ 2022-12-20 10:42 UTC (permalink / raw)
  To: David Gibson; +Cc: passt-dev

Sorry for the further delay,

On Wed, 14 Dec 2022 11:35:46 +0100
Stefano Brivio <sbrivio@redhat.com> wrote:

> On Wed, 14 Dec 2022 12:42:14 +1100
> David Gibson <david@gibson.dropbear.id.au> wrote:
> 
> > On Tue, Dec 13, 2022 at 11:48:47PM +0100, Stefano Brivio wrote:  
> > > Sorry for the long delay here,
> > > 
> > > On Mon,  5 Dec 2022 19:14:21 +1100
> > > David Gibson <david@gibson.dropbear.id.au> wrote:
> > >     
> > > > Usually udp_sock_handler() will receive and forward multiple (up to 32)
> > > > datagrams in udp_sock_handler(), then forward them all to the tap
> > > > interface.  For unclear reasons, though, when in pasta mode we will only
> > > > receive and forward a single datagram at a time.  Change it to receive
> > > > multiple datagrams at once, like the other paths.    
> > > 
> > > This is explained in the commit message of 6c931118643c ("tcp, udp:
> > > Receive batching doesn't pay off when writing single frames to tap").
> > > 
> > > I think it's worth re-checking the throughput now as this path is a bit
> > > different, but unfortunately I didn't include this in the "perf" tests :(
> > > because at the time I introduced those I wasn't sure it even made sense to
> > > have traffic from the same host being directed to the tap device.
> > > 
> > > The iperf3 runs were I observed this are actually the ones from the Podman
> > > demo. Ideally that case should be also checked in the perf/pasta_udp tests.    
> > 
> > Hm, ok.
> >   
> > > How fundamental is this for the rest of the series? I couldn't find any
> > > actual dependency on this but I might be missing something.    
> > 
> > So the issue is that prior to this change in pasta we receive multiple
> > frames at once on the splice path, but one frame at a time on the tap
> > path.  By the end of this series we can't do that any more, because we
> > don't know before the recvmmsg() which one we'll be doing.  
> 
> Oh, right, I see. Then let me add this path to the perf/pasta_udp test
> and check how relevant this is now, I'll get back to you in a bit.

I was checking the wrong path. With this:

--
diff --git a/test/perf/pasta_udp b/test/perf/pasta_udp
index 27ea724..973c2f4 100644
--- a/test/perf/pasta_udp
+++ b/test/perf/pasta_udp
@@ -31,6 +31,14 @@ report	pasta lo_udp 1 __FREQ__
 
 th	MTU 1500B 4000B 16384B 65535B
 
+tr	UDP throughput over IPv6: host to ns
+nsout	IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
+nsout	ADDR6 ip -j -6 addr show|jq -rM '.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global" and .prefixlen == 64).local'
+bw	-
+bw	-
+bw	-
+iperf3	BW host ns __ADDR6__ 100${i}2 __THREADS__ __TIME__ __OPTS__ -b 15G
+bw	__BW__ 7.0 9.0
 
 tr	UDP throughput over IPv6: ns to host
 ns	ip link set dev lo mtu 1500
diff --git a/test/run b/test/run
index e07513f..b53182b 100755
--- a/test/run
+++ b/test/run
@@ -67,6 +67,14 @@ run() {
 	test build/clang_tidy
 	teardown build
 
+	VALGRIND=0
+	setup passt_in_ns
+	test passt/ndp
+	test passt/dhcp
+	test perf/pasta_udp
+	test passt_in_ns/shutdown
+	teardown passt_in_ns
+
 	setup pasta
 	test pasta/ndp
 	test pasta/dhcp
--

I get 21.6 gbps after this series, and 29.7 gbps before -- it's quite
significant.

And there's nothing strange in perf's output, really, the distribution
of overhead per functions is pretty much the same, but writing multiple
messages to the tap device just takes more cycles per message compared
to a single message.

I'm a bit ashamed to propose this, but do you think about something
like:

	if (c->mode == MODE_PASTA) {
		if (recvmmsg(ref.r.s, mmh_recv, 1, 0, NULL) <= 0)
			return;

		if (udp_mmh_splice_port(v6, mmh_recv)) {
			n = recvmmsg(ref.r.s, mmh_recv + 1,
				     UDP_MAX_FRAMES - 1, 0, NULL);
		}

		if (n > 0)
			n++;
		else
			n = 1;
	} else {
		n = recvmmsg(ref.r.s, mmh_recv, UDP_MAX_FRAMES, 0, NULL);
		if (n <= 0)
			return;
	}

? Other than the inherent ugliness, it looks like a good approximation
to me.

-- 
@@ -67,6 +67,14 @@ run() {
 	test build/clang_tidy
 	teardown build
 
+	VALGRIND=0
+	setup passt_in_ns
+	test passt/ndp
+	test passt/dhcp
+	test perf/pasta_udp
+	test passt_in_ns/shutdown
+	teardown passt_in_ns
+
 	setup pasta
 	test pasta/ndp
 	test pasta/dhcp
--

I get 21.6 gbps after this series, and 29.7 gbps before -- it's quite
significant.

And there's nothing strange in perf's output, really, the distribution
of overhead per functions is pretty much the same, but writing multiple
messages to the tap device just takes more cycles per message compared
to a single message.

I'm a bit ashamed to propose this, but do you think about something
like:

	if (c->mode == MODE_PASTA) {
		if (recvmmsg(ref.r.s, mmh_recv, 1, 0, NULL) <= 0)
			return;

		if (udp_mmh_splice_port(v6, mmh_recv)) {
			n = recvmmsg(ref.r.s, mmh_recv + 1,
				     UDP_MAX_FRAMES - 1, 0, NULL);
		}

		if (n > 0)
			n++;
		else
			n = 1;
	} else {
		n = recvmmsg(ref.r.s, mmh_recv, UDP_MAX_FRAMES, 0, NULL);
		if (n <= 0)
			return;
	}

? Other than the inherent ugliness, it looks like a good approximation
to me.

-- 
Stefano


^ permalink raw reply related	[flat|nested] 28+ messages in thread

* Re: [PATCH 4/8] udp: Receive multiple datagrams at once on the pasta sock->tap path
  2022-12-20 10:42         ` Stefano Brivio
@ 2022-12-21  6:00           ` David Gibson
  2022-12-22  2:37             ` Stefano Brivio
  2023-01-04  0:08             ` Stefano Brivio
  0 siblings, 2 replies; 28+ messages in thread
From: David Gibson @ 2022-12-21  6:00 UTC (permalink / raw)
  To: Stefano Brivio; +Cc: passt-dev

[-- Attachment #1: Type: text/plain, Size: 5249 bytes --]

On Tue, Dec 20, 2022 at 11:42:46AM +0100, Stefano Brivio wrote:
> Sorry for the further delay,
> 
> On Wed, 14 Dec 2022 11:35:46 +0100
> Stefano Brivio <sbrivio@redhat.com> wrote:
> 
> > On Wed, 14 Dec 2022 12:42:14 +1100
> > David Gibson <david@gibson.dropbear.id.au> wrote:
> > 
> > > On Tue, Dec 13, 2022 at 11:48:47PM +0100, Stefano Brivio wrote:  
> > > > Sorry for the long delay here,
> > > > 
> > > > On Mon,  5 Dec 2022 19:14:21 +1100
> > > > David Gibson <david@gibson.dropbear.id.au> wrote:
> > > >     
> > > > > Usually udp_sock_handler() will receive and forward multiple (up to 32)
> > > > > datagrams in udp_sock_handler(), then forward them all to the tap
> > > > > interface.  For unclear reasons, though, when in pasta mode we will only
> > > > > receive and forward a single datagram at a time.  Change it to receive
> > > > > multiple datagrams at once, like the other paths.    
> > > > 
> > > > This is explained in the commit message of 6c931118643c ("tcp, udp:
> > > > Receive batching doesn't pay off when writing single frames to tap").
> > > > 
> > > > I think it's worth re-checking the throughput now as this path is a bit
> > > > different, but unfortunately I didn't include this in the "perf" tests :(
> > > > because at the time I introduced those I wasn't sure it even made sense to
> > > > have traffic from the same host being directed to the tap device.
> > > > 
> > > > The iperf3 runs were I observed this are actually the ones from the Podman
> > > > demo. Ideally that case should be also checked in the perf/pasta_udp tests.    
> > > 
> > > Hm, ok.
> > >   
> > > > How fundamental is this for the rest of the series? I couldn't find any
> > > > actual dependency on this but I might be missing something.    
> > > 
> > > So the issue is that prior to this change in pasta we receive multiple
> > > frames at once on the splice path, but one frame at a time on the tap
> > > path.  By the end of this series we can't do that any more, because we
> > > don't know before the recvmmsg() which one we'll be doing.  
> > 
> > Oh, right, I see. Then let me add this path to the perf/pasta_udp test
> > and check how relevant this is now, I'll get back to you in a bit.
> 
> I was checking the wrong path. With this:
> 
> diff --git a/test/perf/pasta_udp b/test/perf/pasta_udp
> index 27ea724..973c2f4 100644
> --- a/test/perf/pasta_udp
> +++ b/test/perf/pasta_udp
> @@ -31,6 +31,14 @@ report	pasta lo_udp 1 __FREQ__
>  
>  th	MTU 1500B 4000B 16384B 65535B
>  
> +tr	UDP throughput over IPv6: host to ns
> +nsout	IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
> +nsout	ADDR6 ip -j -6 addr show|jq -rM '.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global" and .prefixlen == 64).local'
> +bw	-
> +bw	-
> +bw	-
> +iperf3	BW host ns __ADDR6__ 100${i}2 __THREADS__ __TIME__ __OPTS__ -b 15G
> +bw	__BW__ 7.0 9.0
>  
>  tr	UDP throughput over IPv6: ns to host
>  ns	ip link set dev lo mtu 1500
> diff --git a/test/run b/test/run
> index e07513f..b53182b 100755
> --- a/test/run
> +++ b/test/run
> @@ -67,6 +67,14 @@ run() {
>  	test build/clang_tidy
>  	teardown build
>  
> +	VALGRIND=0
> +	setup passt_in_ns
> +	test passt/ndp
> +	test passt/dhcp
> +	test perf/pasta_udp
> +	test passt_in_ns/shutdown
> +	teardown passt_in_ns
> +
>  	setup pasta
>  	test pasta/ndp
>  	test pasta/dhcp

Ah, ok.  Can we add that to the standard set of tests ASAP, please.

> I get 21.6 gbps after this series, and 29.7 gbps before -- it's quite
> significant.

Drat.

> And there's nothing strange in perf's output, really, the distribution
> of overhead per functions is pretty much the same, but writing multiple
> messages to the tap device just takes more cycles per message compared
> to a single message.

That's so weird.  It should be basically an identical set of write()s,
except that they happen in a batch, rather than a bit spread out.  I
guess it has to be some kind of cache locality thing.  I wonder if the
difference would go away or reverse if we had a way to submit multiple
frames with a single syscall.

> I'm a bit ashamed to propose this, but do you think about something
> like:

> 	if (c->mode == MODE_PASTA) { if (recvmmsg(ref.r.s, mmh_recv,
> 		1, 0, NULL) <= 0) return;

> 		if (udp_mmh_splice_port(v6, mmh_recv)) { n =
> 			recvmmsg(ref.r.s, mmh_recv + 1, UDP_MAX_FRAMES
> 			- 1, 0, NULL); }

> 		if (n > 0) n++; else n = 1; } else { n =
> 			recvmmsg(ref.r.s, mmh_recv, UDP_MAX_FRAMES, 0,
> 			NULL); if (n <= 0) return; }

> ? Other than the inherent ugliness, it looks like a good
> approximation to me.

Hmm.  Well, the first question is how much impact does going 1 message
at a time have on the spliced throughput.  If it's not too bad, then
we could just always go one at a time for pasta, regardless of
splicing.  And we could even abstract that difference into the tap
backend with a callback like tap_batch_size(c).

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 4/8] udp: Receive multiple datagrams at once on the pasta sock->tap path
  2022-12-21  6:00           ` David Gibson
@ 2022-12-22  2:37             ` Stefano Brivio
  2023-01-04  0:08             ` Stefano Brivio
  1 sibling, 0 replies; 28+ messages in thread
From: Stefano Brivio @ 2022-12-22  2:37 UTC (permalink / raw)
  To: David Gibson; +Cc: passt-dev

On Wed, 21 Dec 2022 17:00:24 +1100
David Gibson <david@gibson.dropbear.id.au> wrote:

> On Tue, Dec 20, 2022 at 11:42:46AM +0100, Stefano Brivio wrote:
> > Sorry for the further delay,
> > 
> > On Wed, 14 Dec 2022 11:35:46 +0100
> > Stefano Brivio <sbrivio@redhat.com> wrote:
> >   
> > > On Wed, 14 Dec 2022 12:42:14 +1100
> > > David Gibson <david@gibson.dropbear.id.au> wrote:
> > >   
> > > > On Tue, Dec 13, 2022 at 11:48:47PM +0100, Stefano Brivio wrote:    
> > > > > Sorry for the long delay here,
> > > > > 
> > > > > On Mon,  5 Dec 2022 19:14:21 +1100
> > > > > David Gibson <david@gibson.dropbear.id.au> wrote:
> > > > >       
> > > > > > Usually udp_sock_handler() will receive and forward multiple (up to 32)
> > > > > > datagrams in udp_sock_handler(), then forward them all to the tap
> > > > > > interface.  For unclear reasons, though, when in pasta mode we will only
> > > > > > receive and forward a single datagram at a time.  Change it to receive
> > > > > > multiple datagrams at once, like the other paths.      
> > > > > 
> > > > > This is explained in the commit message of 6c931118643c ("tcp, udp:
> > > > > Receive batching doesn't pay off when writing single frames to tap").
> > > > > 
> > > > > I think it's worth re-checking the throughput now as this path is a bit
> > > > > different, but unfortunately I didn't include this in the "perf" tests :(
> > > > > because at the time I introduced those I wasn't sure it even made sense to
> > > > > have traffic from the same host being directed to the tap device.
> > > > > 
> > > > > The iperf3 runs were I observed this are actually the ones from the Podman
> > > > > demo. Ideally that case should be also checked in the perf/pasta_udp tests.      
> > > > 
> > > > Hm, ok.
> > > >     
> > > > > How fundamental is this for the rest of the series? I couldn't find any
> > > > > actual dependency on this but I might be missing something.      
> > > > 
> > > > So the issue is that prior to this change in pasta we receive multiple
> > > > frames at once on the splice path, but one frame at a time on the tap
> > > > path.  By the end of this series we can't do that any more, because we
> > > > don't know before the recvmmsg() which one we'll be doing.    
> > > 
> > > Oh, right, I see. Then let me add this path to the perf/pasta_udp test
> > > and check how relevant this is now, I'll get back to you in a bit.  
> > 
> > I was checking the wrong path. With this:
> > 
> > diff --git a/test/perf/pasta_udp b/test/perf/pasta_udp
> > index 27ea724..973c2f4 100644
> > --- a/test/perf/pasta_udp
> > +++ b/test/perf/pasta_udp
> > @@ -31,6 +31,14 @@ report	pasta lo_udp 1 __FREQ__
> >  
> >  th	MTU 1500B 4000B 16384B 65535B
> >  
> > +tr	UDP throughput over IPv6: host to ns
> > +nsout	IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
> > +nsout	ADDR6 ip -j -6 addr show|jq -rM '.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global" and .prefixlen == 64).local'
> > +bw	-
> > +bw	-
> > +bw	-
> > +iperf3	BW host ns __ADDR6__ 100${i}2 __THREADS__ __TIME__ __OPTS__ -b 15G
> > +bw	__BW__ 7.0 9.0
> >  
> >  tr	UDP throughput over IPv6: ns to host
> >  ns	ip link set dev lo mtu 1500
> > diff --git a/test/run b/test/run
> > index e07513f..b53182b 100755
> > --- a/test/run
> > +++ b/test/run
> > @@ -67,6 +67,14 @@ run() {
> >  	test build/clang_tidy
> >  	teardown build
> >  
> > +	VALGRIND=0
> > +	setup passt_in_ns
> > +	test passt/ndp
> > +	test passt/dhcp
> > +	test perf/pasta_udp
> > +	test passt_in_ns/shutdown
> > +	teardown passt_in_ns
> > +
> >  	setup pasta
> >  	test pasta/ndp
> >  	test pasta/dhcp  
> 
> Ah, ok.  Can we add that to the standard set of tests ASAP, please.

Yes -- that part itself was easy, but now I'm fighting against my own
finest write-only code that generates the JavaScript snippet for the
performance report (perf_fill_lines() in test/lib/perf_report -- and
this is not a suggestion to have a look at it ;)).

I'm trying to rework it a bit together with the "new" test.

> > I get 21.6 gbps after this series, and 29.7 gbps before -- it's quite
> > significant.  
> 
> Drat.
> 
> > And there's nothing strange in perf's output, really, the distribution
> > of overhead per functions is pretty much the same, but writing multiple
> > messages to the tap device just takes more cycles per message compared
> > to a single message.  
> 
> That's so weird.  It should be basically an identical set of write()s,
> except that they happen in a batch, rather than a bit spread out.  I
> guess it has to be some kind of cache locality thing.  I wonder if the
> difference would go away or reverse if we had a way to submit multiple
> frames with a single syscall.

I haven't tried, but to test this, I think we could actually just write
multiple frames in a single call, with subsequent headers and
everything, and the iperf3 server will simply report how many bytes it
received.

> > I'm a bit ashamed to propose this, but do you think about something
> > like:  
> 
> > 	if (c->mode == MODE_PASTA) { if (recvmmsg(ref.r.s, mmh_recv,
> > 		1, 0, NULL) <= 0) return;  
> 
> > 		if (udp_mmh_splice_port(v6, mmh_recv)) { n =
> > 			recvmmsg(ref.r.s, mmh_recv + 1, UDP_MAX_FRAMES
> > 			- 1, 0, NULL); }  
> 
> > 		if (n > 0) n++; else n = 1; } else { n =
> > 			recvmmsg(ref.r.s, mmh_recv, UDP_MAX_FRAMES, 0,
> > 			NULL); if (n <= 0) return; }  
> 
> > ? Other than the inherent ugliness, it looks like a good
> > approximation to me.  
> 
> Hmm.  Well, the first question is how much impact does going 1 message
> at a time have on the spliced throughput.  If it's not too bad, then
> we could just always go one at a time for pasta, regardless of
> splicing.  And we could even abstract that difference into the tap
> backend with a callback like tap_batch_size(c).

Right... it used to be significantly worse in the "spliced" case, I
checked that when I did that commit to use 1 instead of UDP_MAX_FRAME
in the other case, but I don't have data. I'll test this again.

-- 
Stefano


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 4/8] udp: Receive multiple datagrams at once on the pasta sock->tap path
  2022-12-21  6:00           ` David Gibson
  2022-12-22  2:37             ` Stefano Brivio
@ 2023-01-04  0:08             ` Stefano Brivio
  2023-01-04  4:53               ` David Gibson
  1 sibling, 1 reply; 28+ messages in thread
From: Stefano Brivio @ 2023-01-04  0:08 UTC (permalink / raw)
  To: David Gibson; +Cc: passt-dev

On Wed, 21 Dec 2022 17:00:24 +1100
David Gibson <david@gibson.dropbear.id.au> wrote:

> On Tue, Dec 20, 2022 at 11:42:46AM +0100, Stefano Brivio wrote:
> > Sorry for the further delay,
> > 
> > On Wed, 14 Dec 2022 11:35:46 +0100
> > Stefano Brivio <sbrivio@redhat.com> wrote:
> >   
> > > On Wed, 14 Dec 2022 12:42:14 +1100
> > > David Gibson <david@gibson.dropbear.id.au> wrote:
> > >   
> > > > On Tue, Dec 13, 2022 at 11:48:47PM +0100, Stefano Brivio wrote:    
> > > > > Sorry for the long delay here,
> > > > > 
> > > > > On Mon,  5 Dec 2022 19:14:21 +1100
> > > > > David Gibson <david@gibson.dropbear.id.au> wrote:
> > > > >       
> > > > > > Usually udp_sock_handler() will receive and forward multiple (up to 32)
> > > > > > datagrams in udp_sock_handler(), then forward them all to the tap
> > > > > > interface.  For unclear reasons, though, when in pasta mode we will only
> > > > > > receive and forward a single datagram at a time.  Change it to receive
> > > > > > multiple datagrams at once, like the other paths.      
> > > > > 
> > > > > This is explained in the commit message of 6c931118643c ("tcp, udp:
> > > > > Receive batching doesn't pay off when writing single frames to tap").
> > > > > 
> > > > > I think it's worth re-checking the throughput now as this path is a bit
> > > > > different, but unfortunately I didn't include this in the "perf" tests :(
> > > > > because at the time I introduced those I wasn't sure it even made sense to
> > > > > have traffic from the same host being directed to the tap device.
> > > > > 
> > > > > The iperf3 runs were I observed this are actually the ones from the Podman
> > > > > demo. Ideally that case should be also checked in the perf/pasta_udp tests.      
> > > > 
> > > > Hm, ok.
> > > >     
> > > > > How fundamental is this for the rest of the series? I couldn't find any
> > > > > actual dependency on this but I might be missing something.      
> > > > 
> > > > So the issue is that prior to this change in pasta we receive multiple
> > > > frames at once on the splice path, but one frame at a time on the tap
> > > > path.  By the end of this series we can't do that any more, because we
> > > > don't know before the recvmmsg() which one we'll be doing.    
> > > 
> > > Oh, right, I see. Then let me add this path to the perf/pasta_udp test
> > > and check how relevant this is now, I'll get back to you in a bit.  
> > 
> > I was checking the wrong path. With this:
> > 
> > diff --git a/test/perf/pasta_udp b/test/perf/pasta_udp
> > index 27ea724..973c2f4 100644
> > --- a/test/perf/pasta_udp
> > +++ b/test/perf/pasta_udp
> > @@ -31,6 +31,14 @@ report	pasta lo_udp 1 __FREQ__
> >  
> >  th	MTU 1500B 4000B 16384B 65535B
> >  
> > +tr	UDP throughput over IPv6: host to ns
> > +nsout	IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
> > +nsout	ADDR6 ip -j -6 addr show|jq -rM '.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global" and .prefixlen == 64).local'
> > +bw	-
> > +bw	-
> > +bw	-
> > +iperf3	BW host ns __ADDR6__ 100${i}2 __THREADS__ __TIME__ __OPTS__ -b 15G
> > +bw	__BW__ 7.0 9.0
> >  
> >  tr	UDP throughput over IPv6: ns to host
> >  ns	ip link set dev lo mtu 1500
> > diff --git a/test/run b/test/run
> > index e07513f..b53182b 100755
> > --- a/test/run
> > +++ b/test/run
> > @@ -67,6 +67,14 @@ run() {
> >  	test build/clang_tidy
> >  	teardown build
> >  
> > +	VALGRIND=0
> > +	setup passt_in_ns
> > +	test passt/ndp
> > +	test passt/dhcp
> > +	test perf/pasta_udp
> > +	test passt_in_ns/shutdown
> > +	teardown passt_in_ns
> > +
> >  	setup pasta
> >  	test pasta/ndp
> >  	test pasta/dhcp  
> 
> Ah, ok.  Can we add that to the standard set of tests ASAP, please.
> 
> > I get 21.6 gbps after this series, and 29.7 gbps before -- it's quite
> > significant.  
> 
> Drat.
> 
> > And there's nothing strange in perf's output, really, the distribution
> > of overhead per functions is pretty much the same, but writing multiple
> > messages to the tap device just takes more cycles per message compared
> > to a single message.  
> 
> That's so weird.  It should be basically an identical set of write()s,
> except that they happen in a batch, rather than a bit spread out.  I
> guess it has to be some kind of cache locality thing.  I wonder if the
> difference would go away or reverse if we had a way to submit multiple
> frames with a single syscall.
> 
> > I'm a bit ashamed to propose this, but do you think about something
> > like:  
> 
> > 	if (c->mode == MODE_PASTA) { if (recvmmsg(ref.r.s, mmh_recv,
> > 		1, 0, NULL) <= 0) return;  
> 
> > 		if (udp_mmh_splice_port(v6, mmh_recv)) { n =
> > 			recvmmsg(ref.r.s, mmh_recv + 1, UDP_MAX_FRAMES
> > 			- 1, 0, NULL); }  
> 
> > 		if (n > 0) n++; else n = 1; } else { n =
> > 			recvmmsg(ref.r.s, mmh_recv, UDP_MAX_FRAMES, 0,
> > 			NULL); if (n <= 0) return; }  
> 
> > ? Other than the inherent ugliness, it looks like a good
> > approximation to me.  
> 
> Hmm.  Well, the first question is how much impact does going 1 message
> at a time have on the spliced throughput.  If it's not too bad, then
> we could just always go one at a time for pasta, regardless of
> splicing.  And we could even abstract that difference into the tap
> backend with a callback like tap_batch_size(c).

So, finally I had the chance to try this out.

First off, baseline with the patch adding the new tests I just sent,
and the series you posted:

=== perf/pasta_udp
> pasta: throughput and latency (local traffic)
Throughput in Gbps, latency in µs, one thread at 3.6 GHz, 4 streams
                                                         MTU: |  1500B |  4000B | 16384B | 65535B |
                                                              |--------|--------|--------|--------|
             UDP throughput over IPv6: ns to host             |    4.4 |    8.5 |   19.5 |   23.0 |
             UDP RR latency over IPv6: ns to host             |      - |      - |      - |     27 |
                                                              |--------|--------|--------|--------|
             UDP throughput over IPv4: ns to host             |    4.3 |    8.8 |   18.5 |   24.4 |
             UDP RR latency over IPv4: ns to host             |      - |      - |      - |     26 |
                                                              |--------|--------|--------|--------|
             UDP throughput over IPv6: host to ns             |      - |      - |      - |   22.5 |
             UDP RR latency over IPv6: host to ns             |      - |      - |      - |     30 |
                                                              |--------|--------|--------|--------|
             UDP throughput over IPv4: host to ns             |      - |      - |      - |   24.5 |
             UDP RR latency over IPv4: host to ns             |      - |      - |      - |     25 |
                                                              '--------'--------'--------'--------'
...passed.

> pasta: throughput and latency (traffic via tap)
Throughput in Gbps, latency in µs, one thread at 3.6 GHz, 4 streams
                                                         MTU: |  1500B |  4000B | 16384B | 65520B |
                                                              |--------|--------|--------|--------|
             UDP throughput over IPv6: ns to host             |    4.4 |   10.4 |   16.0 |   23.4 |
             UDP RR latency over IPv6: ns to host             |      - |      - |      - |     27 |
                                                              |--------|--------|--------|--------|
             UDP throughput over IPv4: ns to host             |    5.2 |   10.8 |   16.0 |   24.0 |
             UDP RR latency over IPv4: ns to host             |      - |      - |      - |     28 |
                                                              |--------|--------|--------|--------|
             UDP throughput over IPv6: host to ns             |      - |      - |      - |   21.5 |
             UDP RR latency over IPv6: host to ns             |      - |      - |      - |     29 |
                                                              |--------|--------|--------|--------|
             UDP throughput over IPv4: host to ns             |      - |      - |      - |   26.3 |
             UDP RR latency over IPv4: host to ns             |      - |      - |      - |     26 |
                                                              '--------'--------'--------'--------'

which seems to indicate the whole "splicing" thing is pretty much
useless, for UDP (except for that 16 KiB MTU case, but I wonder how
relevant that is).

If I set UDP_MAX_FRAMES to 1, with a quick workaround for the resulting
warning in udp_tap_send() (single frame to send, hence single message),
it gets somewhat weird:

=== perf/pasta_udp
> pasta: throughput and latency (local traffic)
Throughput in Gbps, latency in µs, one thread at 3.6 GHz, 4 streams
                                                         MTU: |  1500B |  4000B | 16384B | 65535B |
                                                              |--------|--------|--------|--------|
             UDP throughput over IPv6: ns to host             |    3.4 |    7.0 |   21.6 |   31.6 |
             UDP RR latency over IPv6: ns to host             |      - |      - |      - |     30 |
                                                              |--------|--------|--------|--------|
             UDP throughput over IPv4: ns to host             |    3.8 |    7.0 |   22.0 |   32.4 |
             UDP RR latency over IPv4: ns to host             |      - |      - |      - |     26 |
                                                              |--------|--------|--------|--------|
             UDP throughput over IPv6: host to ns             |      - |      - |      - |   29.3 |
             UDP RR latency over IPv6: host to ns             |      - |      - |      - |     31 |
                                                              |--------|--------|--------|--------|
             UDP throughput over IPv4: host to ns             |      - |      - |      - |   33.8 |
             UDP RR latency over IPv4: host to ns             |      - |      - |      - |     25 |
                                                              '--------'--------'--------'--------'
...passed.

> pasta: throughput and latency (traffic via tap)
Throughput in Gbps, latency in µs, one thread at 3.6 GHz, 4 streams
                                                         MTU: |  1500B |  4000B | 16384B | 65520B |
                                                              |--------|--------|--------|--------|
             UDP throughput over IPv6: ns to host             |    4.7 |   10.3 |   16.0 |   24.0 |
             UDP RR latency over IPv6: ns to host             |      - |      - |      - |     27 |
                                                              |--------|--------|--------|--------|
             UDP throughput over IPv4: ns to host             |    5.6 |   11.4 |   16.0 |   24.0 |
             UDP RR latency over IPv4: ns to host             |      - |      - |      - |     26 |
                                                              |--------|--------|--------|--------|
             UDP throughput over IPv6: host to ns             |      - |      - |      - |   21.5 |
             UDP RR latency over IPv6: host to ns             |      - |      - |      - |     29 |
                                                              |--------|--------|--------|--------|
             UDP throughput over IPv4: host to ns             |      - |      - |      - |   28.7 |
             UDP RR latency over IPv4: host to ns             |      - |      - |      - |     29 |
                                                              '--------'--------'--------'--------'

...except for the cases with low MTUs, throughput is significantly
higher if we read and send one message at a time on the "spliced" path.

Next, I would like to:

- bisect between 32 and 1 for UDP_MAX_FRAMES: maybe 32 affects data
  locality too much, but some lower value would still be beneficial by
  lowering syscall overhead

- try with sendmsg() instead of sendmmsg(), at this point. Looking at
  the kernel, that doesn't seem to make a real difference.

About this series: should we just go ahead and apply it with
UDP_MAX_FRAMES set to 1 for the moment being? It's anyway better than
the existing situation.

-- 
Stefano


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 4/8] udp: Receive multiple datagrams at once on the pasta sock->tap path
  2023-01-04  0:08             ` Stefano Brivio
@ 2023-01-04  4:53               ` David Gibson
  0 siblings, 0 replies; 28+ messages in thread
From: David Gibson @ 2023-01-04  4:53 UTC (permalink / raw)
  To: Stefano Brivio; +Cc: passt-dev

[-- Attachment #1: Type: text/plain, Size: 13889 bytes --]

On Wed, Jan 04, 2023 at 01:08:52AM +0100, Stefano Brivio wrote:
> On Wed, 21 Dec 2022 17:00:24 +1100
> David Gibson <david@gibson.dropbear.id.au> wrote:
> 
> > On Tue, Dec 20, 2022 at 11:42:46AM +0100, Stefano Brivio wrote:
> > > Sorry for the further delay,
> > > 
> > > On Wed, 14 Dec 2022 11:35:46 +0100
> > > Stefano Brivio <sbrivio@redhat.com> wrote:
> > >   
> > > > On Wed, 14 Dec 2022 12:42:14 +1100
> > > > David Gibson <david@gibson.dropbear.id.au> wrote:
> > > >   
> > > > > On Tue, Dec 13, 2022 at 11:48:47PM +0100, Stefano Brivio wrote:    
> > > > > > Sorry for the long delay here,
> > > > > > 
> > > > > > On Mon,  5 Dec 2022 19:14:21 +1100
> > > > > > David Gibson <david@gibson.dropbear.id.au> wrote:
> > > > > >       
> > > > > > > Usually udp_sock_handler() will receive and forward multiple (up to 32)
> > > > > > > datagrams in udp_sock_handler(), then forward them all to the tap
> > > > > > > interface.  For unclear reasons, though, when in pasta mode we will only
> > > > > > > receive and forward a single datagram at a time.  Change it to receive
> > > > > > > multiple datagrams at once, like the other paths.      
> > > > > > 
> > > > > > This is explained in the commit message of 6c931118643c ("tcp, udp:
> > > > > > Receive batching doesn't pay off when writing single frames to tap").
> > > > > > 
> > > > > > I think it's worth re-checking the throughput now as this path is a bit
> > > > > > different, but unfortunately I didn't include this in the "perf" tests :(
> > > > > > because at the time I introduced those I wasn't sure it even made sense to
> > > > > > have traffic from the same host being directed to the tap device.
> > > > > > 
> > > > > > The iperf3 runs were I observed this are actually the ones from the Podman
> > > > > > demo. Ideally that case should be also checked in the perf/pasta_udp tests.      
> > > > > 
> > > > > Hm, ok.
> > > > >     
> > > > > > How fundamental is this for the rest of the series? I couldn't find any
> > > > > > actual dependency on this but I might be missing something.      
> > > > > 
> > > > > So the issue is that prior to this change in pasta we receive multiple
> > > > > frames at once on the splice path, but one frame at a time on the tap
> > > > > path.  By the end of this series we can't do that any more, because we
> > > > > don't know before the recvmmsg() which one we'll be doing.    
> > > > 
> > > > Oh, right, I see. Then let me add this path to the perf/pasta_udp test
> > > > and check how relevant this is now, I'll get back to you in a bit.  
> > > 
> > > I was checking the wrong path. With this:
> > > 
> > > diff --git a/test/perf/pasta_udp b/test/perf/pasta_udp
> > > index 27ea724..973c2f4 100644
> > > --- a/test/perf/pasta_udp
> > > +++ b/test/perf/pasta_udp
> > > @@ -31,6 +31,14 @@ report	pasta lo_udp 1 __FREQ__
> > >  
> > >  th	MTU 1500B 4000B 16384B 65535B
> > >  
> > > +tr	UDP throughput over IPv6: host to ns
> > > +nsout	IFNAME ip -j link show | jq -rM '.[] | select(.link_type == "ether").ifname'
> > > +nsout	ADDR6 ip -j -6 addr show|jq -rM '.[] | select(.ifname == "__IFNAME__").addr_info[] | select(.scope == "global" and .prefixlen == 64).local'
> > > +bw	-
> > > +bw	-
> > > +bw	-
> > > +iperf3	BW host ns __ADDR6__ 100${i}2 __THREADS__ __TIME__ __OPTS__ -b 15G
> > > +bw	__BW__ 7.0 9.0
> > >  
> > >  tr	UDP throughput over IPv6: ns to host
> > >  ns	ip link set dev lo mtu 1500
> > > diff --git a/test/run b/test/run
> > > index e07513f..b53182b 100755
> > > --- a/test/run
> > > +++ b/test/run
> > > @@ -67,6 +67,14 @@ run() {
> > >  	test build/clang_tidy
> > >  	teardown build
> > >  
> > > +	VALGRIND=0
> > > +	setup passt_in_ns
> > > +	test passt/ndp
> > > +	test passt/dhcp
> > > +	test perf/pasta_udp
> > > +	test passt_in_ns/shutdown
> > > +	teardown passt_in_ns
> > > +
> > >  	setup pasta
> > >  	test pasta/ndp
> > >  	test pasta/dhcp  
> > 
> > Ah, ok.  Can we add that to the standard set of tests ASAP, please.
> > 
> > > I get 21.6 gbps after this series, and 29.7 gbps before -- it's quite
> > > significant.  
> > 
> > Drat.
> > 
> > > And there's nothing strange in perf's output, really, the distribution
> > > of overhead per functions is pretty much the same, but writing multiple
> > > messages to the tap device just takes more cycles per message compared
> > > to a single message.  
> > 
> > That's so weird.  It should be basically an identical set of write()s,
> > except that they happen in a batch, rather than a bit spread out.  I
> > guess it has to be some kind of cache locality thing.  I wonder if the
> > difference would go away or reverse if we had a way to submit multiple
> > frames with a single syscall.
> > 
> > > I'm a bit ashamed to propose this, but do you think about something
> > > like:  
> > 
> > > 	if (c->mode == MODE_PASTA) { if (recvmmsg(ref.r.s, mmh_recv,
> > > 		1, 0, NULL) <= 0) return;  
> > 
> > > 		if (udp_mmh_splice_port(v6, mmh_recv)) { n =
> > > 			recvmmsg(ref.r.s, mmh_recv + 1, UDP_MAX_FRAMES
> > > 			- 1, 0, NULL); }  
> > 
> > > 		if (n > 0) n++; else n = 1; } else { n =
> > > 			recvmmsg(ref.r.s, mmh_recv, UDP_MAX_FRAMES, 0,
> > > 			NULL); if (n <= 0) return; }  
> > 
> > > ? Other than the inherent ugliness, it looks like a good
> > > approximation to me.  
> > 
> > Hmm.  Well, the first question is how much impact does going 1 message
> > at a time have on the spliced throughput.  If it's not too bad, then
> > we could just always go one at a time for pasta, regardless of
> > splicing.  And we could even abstract that difference into the tap
> > backend with a callback like tap_batch_size(c).
> 
> So, finally I had the chance to try this out.
> 
> First off, baseline with the patch adding the new tests I just sent,
> and the series you posted:
> 
> === perf/pasta_udp
> > pasta: throughput and latency (local traffic)
> Throughput in Gbps, latency in µs, one thread at 3.6 GHz, 4 streams
>                                                          MTU: |  1500B |  4000B | 16384B | 65535B |
>                                                               |--------|--------|--------|--------|
>              UDP throughput over IPv6: ns to host             |    4.4 |    8.5 |   19.5 |   23.0 |
>              UDP RR latency over IPv6: ns to host             |      - |      - |      - |     27 |
>                                                               |--------|--------|--------|--------|
>              UDP throughput over IPv4: ns to host             |    4.3 |    8.8 |   18.5 |   24.4 |
>              UDP RR latency over IPv4: ns to host             |      - |      - |      - |     26 |
>                                                               |--------|--------|--------|--------|
>              UDP throughput over IPv6: host to ns             |      - |      - |      - |   22.5 |
>              UDP RR latency over IPv6: host to ns             |      - |      - |      - |     30 |
>                                                               |--------|--------|--------|--------|
>              UDP throughput over IPv4: host to ns             |      - |      - |      - |   24.5 |
>              UDP RR latency over IPv4: host to ns             |      - |      - |      - |     25 |
>                                                               '--------'--------'--------'--------'
> ...passed.
> 
> > pasta: throughput and latency (traffic via tap)
> Throughput in Gbps, latency in µs, one thread at 3.6 GHz, 4 streams
>                                                          MTU: |  1500B |  4000B | 16384B | 65520B |
>                                                               |--------|--------|--------|--------|
>              UDP throughput over IPv6: ns to host             |    4.4 |   10.4 |   16.0 |   23.4 |
>              UDP RR latency over IPv6: ns to host             |      - |      - |      - |     27 |
>                                                               |--------|--------|--------|--------|
>              UDP throughput over IPv4: ns to host             |    5.2 |   10.8 |   16.0 |   24.0 |
>              UDP RR latency over IPv4: ns to host             |      - |      - |      - |     28 |
>                                                               |--------|--------|--------|--------|
>              UDP throughput over IPv6: host to ns             |      - |      - |      - |   21.5 |
>              UDP RR latency over IPv6: host to ns             |      - |      - |      - |     29 |
>                                                               |--------|--------|--------|--------|
>              UDP throughput over IPv4: host to ns             |      - |      - |      - |   26.3 |
>              UDP RR latency over IPv4: host to ns             |      - |      - |      - |     26 |
>                                                               '--------'--------'--------'--------'
> 
> which seems to indicate the whole "splicing" thing is pretty much
> useless, for UDP (except for that 16 KiB MTU case, but I wonder how
> relevant that is).
> 
> If I set UDP_MAX_FRAMES to 1, with a quick workaround for the resulting
> warning in udp_tap_send() (single frame to send, hence single message),
> it gets somewhat weird:
> 
> === perf/pasta_udp
> > pasta: throughput and latency (local traffic)
> Throughput in Gbps, latency in µs, one thread at 3.6 GHz, 4 streams
>                                                          MTU: |  1500B |  4000B | 16384B | 65535B |
>                                                               |--------|--------|--------|--------|
>              UDP throughput over IPv6: ns to host             |    3.4 |    7.0 |   21.6 |   31.6 |
>              UDP RR latency over IPv6: ns to host             |      - |      - |      - |     30 |
>                                                               |--------|--------|--------|--------|
>              UDP throughput over IPv4: ns to host             |    3.8 |    7.0 |   22.0 |   32.4 |
>              UDP RR latency over IPv4: ns to host             |      - |      - |      - |     26 |
>                                                               |--------|--------|--------|--------|
>              UDP throughput over IPv6: host to ns             |      - |      - |      - |   29.3 |
>              UDP RR latency over IPv6: host to ns             |      - |      - |      - |     31 |
>                                                               |--------|--------|--------|--------|
>              UDP throughput over IPv4: host to ns             |      - |      - |      - |   33.8 |
>              UDP RR latency over IPv4: host to ns             |      - |      - |      - |     25 |
>                                                               '--------'--------'--------'--------'
> ...passed.
> 
> > pasta: throughput and latency (traffic via tap)
> Throughput in Gbps, latency in µs, one thread at 3.6 GHz, 4 streams
>                                                          MTU: |  1500B |  4000B | 16384B | 65520B |
>                                                               |--------|--------|--------|--------|
>              UDP throughput over IPv6: ns to host             |    4.7 |   10.3 |   16.0 |   24.0 |
>              UDP RR latency over IPv6: ns to host             |      - |      - |      - |     27 |
>                                                               |--------|--------|--------|--------|
>              UDP throughput over IPv4: ns to host             |    5.6 |   11.4 |   16.0 |   24.0 |
>              UDP RR latency over IPv4: ns to host             |      - |      - |      - |     26 |
>                                                               |--------|--------|--------|--------|
>              UDP throughput over IPv6: host to ns             |      - |      - |      - |   21.5 |
>              UDP RR latency over IPv6: host to ns             |      - |      - |      - |     29 |
>                                                               |--------|--------|--------|--------|
>              UDP throughput over IPv4: host to ns             |      - |      - |      - |   28.7 |
>              UDP RR latency over IPv4: host to ns             |      - |      - |      - |     29 |
>                                                               '--------'--------'--------'--------'
> 
> ...except for the cases with low MTUs, throughput is significantly
> higher if we read and send one message at a time on the "spliced" path.
> 
> Next, I would like to:
> 
> - bisect between 32 and 1 for UDP_MAX_FRAMES: maybe 32 affects data
>   locality too much, but some lower value would still be beneficial by
>   lowering syscall overhead

Ok.

> - try with sendmsg() instead of sendmmsg(), at this point. Looking at
>   the kernel, that doesn't seem to make a real difference.

Which sendmmsg() specifically are you looking at changing?

> About this series: should we just go ahead and apply it with
> UDP_MAX_FRAMES set to 1 for the moment being? It's anyway better than
> the existing situation.

I think that's a good idea - or rather, not setting UDP_MAX_FRAMES to
1, but clamping the batch size to 1 for pasta - I'm pretty sure we
still want the batching for passt.  We lose a little bit on
small-packet spliced, but we gain on both tap and large-packet
spliced.  This will unblock the dual stack udp stuff and we can
further tune it later.

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 28+ messages in thread

end of thread, other threads:[~2023-01-04  4:58 UTC | newest]

Thread overview: 28+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-12-05  8:14 [PATCH 0/8] Don't use additional sockets for receiving "spliced" UDP communications David Gibson
2022-12-05  8:14 ` [PATCH 1/8] udp: Move sending pasta tap frames to the end of udp_sock_handler() David Gibson
2022-12-05  8:14 ` [PATCH 2/8] udp: Split sending to passt tap interface into separate function David Gibson
2022-12-05  8:14 ` [PATCH 3/8] udp: Split receive from preparation and send in udp_sock_handler() David Gibson
2022-12-05  8:14 ` [PATCH 4/8] udp: Receive multiple datagrams at once on the pasta sock->tap path David Gibson
2022-12-13 22:48   ` Stefano Brivio
2022-12-14  1:42     ` David Gibson
2022-12-14 10:35       ` Stefano Brivio
2022-12-20 10:42         ` Stefano Brivio
2022-12-21  6:00           ` David Gibson
2022-12-22  2:37             ` Stefano Brivio
2023-01-04  0:08             ` Stefano Brivio
2023-01-04  4:53               ` David Gibson
2022-12-05  8:14 ` [PATCH 5/8] udp: Pre-populate msg_names with local address David Gibson
2022-12-13 22:48   ` Stefano Brivio
2022-12-14  1:22     ` David Gibson
2022-12-05  8:14 ` [PATCH 6/8] udp: Unify udp_sock_handler_splice() with udp_sock_handler() David Gibson
2022-12-13 22:48   ` Stefano Brivio
2022-12-14  1:19     ` David Gibson
2022-12-14 10:35       ` Stefano Brivio
2022-12-05  8:14 ` [PATCH 7/8] udp: Decide whether to "splice" per datagram rather than per socket David Gibson
2022-12-13 22:49   ` Stefano Brivio
2022-12-14  1:47     ` David Gibson
2022-12-14 10:35       ` Stefano Brivio
2022-12-15  0:33         ` David Gibson
2022-12-05  8:14 ` [PATCH 8/8] udp: Don't use separate sockets to listen for spliced packets David Gibson
2022-12-06  6:45 ` [PATCH 0/8] Don't use additional sockets for receiving "spliced" UDP communications Stefano Brivio
2022-12-06  6:46   ` Stefano Brivio

Code repositories for project(s) associated with this public inbox

	https://passt.top/passt

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for IMAP folder(s).