public inbox for passt-dev@passt.top
 help / color / mirror / code / Atom feed
From: David Gibson <david@gibson.dropbear.id.au>
To: Stefano Brivio <sbrivio@redhat.com>, passt-dev@passt.top
Cc: David Gibson <david@gibson.dropbear.id.au>
Subject: [PATCH v3 18/18] udp: Use tap_send_frames()
Date: Fri,  6 Jan 2023 11:43:22 +1100	[thread overview]
Message-ID: <20230106004322.985665-19-david@gibson.dropbear.id.au> (raw)
In-Reply-To: <20230106004322.985665-1-david@gibson.dropbear.id.au>

To send frames on the tap interface, the UDP uses a fairly complicated two
level batching.  First multiple frames are gathered into a single "message"
for the qemu stream socket, then multiple messages are send with
sendmmsg().  We now have tap_send_frames() which already deals with sending
a number of frames, including batching and handling partial sends.  Use
that to considerably simplify things.

This does make a couple of behavioural changes:
  * We used to split messages to keep them under 32kiB (except when a
    single frame was longer than that).  The comments claim this is
    needed to stop qemu from closing the connection, but we don't have any
    equivalent logic for TCP.  I wasn't able to reproduce the problem with
    this series, although it was apparently easy to reproduce earlier.

    My suspicion is that there was never an inherent need to keep messages
    small, however with larger messages (and default kernel buffer sizes)
    the chances of needing more than one resend for partial send()s is
    greatly increased.  We used not to correctly handle that case of
    multiple resends, but now we do.

  * Previously when we got a partial send on UDP, we would resend the
    remainder of the entire "message", including multiple frames.  The
    common code now only resends the remainder of a single frame, simply
    dropping any frames which weren't even partially sent.  This is what
    TCP always did and is probably a better idea for UDP too.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
 pcap.c |  30 ------------
 pcap.h |   1 -
 udp.c  | 141 +++------------------------------------------------------
 udp.h  |   2 +-
 4 files changed, 8 insertions(+), 166 deletions(-)

diff --git a/pcap.c b/pcap.c
index 7dca599..c7c6ffd 100644
--- a/pcap.c
+++ b/pcap.c
@@ -130,36 +130,6 @@ void pcap_multiple(const struct iovec *iov, unsigned int n, size_t offset)
 	}
 }
 
-/**
- * pcapm() - Capture multiple frames from multiple message headers to pcap file
- * @mmh:	Pointer to first sendmmsg() header
- */
-void pcapmm(const struct mmsghdr *mmh, unsigned int vlen)
-{
-	struct timeval tv;
-	unsigned int i, j;
-
-	if (pcap_fd == -1)
-		return;
-
-	gettimeofday(&tv, NULL);
-
-	for (i = 0; i < vlen; i++) {
-		const struct msghdr *mh = &mmh[i].msg_hdr;
-
-		for (j = 0; j < mh->msg_iovlen; j++) {
-			const struct iovec *iov = &mh->msg_iov[j];
-
-			if (pcap_frame((char *)iov->iov_base + 4,
-				       iov->iov_len - 4, &tv) != 0) {
-				debug("Cannot log packet, length %lu",
-				      iov->iov_len - 4);
-				return;
-			}
-		}
-	}
-}
-
 /**
  * pcap_init() - Initialise pcap file
  * @c:		Execution context
diff --git a/pcap.h b/pcap.h
index eafc89b..c2af3cf 100644
--- a/pcap.h
+++ b/pcap.h
@@ -8,7 +8,6 @@
 
 void pcap(const char *pkt, size_t len);
 void pcap_multiple(const struct iovec *iov, unsigned int n, size_t offset);
-void pcapmm(const struct mmsghdr *mmh, unsigned int vlen);
 void pcap_init(struct ctx *c);
 
 #endif /* PCAP_H */
diff --git a/udp.c b/udp.c
index 934d32e..adb47e8 100644
--- a/udp.c
+++ b/udp.c
@@ -224,9 +224,6 @@ static struct iovec	udp6_l2_iov_tap		[UDP_MAX_FRAMES];
 static struct mmsghdr	udp4_l2_mh_sock		[UDP_MAX_FRAMES];
 static struct mmsghdr	udp6_l2_mh_sock		[UDP_MAX_FRAMES];
 
-static struct mmsghdr	udp4_l2_mh_tap		[UDP_MAX_FRAMES];
-static struct mmsghdr	udp6_l2_mh_tap		[UDP_MAX_FRAMES];
-
 /* recvmmsg()/sendmmsg() data for "spliced" connections */
 static struct iovec	udp4_iov_splice		[UDP_MAX_FRAMES];
 static struct iovec	udp6_iov_splice		[UDP_MAX_FRAMES];
@@ -336,12 +333,9 @@ static void udp_sock4_iov_init(const struct ctx *c)
 	}
 
 	for (i = 0; i < UDP_MAX_FRAMES; i++) {
-		struct msghdr *mh = &udp4_l2_mh_tap[i].msg_hdr;
 		struct iovec *iov = &udp4_l2_iov_tap[i];
 
-		iov->iov_base	= tap_iov_base(c, &udp4_l2_buf[i].taph);
-		mh->msg_iov	= iov;
-		mh->msg_iovlen	= 1;
+		iov->iov_base = tap_iov_base(c, &udp4_l2_buf[i].taph);
 	}
 }
 
@@ -374,12 +368,9 @@ static void udp_sock6_iov_init(const struct ctx *c)
 	}
 
 	for (i = 0; i < UDP_MAX_FRAMES; i++) {
-		struct msghdr *mh = &udp6_l2_mh_tap[i].msg_hdr;
 		struct iovec *iov = &udp6_l2_iov_tap[i];
 
-		iov->iov_base	= tap_iov_base(c, &udp6_l2_buf[i].taph);
-		mh->msg_iov	= iov;
-		mh->msg_iovlen	= 1;
+		iov->iov_base = tap_iov_base(c, &udp6_l2_buf[i].taph);
 	}
 }
 
@@ -703,102 +694,6 @@ static size_t udp_update_hdr6(const struct ctx *c, int n, in_port_t dstport,
 	return tap_iov_len(c, &b->taph, ip_len);
 }
 
-/**
- * udp_tap_send_pasta() - Send datagrams to the pasta tap interface
- * @c:		Execution context
- * @mmh:	Array of message headers to send
- * @n:		Number of message headers to send
- *
- * #syscalls:pasta write
- */
-static void udp_tap_send_pasta(const struct ctx *c, struct mmsghdr *mmh,
-			       unsigned int n)
-{
-	unsigned int i, j;
-
-	for (i = 0; i < n; i++) {
-		for (j = 0; j < mmh[i].msg_hdr.msg_iovlen; j++) {
-			struct iovec *iov = &mmh[i].msg_hdr.msg_iov[j];
-
-			/* We can't use writev() because the tap
-			 * character device relies on the write()
-			 * boundaries to discern frame boundaries
-			 */
-			if (write(c->fd_tap, iov->iov_base, iov->iov_len) < 0)
-				debug("tap write: %s", strerror(errno));
-			else
-				pcap(iov->iov_base, iov->iov_len);
-		}
-	}
-}
-
-/**
- * udp_tap_send_passt() - Send datagrams to the passt tap interface
- * @c:		Execution context
- * @mmh:	Array of message headers to send
- * @n:		Number of message headers to send
- *
- * #syscalls:passt sendmmsg sendmsg
- */
-static void udp_tap_send_passt(const struct ctx *c, struct mmsghdr *mmh, int n)
-{
-	struct msghdr *last_mh;
-	ssize_t missing = 0;
-	size_t msg_len = 0;
-	unsigned int i;
-	int ret;
-
-	ret = sendmmsg(c->fd_tap, mmh, n, MSG_NOSIGNAL | MSG_DONTWAIT);
-	if (ret <= 0)
-		return;
-
-	/* If we lose some messages to sendmmsg() here, fine, it's UDP. However,
-	 * the last message needs to be delivered completely, otherwise qemu
-	 * will fail to reassemble the next message and close the connection. Go
-	 * through headers from the last sent message, counting bytes, and, if
-	 * and as soon as we see more bytes than sendmmsg() sent, re-send the
-	 * rest with a blocking call.
-	 *
-	 * In pictures, given this example:
-	 *
-	 *				 	iov #0  iov #1  iov #2  iov #3
-	 * tap_mmh[ret - 1].msg_hdr:		....    ......  .....   ......
-	 * tap_mmh[ret - 1].msg_len:	7	....    ...
-	 *
-	 * when 'msglen' reaches:	10		      ^
-	 * and 'missing' below is:	3	           ---
-	 *
-	 * re-send everything from here:		   ^--  -----   ------
-	 */
-	last_mh = &mmh[ret - 1].msg_hdr;
-	for (i = 0; i < last_mh->msg_iovlen; i++) {
-		if (missing <= 0) {
-			msg_len += last_mh->msg_iov[i].iov_len;
-			missing = msg_len - mmh[ret - 1].msg_len;
-		}
-
-		if (missing > 0) {
-			uint8_t **iov_base;
-			int first_offset;
-
-			iov_base = (uint8_t **)&last_mh->msg_iov[i].iov_base;
-			first_offset = last_mh->msg_iov[i].iov_len - missing;
-			*iov_base += first_offset;
-			last_mh->msg_iov[i].iov_len = missing;
-
-			last_mh->msg_iov = &last_mh->msg_iov[i];
-
-			if (sendmsg(c->fd_tap, last_mh, MSG_NOSIGNAL) < 0)
-				debug("UDP: %li bytes to tap missing", missing);
-
-			*iov_base -= first_offset;
-			break;
-		}
-	}
-
-	pcapmm(mmh, ret);
-}
-
 /**
  * udp_tap_send() - Prepare UDP datagrams and send to tap interface
  * @c:		Execution context
@@ -810,25 +705,18 @@ static void udp_tap_send_passt(const struct ctx *c, struct mmsghdr *mmh, int n)
  *
  * Return: size of tap frame with headers
  */
-static void udp_tap_send(const struct ctx *c,
+static void udp_tap_send(struct ctx *c,
 			 unsigned int start, unsigned int n,
 			 in_port_t dstport, bool v6, const struct timespec *now)
 {
-	int msg_bufs = 0, msg_i = 0;
-	struct mmsghdr *tap_mmh;
 	struct iovec *tap_iov;
-	ssize_t msg_len = 0;
 	unsigned int i;
 
-	if (v6) {
-		tap_mmh = udp6_l2_mh_tap;
+	if (v6)
 		tap_iov = udp6_l2_iov_tap;
-	} else {
-		tap_mmh = udp4_l2_mh_tap;
+	else
 		tap_iov = udp4_l2_iov_tap;
-	}
 
-	tap_mmh[0].msg_hdr.msg_iov = &tap_iov[start];
 	for (i = start; i < start + n; i++) {
 		size_t buf_len;
 
@@ -838,24 +726,9 @@ static void udp_tap_send(const struct ctx *c,
 			buf_len = udp_update_hdr4(c, i, dstport, now);
 
 		tap_iov[i].iov_len = buf_len;
-
-		/* With bigger messages, qemu closes the connection. */
-		if (c->mode == MODE_PASST && msg_bufs &&
-		    msg_len + buf_len > SHRT_MAX) {
-			tap_mmh[msg_i].msg_hdr.msg_iovlen = msg_bufs;
-			msg_i++;
-			tap_mmh[msg_i].msg_hdr.msg_iov = &tap_iov[i];
-			msg_len = msg_bufs = 0;
-		}
-		msg_len += buf_len;
-		msg_bufs++;
 	}
-	tap_mmh[msg_i].msg_hdr.msg_iovlen = msg_bufs;
 
-	if (c->mode == MODE_PASTA)
-		udp_tap_send_pasta(c, tap_mmh, msg_i + 1);
-	else
-		udp_tap_send_passt(c, tap_mmh, msg_i + 1);
+	tap_send_frames(c, tap_iov + start, n);
 }
 
 /**
@@ -867,7 +740,7 @@ static void udp_tap_send(const struct ctx *c,
  *
  * #syscalls recvmmsg
  */
-void udp_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events,
+void udp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events,
 		      const struct timespec *now)
 {
 	/* For not entirely clear reasons (data locality?) pasta gets
diff --git a/udp.h b/udp.h
index 2a03335..68082ea 100644
--- a/udp.h
+++ b/udp.h
@@ -8,7 +8,7 @@
 
 #define UDP_TIMER_INTERVAL		1000 /* ms */
 
-void udp_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events,
+void udp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events,
 		      const struct timespec *now);
 int udp_tap_handler(struct ctx *c, int af, const void *addr,
 		    const struct pool *p, const struct timespec *now);
-- 
@@ -8,7 +8,7 @@
 
 #define UDP_TIMER_INTERVAL		1000 /* ms */
 
-void udp_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events,
+void udp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events,
 		      const struct timespec *now);
 int udp_tap_handler(struct ctx *c, int af, const void *addr,
 		    const struct pool *p, const struct timespec *now);
-- 
2.39.0


  parent reply	other threads:[~2023-01-06  0:43 UTC|newest]

Thread overview: 24+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-01-06  0:43 [PATCH v3 00/18] RFC: Unify and simplify tap send path David Gibson
2023-01-06  0:43 ` [PATCH v3 01/18] pcap: Introduce pcap_frame() helper David Gibson
2023-01-06  0:43 ` [PATCH v3 02/18] pcap: Replace pcapm() with pcap_multiple() David Gibson
2023-01-06  0:43 ` [PATCH v3 03/18] tcp: Combine two parts of passt tap send path together David Gibson
2023-01-06  0:43 ` [PATCH v3 04/18] tcp: Don't compute total bytes in a message until we need it David Gibson
2023-01-06  0:43 ` [PATCH v3 05/18] tcp: Improve interface to tcp_l2_buf_flush() David Gibson
2023-01-06  0:43 ` [PATCH v3 06/18] tcp: Combine two parts of pasta tap send path together David Gibson
2023-02-13  1:13   ` Stefano Brivio
2023-01-06  0:43 ` [PATCH v3 07/18] tap, tcp: Move tap send path to tap.c David Gibson
2023-01-06  0:43 ` [PATCH v3 08/18] util: Introduce hton*_constant() in place of #ifdefs David Gibson
2023-01-06  0:43 ` [PATCH v3 09/18] tcp, udp: Use named field initializers in iov_init functions David Gibson
2023-01-06  0:43 ` [PATCH v3 10/18] util: Parameterize ethernet header initializer macro David Gibson
2023-01-06  0:43 ` [PATCH v3 11/18] tcp: Remove redundant and incorrect initialization from *_iov_init() David Gibson
2023-01-06  0:43 ` [PATCH v3 12/18] tcp: Consolidate calculation of total frame size David Gibson
2023-01-06  0:43 ` [PATCH v3 13/18] tap: Add "tap headers" abstraction David Gibson
2023-01-06  0:43 ` [PATCH v3 14/18] tcp: Use abstracted tap header David Gibson
2023-01-06  0:43 ` [PATCH v3 15/18] tap: Use different io vector bases depending on tap type David Gibson
2023-01-06  0:43 ` [PATCH v3 16/18] udp: Use abstracted tap header David Gibson
2023-01-06  0:43 ` [PATCH v3 17/18] tap: Improve handling of partial frame sends David Gibson
2023-01-06  0:43 ` David Gibson [this message]
2023-01-24 21:20 ` [PATCH v3 00/18] RFC: Unify and simplify tap send path Stefano Brivio
2023-01-25  3:13   ` David Gibson
2023-01-25 23:21     ` Stefano Brivio
2023-02-13  1:14       ` Stefano Brivio

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20230106004322.985665-19-david@gibson.dropbear.id.au \
    --to=david@gibson.dropbear.id.au \
    --cc=passt-dev@passt.top \
    --cc=sbrivio@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this public inbox

	https://passt.top/passt

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for IMAP folder(s).