public inbox for passt-dev@passt.top
 help / color / mirror / code / Atom feed
From: Stefano Brivio <sbrivio@redhat.com>
To: David Gibson <david@gibson.dropbear.id.au>,
	Matej Hrica <mhrica@redhat.com>
Cc: passt-dev@passt.top
Subject: [PATCH RFT 4/5] tcp, tap: Don't increase tap-side sequence counter for dropped frames
Date: Sat, 23 Sep 2023 00:06:09 +0200	[thread overview]
Message-ID: <20230922220610.58767-5-sbrivio@redhat.com> (raw)
In-Reply-To: <20230922220610.58767-1-sbrivio@redhat.com>

...so that we'll retry sending them, instead of more-or-less silently
dropping them. This happens quite frequently if our sending buffer on
the UNIX domain socket is heavily constrained (for instance, by the
208 KiB default memory limit).

It might be argued that dropping frames is part of the expected TCP
flow: we don't dequeue those from the socket anyway, so we'll
eventually retransmit them.

But we don't need the receiver to tell us (by the way of duplicate or
missing ACKs) that we couldn't send them: we already know as
sendmsg() reports that. This seems to considerably increase
throughput stability and throughput itself for TCP connections with
default wmem_max values.

Unfortunately, the 16 bits left as padding in the frame descriptors
we use internally aren't enough to uniquely identify for which
connection we should update sequence numbers: create a parallel
array of pointers to sequence numbers and L4 lengths, of
TCP_FRAMES_MEM size, and go through it after calling sendmsg().

Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 tap.c | 10 +++++++---
 tap.h |  2 +-
 tcp.c | 43 ++++++++++++++++++++++++++++++++++++-------
 3 files changed, 44 insertions(+), 11 deletions(-)

diff --git a/tap.c b/tap.c
index 93db989..b30ff81 100644
--- a/tap.c
+++ b/tap.c
@@ -413,13 +413,15 @@ static size_t tap_send_frames_passt(const struct ctx *c,
  * @c:		Execution context
  * @iov:	Array of buffers, each containing one frame (with L2 headers)
  * @n:		Number of buffers/frames in @iov
+ *
+ * Return: number of frames actually sent
  */
-void tap_send_frames(struct ctx *c, const struct iovec *iov, size_t n)
+size_t tap_send_frames(struct ctx *c, const struct iovec *iov, size_t n)
 {
 	size_t m;
 
 	if (!n)
-		return;
+		return 0;
 
 	if (c->mode == MODE_PASST)
 		m = tap_send_frames_passt(c, iov, n);
@@ -427,9 +429,11 @@ void tap_send_frames(struct ctx *c, const struct iovec *iov, size_t n)
 		m = tap_send_frames_pasta(c, iov, n);
 
 	if (m < n)
-		debug("tap: dropped %lu frames of %lu due to short send", n - m, n);
+		debug("tap: failed to send %lu frames of %lu", n - m, n);
 
 	pcap_multiple(iov, m, c->mode == MODE_PASST ? sizeof(uint32_t) : 0);
+
+	return m;
 }
 
 /**
diff --git a/tap.h b/tap.h
index 021fb7c..952fafc 100644
--- a/tap.h
+++ b/tap.h
@@ -73,7 +73,7 @@ void tap_icmp6_send(const struct ctx *c,
 		    const struct in6_addr *src, const struct in6_addr *dst,
 		    void *in, size_t len);
 int tap_send(const struct ctx *c, const void *data, size_t len);
-void tap_send_frames(struct ctx *c, const struct iovec *iov, size_t n);
+size_t tap_send_frames(struct ctx *c, const struct iovec *iov, size_t n);
 void tap_update_mac(struct tap_hdr *taph,
 		    const unsigned char *eth_d, const unsigned char *eth_s);
 void tap_listen_handler(struct ctx *c, uint32_t events);
diff --git a/tcp.c b/tcp.c
index 4606f17..76b7b8d 100644
--- a/tcp.c
+++ b/tcp.c
@@ -434,6 +434,16 @@ static int tcp_sock_ns		[NUM_PORTS][IP_VERSIONS];
  */
 static union inany_addr low_rtt_dst[LOW_RTT_TABLE_SIZE];
 
+/**
+ * tcp_buf_seq_update - Sequences to update with length of frames once sent
+ * @seq:	Pointer to sequence number sent to tap-side, to be updated
+ * @len:	TCP payload length
+ */
+struct tcp_buf_seq_update {
+	uint32_t *seq;
+	uint16_t len;
+};
+
 /* Static buffers */
 
 /**
@@ -462,6 +472,8 @@ static struct tcp4_l2_buf_t {
 #endif
 tcp4_l2_buf[TCP_FRAMES_MEM];
 
+static struct tcp_buf_seq_update tcp4_l2_buf_seq_update[TCP_FRAMES_MEM];
+
 static unsigned int tcp4_l2_buf_used;
 
 /**
@@ -490,6 +502,8 @@ struct tcp6_l2_buf_t {
 #endif
 tcp6_l2_buf[TCP_FRAMES_MEM];
 
+static struct tcp_buf_seq_update tcp6_l2_buf_seq_update[TCP_FRAMES_MEM];
+
 static unsigned int tcp6_l2_buf_used;
 
 /* recvmsg()/sendmsg() data for tap */
@@ -1369,10 +1383,17 @@ static void tcp_l2_flags_buf_flush(struct ctx *c)
  */
 static void tcp_l2_data_buf_flush(struct ctx *c)
 {
-	tap_send_frames(c, tcp6_l2_iov, tcp6_l2_buf_used);
+	unsigned i;
+	size_t m;
+
+	m = tap_send_frames(c, tcp6_l2_iov, tcp6_l2_buf_used);
+	for (i = 0; i < m; i++)
+		*tcp6_l2_buf_seq_update[i].seq += tcp6_l2_buf_seq_update[i].len;
 	tcp6_l2_buf_used = 0;
 
-	tap_send_frames(c, tcp4_l2_iov, tcp4_l2_buf_used);
+	m = tap_send_frames(c, tcp4_l2_iov, tcp4_l2_buf_used);
+	for (i = 0; i < m; i++)
+		*tcp4_l2_buf_seq_update[i].seq += tcp4_l2_buf_seq_update[i].len;
 	tcp4_l2_buf_used = 0;
 }
 
@@ -2149,10 +2170,11 @@ static int tcp_sock_consume(struct tcp_tap_conn *conn, uint32_t ack_seq)
  * @plen:	Payload length at L4
  * @no_csum:	Don't compute IPv4 checksum, use the one from previous buffer
  * @seq:	Sequence number to be sent
- * @now:	Current timestamp
+ * @seq_update:	Pointer to sequence number to update on successful send
  */
 static void tcp_data_to_tap(struct ctx *c, struct tcp_tap_conn *conn,
-			    ssize_t plen, int no_csum, uint32_t seq)
+			    ssize_t plen, int no_csum, uint32_t seq,
+			    uint32_t *seq_update)
 {
 	struct iovec *iov;
 
@@ -2160,6 +2182,9 @@ static void tcp_data_to_tap(struct ctx *c, struct tcp_tap_conn *conn,
 		struct tcp4_l2_buf_t *b = &tcp4_l2_buf[tcp4_l2_buf_used];
 		uint16_t *check = no_csum ? &(b - 1)->iph.check : NULL;
 
+		tcp4_l2_buf_seq_update[tcp4_l2_buf_used].seq = seq_update;
+		tcp4_l2_buf_seq_update[tcp4_l2_buf_used].len = plen;
+
 		iov = tcp4_l2_iov + tcp4_l2_buf_used++;
 		iov->iov_len = tcp_l2_buf_fill_headers(c, conn, b, plen,
 						       check, seq);
@@ -2168,6 +2193,9 @@ static void tcp_data_to_tap(struct ctx *c, struct tcp_tap_conn *conn,
 	} else if (CONN_V6(conn)) {
 		struct tcp6_l2_buf_t *b = &tcp6_l2_buf[tcp6_l2_buf_used];
 
+		tcp6_l2_buf_seq_update[tcp6_l2_buf_used].seq = seq_update;
+		tcp6_l2_buf_seq_update[tcp6_l2_buf_used].len = plen;
+
 		iov = tcp6_l2_iov + tcp6_l2_buf_used++;
 		iov->iov_len = tcp_l2_buf_fill_headers(c, conn, b, plen,
 						       NULL, seq);
@@ -2193,7 +2221,7 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
 	int s = conn->sock, i, ret = 0;
 	struct msghdr mh_sock = { 0 };
 	uint16_t mss = MSS_GET(conn);
-	uint32_t already_sent;
+	uint32_t already_sent, seq;
 	struct iovec *iov;
 
 	already_sent = conn->seq_to_tap - conn->seq_ack_from_tap;
@@ -2282,14 +2310,15 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
 
 	/* Finally, queue to tap */
 	plen = mss;
+	seq = conn->seq_to_tap;
 	for (i = 0; i < send_bufs; i++) {
 		int no_csum = i && i != send_bufs - 1 && tcp4_l2_buf_used;
 
 		if (i == send_bufs - 1)
 			plen = last_len;
 
-		tcp_data_to_tap(c, conn, plen, no_csum, conn->seq_to_tap);
-		conn->seq_to_tap += plen;
+		tcp_data_to_tap(c, conn, plen, no_csum, seq, &conn->seq_to_tap);
+		seq += plen;
 	}
 
 	conn_flag(c, conn, ACK_FROM_TAP_DUE);
-- 
@@ -434,6 +434,16 @@ static int tcp_sock_ns		[NUM_PORTS][IP_VERSIONS];
  */
 static union inany_addr low_rtt_dst[LOW_RTT_TABLE_SIZE];
 
+/**
+ * tcp_buf_seq_update - Sequences to update with length of frames once sent
+ * @seq:	Pointer to sequence number sent to tap-side, to be updated
+ * @len:	TCP payload length
+ */
+struct tcp_buf_seq_update {
+	uint32_t *seq;
+	uint16_t len;
+};
+
 /* Static buffers */
 
 /**
@@ -462,6 +472,8 @@ static struct tcp4_l2_buf_t {
 #endif
 tcp4_l2_buf[TCP_FRAMES_MEM];
 
+static struct tcp_buf_seq_update tcp4_l2_buf_seq_update[TCP_FRAMES_MEM];
+
 static unsigned int tcp4_l2_buf_used;
 
 /**
@@ -490,6 +502,8 @@ struct tcp6_l2_buf_t {
 #endif
 tcp6_l2_buf[TCP_FRAMES_MEM];
 
+static struct tcp_buf_seq_update tcp6_l2_buf_seq_update[TCP_FRAMES_MEM];
+
 static unsigned int tcp6_l2_buf_used;
 
 /* recvmsg()/sendmsg() data for tap */
@@ -1369,10 +1383,17 @@ static void tcp_l2_flags_buf_flush(struct ctx *c)
  */
 static void tcp_l2_data_buf_flush(struct ctx *c)
 {
-	tap_send_frames(c, tcp6_l2_iov, tcp6_l2_buf_used);
+	unsigned i;
+	size_t m;
+
+	m = tap_send_frames(c, tcp6_l2_iov, tcp6_l2_buf_used);
+	for (i = 0; i < m; i++)
+		*tcp6_l2_buf_seq_update[i].seq += tcp6_l2_buf_seq_update[i].len;
 	tcp6_l2_buf_used = 0;
 
-	tap_send_frames(c, tcp4_l2_iov, tcp4_l2_buf_used);
+	m = tap_send_frames(c, tcp4_l2_iov, tcp4_l2_buf_used);
+	for (i = 0; i < m; i++)
+		*tcp4_l2_buf_seq_update[i].seq += tcp4_l2_buf_seq_update[i].len;
 	tcp4_l2_buf_used = 0;
 }
 
@@ -2149,10 +2170,11 @@ static int tcp_sock_consume(struct tcp_tap_conn *conn, uint32_t ack_seq)
  * @plen:	Payload length at L4
  * @no_csum:	Don't compute IPv4 checksum, use the one from previous buffer
  * @seq:	Sequence number to be sent
- * @now:	Current timestamp
+ * @seq_update:	Pointer to sequence number to update on successful send
  */
 static void tcp_data_to_tap(struct ctx *c, struct tcp_tap_conn *conn,
-			    ssize_t plen, int no_csum, uint32_t seq)
+			    ssize_t plen, int no_csum, uint32_t seq,
+			    uint32_t *seq_update)
 {
 	struct iovec *iov;
 
@@ -2160,6 +2182,9 @@ static void tcp_data_to_tap(struct ctx *c, struct tcp_tap_conn *conn,
 		struct tcp4_l2_buf_t *b = &tcp4_l2_buf[tcp4_l2_buf_used];
 		uint16_t *check = no_csum ? &(b - 1)->iph.check : NULL;
 
+		tcp4_l2_buf_seq_update[tcp4_l2_buf_used].seq = seq_update;
+		tcp4_l2_buf_seq_update[tcp4_l2_buf_used].len = plen;
+
 		iov = tcp4_l2_iov + tcp4_l2_buf_used++;
 		iov->iov_len = tcp_l2_buf_fill_headers(c, conn, b, plen,
 						       check, seq);
@@ -2168,6 +2193,9 @@ static void tcp_data_to_tap(struct ctx *c, struct tcp_tap_conn *conn,
 	} else if (CONN_V6(conn)) {
 		struct tcp6_l2_buf_t *b = &tcp6_l2_buf[tcp6_l2_buf_used];
 
+		tcp6_l2_buf_seq_update[tcp6_l2_buf_used].seq = seq_update;
+		tcp6_l2_buf_seq_update[tcp6_l2_buf_used].len = plen;
+
 		iov = tcp6_l2_iov + tcp6_l2_buf_used++;
 		iov->iov_len = tcp_l2_buf_fill_headers(c, conn, b, plen,
 						       NULL, seq);
@@ -2193,7 +2221,7 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
 	int s = conn->sock, i, ret = 0;
 	struct msghdr mh_sock = { 0 };
 	uint16_t mss = MSS_GET(conn);
-	uint32_t already_sent;
+	uint32_t already_sent, seq;
 	struct iovec *iov;
 
 	already_sent = conn->seq_to_tap - conn->seq_ack_from_tap;
@@ -2282,14 +2310,15 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
 
 	/* Finally, queue to tap */
 	plen = mss;
+	seq = conn->seq_to_tap;
 	for (i = 0; i < send_bufs; i++) {
 		int no_csum = i && i != send_bufs - 1 && tcp4_l2_buf_used;
 
 		if (i == send_bufs - 1)
 			plen = last_len;
 
-		tcp_data_to_tap(c, conn, plen, no_csum, conn->seq_to_tap);
-		conn->seq_to_tap += plen;
+		tcp_data_to_tap(c, conn, plen, no_csum, seq, &conn->seq_to_tap);
+		seq += plen;
 	}
 
 	conn_flag(c, conn, ACK_FROM_TAP_DUE);
-- 
2.39.2


  parent reply	other threads:[~2023-09-22 22:06 UTC|newest]

Thread overview: 32+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-09-22 22:06 [PATCH RFT 0/5] Fixes and a workaround for TCP stalls with small buffers Stefano Brivio
2023-09-22 22:06 ` [PATCH RFT 1/5] tcp: Fix comment to tcp_sock_consume() Stefano Brivio
2023-09-23  2:48   ` David Gibson
2023-09-22 22:06 ` [PATCH RFT 2/5] tcp: Reset STALLED flag on ACK only, check for pending socket data Stefano Brivio
2023-09-25  3:07   ` David Gibson
2023-09-27 17:05     ` Stefano Brivio
2023-09-28  1:48       ` David Gibson
2023-09-29 15:20         ` Stefano Brivio
2023-10-03  3:20           ` David Gibson
2023-10-05  6:18             ` Stefano Brivio
2023-10-05  7:36               ` David Gibson
2023-09-22 22:06 ` [PATCH RFT 3/5] tcp: Force TCP_WINDOW_CLAMP before resetting STALLED flag Stefano Brivio
2023-09-22 22:31   ` Stefano Brivio
2023-09-23  7:55   ` David Gibson
2023-09-25  4:09   ` David Gibson
2023-09-25  4:10     ` David Gibson
2023-09-25  4:21     ` David Gibson
2023-09-27 17:05       ` Stefano Brivio
2023-09-28  1:51         ` David Gibson
2023-09-22 22:06 ` Stefano Brivio [this message]
2023-09-25  4:47   ` [PATCH RFT 4/5] tcp, tap: Don't increase tap-side sequence counter for dropped frames David Gibson
2023-09-27 17:06     ` Stefano Brivio
2023-09-28  1:58       ` David Gibson
2023-09-29 15:19         ` Stefano Brivio
2023-10-03  3:22           ` David Gibson
2023-10-05  6:19             ` Stefano Brivio
2023-10-05  7:38               ` David Gibson
2023-09-22 22:06 ` [PATCH RFT 5/5] passt.1: Add note about tuning rmem_max and wmem_max for throughput Stefano Brivio
2023-09-25  4:57   ` David Gibson
2023-09-27 17:06     ` Stefano Brivio
2023-09-28  2:02       ` David Gibson
2023-09-25  5:52 ` [PATCH RFT 0/5] Fixes and a workaround for TCP stalls with small buffers David Gibson

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20230922220610.58767-5-sbrivio@redhat.com \
    --to=sbrivio@redhat.com \
    --cc=david@gibson.dropbear.id.au \
    --cc=mhrica@redhat.com \
    --cc=passt-dev@passt.top \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this public inbox

	https://passt.top/passt

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for IMAP folder(s).