* [PATCH] Reduce tcp_buf_discard size
@ 2025-09-08 11:04 xugu
2025-09-09 16:13 ` Stefano Brivio
0 siblings, 1 reply; 2+ messages in thread
From: xugu @ 2025-09-08 11:04 UTC (permalink / raw)
To: passt-dev; +Cc: xugu
From: Xun Gu <xugu@redhat.com>
On kernels without SO_PEEK_OFF, a 16MB static buffer is used to
discard sent data. This patch reduces the buffer to 1MB.
Larger discards are now handled by using multiple iovec entries
pointing to the same 1MB buffer.
Signed-off-by: Xun Gu <xugu@redhat.com>
---
tcp.c | 66 +++++++++++++++++++++++++++++++++++++++++++++++++-
tcp_buf.c | 18 +++++---------
tcp_internal.h | 7 +++++-
tcp_vu.c | 17 ++++---------
4 files changed, 82 insertions(+), 26 deletions(-)
diff --git a/tcp.c b/tcp.c
index a27b069..253cdb3 100644
--- a/tcp.c
+++ b/tcp.c
@@ -399,7 +399,7 @@ static int tcp_sock_ns [NUM_PORTS][IP_VERSIONS];
*/
static union inany_addr low_rtt_dst[LOW_RTT_TABLE_SIZE];
-char tcp_buf_discard [MAX_WINDOW];
+char tcp_buf_discard [BUF_DISCARD_SIZE];
/* Does the kernel support TCP_PEEK_OFF? */
bool peek_offset_cap;
@@ -3766,3 +3766,67 @@ fail:
return 0;
}
+
+/**
+ * tcp_prepare_iov() - Prepare iov according to kernel capability
+ * @msg: Message header to update
+ * @iov: iovec to receive TCP payload and data to discard
+ * @already_sent: Bytes sent after the last acknowledged one
+ * @payload_iov_cnt: Number of TCP payload iovec entries
+ *
+ * Return: 0 on success, -1 if already_sent cannot be discarded fully
+ */
+int tcp_prepare_iov(struct msghdr *msg, struct iovec *iov,
+ uint32_t already_sent, int payload_iov_cnt)
+{
+ /*
+ * IOV layout
+ * |- tcp_buf_discard -|---------- TCP data slots ------------|
+ *
+ * with discarded data:
+ * |------ddddddddddddd|ttttttttttttt-------------------------|
+ * ^
+ * |
+ * msg_iov
+ *
+ * without discarded data:
+ * |-------------------|ttttttttttttt-------------------------|
+ * ^
+ * |
+ * msg_iov
+ * d: discard data
+ * t: TCP data
+ */
+ if (peek_offset_cap) {
+ msg->msg_iov = iov + DISCARD_IOV_NUM;
+ msg->msg_iovlen = payload_iov_cnt;
+ } else {
+ int discard_cnt, discard_iov_rem;
+ struct iovec *iov_start;
+ int i;
+
+ discard_cnt = DIV_ROUND_UP(already_sent, BUF_DISCARD_SIZE);
+ if (discard_cnt > DISCARD_IOV_NUM) {
+ debug("Failed to discard %u already sent bytes",
+ already_sent);
+ return -1;
+ }
+
+ discard_iov_rem = already_sent % BUF_DISCARD_SIZE;
+
+ iov_start = iov + (DISCARD_IOV_NUM - discard_cnt);
+
+ /* Multiple iov entries pointing to the same buffer */
+ for (i = 0; i < discard_cnt; i++) {
+ iov_start[i].iov_base = tcp_buf_discard;
+ iov_start[i].iov_len = BUF_DISCARD_SIZE;
+ }
+ if (discard_iov_rem)
+ iov[DISCARD_IOV_NUM - 1].iov_len = discard_iov_rem;
+
+ msg->msg_iov = iov_start;
+ msg->msg_iovlen = discard_cnt + payload_iov_cnt;
+ }
+
+ return 0;
+}
diff --git a/tcp_buf.c b/tcp_buf.c
index bc898de..4ebb013 100644
--- a/tcp_buf.c
+++ b/tcp_buf.c
@@ -60,7 +60,7 @@ static struct tcp_tap_conn *tcp_frame_conns[TCP_FRAMES_MEM];
static unsigned int tcp_payload_used;
/* recvmsg()/sendmsg() data for tap */
-static struct iovec iov_sock [TCP_FRAMES_MEM + 1];
+static struct iovec iov_sock [TCP_FRAMES_MEM + DISCARD_IOV_NUM];
static struct iovec tcp_l2_iov[TCP_FRAMES_MEM][TCP_NUM_IOVS];
@@ -326,15 +326,9 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
iov_rem = (wnd_scaled - already_sent) % mss;
}
- /* Prepare iov according to kernel capability */
- if (!peek_offset_cap) {
- mh_sock.msg_iov = iov_sock;
- iov_sock[0].iov_base = tcp_buf_discard;
- iov_sock[0].iov_len = already_sent;
- mh_sock.msg_iovlen = fill_bufs + 1;
- } else {
- mh_sock.msg_iov = &iov_sock[1];
- mh_sock.msg_iovlen = fill_bufs;
+ if (tcp_prepare_iov(&mh_sock, iov_sock, already_sent, fill_bufs)) {
+ tcp_rst(c, conn);
+ return -1;
}
if (tcp_payload_used + fill_bufs > TCP_FRAMES_MEM) {
@@ -344,12 +338,12 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
tcp_payload_used = 0;
}
- for (i = 0, iov = iov_sock + 1; i < fill_bufs; i++, iov++) {
+ for (i = 0, iov = iov_sock + DISCARD_IOV_NUM; i < fill_bufs; i++, iov++) {
iov->iov_base = &tcp_payload[tcp_payload_used + i].data;
iov->iov_len = mss;
}
if (iov_rem)
- iov_sock[fill_bufs].iov_len = iov_rem;
+ iov_sock[fill_bufs + DISCARD_IOV_NUM - 1].iov_len = iov_rem;
/* Receive into buffers, don't dequeue until acknowledged by guest. */
do
diff --git a/tcp_internal.h b/tcp_internal.h
index 9dae688..d0009f8 100644
--- a/tcp_internal.h
+++ b/tcp_internal.h
@@ -9,6 +9,9 @@
#define MAX_WS 8
#define MAX_WINDOW (1 << (16 + (MAX_WS)))
+#define BUF_DISCARD_SIZE (1 << 20)
+#define DISCARD_IOV_NUM DIV_ROUND_UP(MAX_WINDOW, BUF_DISCARD_SIZE)
+
#define MSS4 ROUND_DOWN(IP_MAX_MTU - \
sizeof(struct tcphdr) - \
sizeof(struct iphdr), \
@@ -139,7 +142,7 @@ struct tcp_syn_opts {
.ws = TCP_OPT_WS(ws_), \
})
-extern char tcp_buf_discard [MAX_WINDOW];
+extern char tcp_buf_discard [BUF_DISCARD_SIZE];
void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn,
unsigned long flag);
@@ -180,4 +183,6 @@ int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn,
size_t *optlen);
int tcp_set_peek_offset(const struct tcp_tap_conn *conn, int offset);
+int tcp_prepare_iov(struct msghdr *msg, struct iovec *iov,
+ uint32_t already_sent, int payload_iov_cnt);
#endif /* TCP_INTERNAL_H */
diff --git a/tcp_vu.c b/tcp_vu.c
index cb39bc2..097ca13 100644
--- a/tcp_vu.c
+++ b/tcp_vu.c
@@ -35,7 +35,7 @@
#include "vu_common.h"
#include <time.h>
-static struct iovec iov_vu[VIRTQUEUE_MAX_SIZE + 1];
+static struct iovec iov_vu[VIRTQUEUE_MAX_SIZE + DISCARD_IOV_NUM];
static struct vu_virtq_element elem[VIRTQUEUE_MAX_SIZE];
static int head[VIRTQUEUE_MAX_SIZE + 1];
@@ -200,7 +200,7 @@ static ssize_t tcp_vu_sock_recv(const struct ctx *c,
hdrlen = tcp_vu_hdrlen(v6);
- vu_init_elem(elem, &iov_vu[1], VIRTQUEUE_MAX_SIZE);
+ vu_init_elem(elem, &iov_vu[DISCARD_IOV_NUM], VIRTQUEUE_MAX_SIZE);
elem_cnt = 0;
*head_cnt = 0;
@@ -228,16 +228,9 @@ static ssize_t tcp_vu_sock_recv(const struct ctx *c,
elem_cnt += cnt;
}
- if (peek_offset_cap) {
- mh_sock.msg_iov = iov_vu + 1;
- mh_sock.msg_iovlen = elem_cnt;
- } else {
- iov_vu[0].iov_base = tcp_buf_discard;
- iov_vu[0].iov_len = already_sent;
-
- mh_sock.msg_iov = iov_vu;
- mh_sock.msg_iovlen = elem_cnt + 1;
- }
+ if (tcp_prepare_iov(&mh_sock, iov_vu, already_sent, elem_cnt))
+ /* Expect caller to do a TCP reset */
+ return -1;
do
ret = recvmsg(s, &mh_sock, MSG_PEEK);
--
2.51.0
^ permalink raw reply related [flat|nested] 2+ messages in thread
* Re: [PATCH] Reduce tcp_buf_discard size
2025-09-08 11:04 [PATCH] Reduce tcp_buf_discard size xugu
@ 2025-09-09 16:13 ` Stefano Brivio
0 siblings, 0 replies; 2+ messages in thread
From: Stefano Brivio @ 2025-09-09 16:13 UTC (permalink / raw)
To: xugu; +Cc: passt-dev, Jon Maloy, Laurent Vivier
Thanks for the patch, it looks good to me and all tests pass with and
without SO_PEEK_OFF support!
Jon, Laurent, would you mind having a quick look before I apply this?
Gu, there's just one stray trailing whitespace character, indicated below,
but there's no need to send a new version for that, I will just drop it
on merge:
On Mon, 8 Sep 2025 20:04:39 +0900
"xugu@redhat.com" <xugu@redhat.com> wrote:
> From: Xun Gu <xugu@redhat.com>
>
> On kernels without SO_PEEK_OFF, a 16MB static buffer is used to
> discard sent data. This patch reduces the buffer to 1MB.
>
> Larger discards are now handled by using multiple iovec entries
> pointing to the same 1MB buffer.
>
> Signed-off-by: Xun Gu <xugu@redhat.com>
> ---
> tcp.c | 66 +++++++++++++++++++++++++++++++++++++++++++++++++-
> tcp_buf.c | 18 +++++---------
> tcp_internal.h | 7 +++++-
> tcp_vu.c | 17 ++++---------
> 4 files changed, 82 insertions(+), 26 deletions(-)
>
> diff --git a/tcp.c b/tcp.c
> index a27b069..253cdb3 100644
> --- a/tcp.c
> +++ b/tcp.c
> @@ -399,7 +399,7 @@ static int tcp_sock_ns [NUM_PORTS][IP_VERSIONS];
> */
> static union inany_addr low_rtt_dst[LOW_RTT_TABLE_SIZE];
>
> -char tcp_buf_discard [MAX_WINDOW];
> +char tcp_buf_discard [BUF_DISCARD_SIZE];
>
> /* Does the kernel support TCP_PEEK_OFF? */
> bool peek_offset_cap;
> @@ -3766,3 +3766,67 @@ fail:
>
> return 0;
> }
> +
> +/**
> + * tcp_prepare_iov() - Prepare iov according to kernel capability
> + * @msg: Message header to update
> + * @iov: iovec to receive TCP payload and data to discard
> + * @already_sent: Bytes sent after the last acknowledged one
> + * @payload_iov_cnt: Number of TCP payload iovec entries
> + *
> + * Return: 0 on success, -1 if already_sent cannot be discarded fully
> + */
> +int tcp_prepare_iov(struct msghdr *msg, struct iovec *iov,
> + uint32_t already_sent, int payload_iov_cnt)
> +{
> + /*
> + * IOV layout
> + * |- tcp_buf_discard -|---------- TCP data slots ------------|
> + *
> + * with discarded data:
> + * |------ddddddddddddd|ttttttttttttt-------------------------|
> + * ^
> + * |
> + * msg_iov
> + *
> + * without discarded data:
> + * |-------------------|ttttttttttttt-------------------------|
> + * ^
> + * |
> + * msg_iov
> + * d: discard data
> + * t: TCP data
> + */
> + if (peek_offset_cap) {
> + msg->msg_iov = iov + DISCARD_IOV_NUM;
> + msg->msg_iovlen = payload_iov_cnt;
> + } else {
> + int discard_cnt, discard_iov_rem;
> + struct iovec *iov_start;
> + int i;
> +
> + discard_cnt = DIV_ROUND_UP(already_sent, BUF_DISCARD_SIZE);
> + if (discard_cnt > DISCARD_IOV_NUM) {
> + debug("Failed to discard %u already sent bytes",
> + already_sent);
> + return -1;
> + }
> +
> + discard_iov_rem = already_sent % BUF_DISCARD_SIZE;
> +
> + iov_start = iov + (DISCARD_IOV_NUM - discard_cnt);
> +
> + /* Multiple iov entries pointing to the same buffer */
> + for (i = 0; i < discard_cnt; i++) {
> + iov_start[i].iov_base = tcp_buf_discard;
> + iov_start[i].iov_len = BUF_DISCARD_SIZE;
> + }
> + if (discard_iov_rem)
> + iov[DISCARD_IOV_NUM - 1].iov_len = discard_iov_rem;
> +
> + msg->msg_iov = iov_start;
> + msg->msg_iovlen = discard_cnt + payload_iov_cnt;
> + }
> +
> + return 0;
> +}
> diff --git a/tcp_buf.c b/tcp_buf.c
> index bc898de..4ebb013 100644
> --- a/tcp_buf.c
> +++ b/tcp_buf.c
> @@ -60,7 +60,7 @@ static struct tcp_tap_conn *tcp_frame_conns[TCP_FRAMES_MEM];
> static unsigned int tcp_payload_used;
>
> /* recvmsg()/sendmsg() data for tap */
> -static struct iovec iov_sock [TCP_FRAMES_MEM + 1];
> +static struct iovec iov_sock [TCP_FRAMES_MEM + DISCARD_IOV_NUM];
>
> static struct iovec tcp_l2_iov[TCP_FRAMES_MEM][TCP_NUM_IOVS];
>
> @@ -326,15 +326,9 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
> iov_rem = (wnd_scaled - already_sent) % mss;
> }
>
> - /* Prepare iov according to kernel capability */
> - if (!peek_offset_cap) {
> - mh_sock.msg_iov = iov_sock;
> - iov_sock[0].iov_base = tcp_buf_discard;
> - iov_sock[0].iov_len = already_sent;
> - mh_sock.msg_iovlen = fill_bufs + 1;
> - } else {
> - mh_sock.msg_iov = &iov_sock[1];
> - mh_sock.msg_iovlen = fill_bufs;
> + if (tcp_prepare_iov(&mh_sock, iov_sock, already_sent, fill_bufs)) {
> + tcp_rst(c, conn);
> + return -1;
> }
>
> if (tcp_payload_used + fill_bufs > TCP_FRAMES_MEM) {
> @@ -344,12 +338,12 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
> tcp_payload_used = 0;
> }
>
> - for (i = 0, iov = iov_sock + 1; i < fill_bufs; i++, iov++) {
> + for (i = 0, iov = iov_sock + DISCARD_IOV_NUM; i < fill_bufs; i++, iov++) {
> iov->iov_base = &tcp_payload[tcp_payload_used + i].data;
> iov->iov_len = mss;
> }
> if (iov_rem)
> - iov_sock[fill_bufs].iov_len = iov_rem;
> + iov_sock[fill_bufs + DISCARD_IOV_NUM - 1].iov_len = iov_rem;
>
> /* Receive into buffers, don't dequeue until acknowledged by guest. */
> do
> diff --git a/tcp_internal.h b/tcp_internal.h
> index 9dae688..d0009f8 100644
> --- a/tcp_internal.h
> +++ b/tcp_internal.h
> @@ -9,6 +9,9 @@
> #define MAX_WS 8
> #define MAX_WINDOW (1 << (16 + (MAX_WS)))
>
> +#define BUF_DISCARD_SIZE (1 << 20)
^ ...here, after the ')'
(git log/show shows it in red).
--
Stefano
^ permalink raw reply [flat|nested] 2+ messages in thread
end of thread, other threads:[~2025-09-09 16:14 UTC | newest]
Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-09-08 11:04 [PATCH] Reduce tcp_buf_discard size xugu
2025-09-09 16:13 ` Stefano Brivio
Code repositories for project(s) associated with this public inbox
https://passt.top/passt
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for IMAP folder(s).