From: "Eugenio Pérez" <eperezma@redhat.com>
To: passt-dev@passt.top
Cc: jasowang@redhat.com
Subject: [RFC v2 09/11] tcp: start conversion to circular buffer
Date: Wed, 9 Jul 2025 19:47:46 +0200
Message-ID: <20250709174748.3514693-10-eperezma@redhat.com>
In-Reply-To: <20250709174748.3514693-1-eperezma@redhat.com>

The vhost-kernel module is async by nature: the driver (pasta) places a
few buffers in the virtqueue and the device (vhost-kernel) trusts that
the driver will not modify them until it has used them. This is not
possible to implement with TCP at the moment, as tcp_buf trusts it can
reuse the buffers as soon as tcp_payload_flush() finishes.
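
Schematically, the ownership rule that the async model imposes looks
like this; the helpers below are illustrative stubs, not the actual
vhost or pasta API:

#include <stdbool.h>

struct buf;				/* opaque frame buffer */

/* Illustrative stubs standing in for the tx virtqueue operations */
void vq_add(struct buf *b);		/* hand b over to the device */
bool vq_used(const struct buf *b);	/* has the device consumed b? */

static void tx_one(struct buf *b)
{
	vq_add(b);		/* from here on, the device owns b */

	while (!vq_used(b))
		;		/* b must not be modified meanwhile */

	/* only now may the driver refill or reuse b */
}

tcp_buf currently breaks this rule by refilling the buffers right after
tcp_payload_flush() returns, hence the ring introduced here.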

To achieve this asynchrony, let's make tcp_buf work as a circular ring,
so vhost can transmit while pasta queues more data. When a buffer is
received from a TCP socket, the element is placed in the ring and
sock_head moves forward:

[][][][]
^   ^
|   |
|   sock_head
|
tail
tap_head

When the data is sent to vhost through the tx queue, tap_head moves
forward:

[][][][]
^   ^
|   |
|   sock_head
|   tap_head
|
tail

Finally, tail moves forward once vhost has used the tx buffers, so
tcp_payload (and all lower protocol buffers) can be reused.

[][][][]
    ^
    |
    sock_head
    tap_head
    tail
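
As a standalone sketch, assuming free-running indices and a
power-of-two ring size (neither of which this patch implements yet),
the three indices relate like this:

#define RING_SLOTS 4			/* must be a power of two */

static unsigned int sock_head;		/* filled from TCP sockets */
static unsigned int tap_head;		/* queued to the tx virtqueue */
static unsigned int tail;		/* oldest frame vhost still owns */

/* Frames filled from sockets but not yet queued to vhost */
static unsigned int to_queue(void)
{
	return sock_head - tap_head;
}

/* Frames queued to vhost and not yet used by it */
static unsigned int in_flight(void)
{
	return tap_head - tail;
}

/* Slots free for new socket data */
static unsigned int free_slots(void)
{
	return RING_SLOTS - (sock_head - tail);
}

/* Map a free-running index to a ring slot */
static unsigned int slot(unsigned int idx)
{
	return idx & (RING_SLOTS - 1);
}

Unsigned subtraction keeps the distances correct even after the
counters wrap.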

In case of an error while queueing to the vhost virtqueue, sock_head
moves backwards. The only possible error is that the queue is full, as
virtio-net does not report success or failure of individual packet
transmissions.

This patch starts as simply as possible, introducing only the count
variables so that everything keeps working as before. The circular
behavior will be added on top.
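
With only the counters in place, the flush path added here reduces to
the following schematic; send_frames_to_vhost(), revert_unsent() and
free_old_xmit() are simplified stand-ins for tap_send_frames(),
tcp_revert_seq() and tcp_buf_free_old_tap_xmit():

unsigned int send_frames_to_vhost(unsigned int n);	/* stand-in */
void revert_unsent(unsigned int sent);			/* stand-in */
void free_old_xmit(void);				/* stand-in */

static unsigned int sock_used;		/* frames built from sockets */
static unsigned int tap_used;		/* frames handed to vhost */

static void flush_cycle(void)
{
	unsigned int m = send_frames_to_vhost(sock_used);

	if (m != sock_used)	/* queue full: roll back sequences */
		revert_unsent(m);

	tap_used += m;

	/* Waits for vhost, then resets both counters so the buffers
	 * can be reused; the circular version will move tail instead.
	 */
	free_old_xmit();
}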

From ~16 Gbit/s to ~13 Gbit/s, compared with write(2) to the tap.

Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
---
tcp_buf.c | 63 +++++++++++++++++++++++++++++++++++--------------------
1 file changed, 40 insertions(+), 23 deletions(-)
diff --git a/tcp_buf.c b/tcp_buf.c
index 242086d..0437120 100644
--- a/tcp_buf.c
+++ b/tcp_buf.c
@@ -53,7 +53,12 @@ static_assert(MSS6 <= sizeof(tcp_payload[0].data), "MSS6 is greater than 65516")
/* References tracking the owner connection of frames in the tap outqueue */
static struct tcp_tap_conn *tcp_frame_conns[TCP_FRAMES_MEM];
-static unsigned int tcp_payload_used;
+static unsigned int tcp_payload_sock_used, tcp_payload_tap_used;
+
+static void tcp_payload_sock_produce(size_t n)
+{
+ tcp_payload_sock_used += n;
+}
static struct iovec tcp_l2_iov[TCP_FRAMES_MEM][TCP_NUM_IOVS];
@@ -132,6 +137,16 @@ static void tcp_revert_seq(const struct ctx *c, struct tcp_tap_conn **conns,
}
}
+static void tcp_buf_free_old_tap_xmit(void)
+{
+ while (tcp_payload_tap_used) {
+ tap_free_old_xmit(tcp_payload_tap_used);
+
+ tcp_payload_tap_used = 0;
+ tcp_payload_sock_used = 0;
+ }
+}
+
/**
* tcp_payload_flush() - Send out buffers for segments with data or flags
* @c: Execution context
@@ -141,12 +156,13 @@ void tcp_payload_flush(const struct ctx *c)
size_t m;
m = tap_send_frames(c, &tcp_l2_iov[0][0], TCP_NUM_IOVS,
- tcp_payload_used, false);
- if (m != tcp_payload_used) {
+ tcp_payload_sock_used, true);
+ if (m != tcp_payload_sock_used) {
tcp_revert_seq(c, &tcp_frame_conns[m], &tcp_l2_iov[m],
- tcp_payload_used - m);
+ tcp_payload_sock_used - m);
}
- tcp_payload_used = 0;
+ tcp_payload_tap_used += m;
+ tcp_buf_free_old_tap_xmit();
}
/**
@@ -195,12 +211,12 @@ int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
uint32_t seq;
int ret;
- iov = tcp_l2_iov[tcp_payload_used];
+ iov = tcp_l2_iov[tcp_payload_sock_used];
if (CONN_V4(conn)) {
- iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_payload_ip[tcp_payload_used]);
+ iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_payload_ip[tcp_payload_sock_used]);
iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src;
} else {
- iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_payload_ip[tcp_payload_used]);
+ iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_payload_ip[tcp_payload_sock_used]);
iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src;
}
@@ -211,13 +227,14 @@ int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
if (ret <= 0)
return ret;
- tcp_payload_used++;
+ tcp_payload_sock_produce(1);
l4len = optlen + sizeof(struct tcphdr);
iov[TCP_IOV_PAYLOAD].iov_len = l4len;
tcp_l2_buf_fill_headers(conn, iov, NULL, seq, false);
if (flags & DUP_ACK) {
- struct iovec *dup_iov = tcp_l2_iov[tcp_payload_used++];
+ struct iovec *dup_iov = tcp_l2_iov[tcp_payload_sock_used];
+ tcp_payload_sock_produce(1);
memcpy(dup_iov[TCP_IOV_TAP].iov_base, iov[TCP_IOV_TAP].iov_base,
iov[TCP_IOV_TAP].iov_len);
@@ -228,8 +245,9 @@ int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
dup_iov[TCP_IOV_PAYLOAD].iov_len = l4len;
}
- if (tcp_payload_used > TCP_FRAMES_MEM - 2)
+ if (tcp_payload_sock_used > TCP_FRAMES_MEM - 2) {
tcp_payload_flush(c);
+ }
return 0;
}
@@ -251,19 +269,19 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
struct iovec *iov;
conn->seq_to_tap = seq + dlen;
- tcp_frame_conns[tcp_payload_used] = conn;
- iov = tcp_l2_iov[tcp_payload_used];
+ tcp_frame_conns[tcp_payload_sock_used] = conn;
+ iov = tcp_l2_iov[tcp_payload_sock_used];
if (CONN_V4(conn)) {
if (no_csum) {
- struct iovec *iov_prev = tcp_l2_iov[tcp_payload_used - 1];
+ struct iovec *iov_prev = tcp_l2_iov[tcp_payload_sock_used - 1];
struct iphdr *iph = iov_prev[TCP_IOV_IP].iov_base;
check = &iph->check;
}
- iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_payload_ip[tcp_payload_used]);
+ iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_payload_ip[tcp_payload_sock_used]);
iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src;
} else if (CONN_V6(conn)) {
- iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_payload_ip[tcp_payload_used]);
+ iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_payload_ip[tcp_payload_sock_used]);
iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src;
}
payload = iov[TCP_IOV_PAYLOAD].iov_base;
@@ -274,8 +292,10 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
payload->th.psh = push;
iov[TCP_IOV_PAYLOAD].iov_len = dlen + sizeof(struct tcphdr);
tcp_l2_buf_fill_headers(conn, iov, check, seq, false);
- if (++tcp_payload_used > TCP_FRAMES_MEM - 1)
+ tcp_payload_sock_produce(1);
+ if (tcp_payload_sock_used > TCP_FRAMES_MEM - 1) {
tcp_payload_flush(c);
+ }
}
/**
@@ -341,15 +361,12 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
mh_sock.msg_iovlen = fill_bufs;
}
- if (tcp_payload_used + fill_bufs > TCP_FRAMES_MEM) {
+ if (tcp_payload_sock_used + fill_bufs > TCP_FRAMES_MEM) {
tcp_payload_flush(c);
-
- /* Silence Coverity CWE-125 false positive */
- tcp_payload_used = 0;
}
for (i = 0, iov = iov_sock + 1; i < fill_bufs; i++, iov++) {
- iov->iov_base = &tcp_payload[tcp_payload_used + i].data;
+ iov->iov_base = &tcp_payload[tcp_payload_sock_used + i].data;
iov->iov_len = mss;
}
if (iov_rem)
@@ -407,7 +424,7 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
dlen = mss;
seq = conn->seq_to_tap;
for (i = 0; i < send_bufs; i++) {
- int no_csum = i && i != send_bufs - 1 && tcp_payload_used;
+ int no_csum = i && i != send_bufs - 1 && tcp_payload_sock_used;
bool push = false;
if (i == send_bufs - 1) {
--
2.50.0