On Wed, Jul 09, 2025 at 07:47:46PM +0200, Eugenio Pérez wrote:
> The vhost-kernel module is async by nature: the driver (pasta) places a
> few buffers in the virtqueue and the device (vhost-kernel) trust the

s/trust/trusts/

> driver will not modify them until it uses them. To implement it is not
> possible with TCP at the moment, as tcp_buf trust it can reuse the
> buffers as soon as tcp_payload_flush() finish.
>
> To achieve async let's make tcp_buf work with a circular ring, so vhost
> can transmit at the same time pasta is queing more data. When a buffer
> is received from a TCP socket, the element is placed in the ring and
> sock_head is moved:
> [][][][]
>  ^  ^
>  |  |
>  |  sock_head
>  |
>  tail
>  tap_head
>
> When the data is sent to vhost through the tx queue, tap_head is moved
> forward:
> [][][][]
>  ^  ^
>  |  |
>  |  sock_head
>  |  tap_head
>  |
>  tail
>
> Finally, the tail move forward when vhost has used the tx buffers, so
> tcp_payload (and all lower protocol buffers) can be reused.
> [][][][]
>     ^
>     |
>     sock_head
>     tap_head
>     tail

This all sounds good.  I wonder if it might be clearer to do this
circular queue conversion as a separate patch series.  I think it makes
sense even without the context of vhost (it's closer to how most network
things work).

> In the case of error queueing to the vhost virtqueue, sock_head moves
> backwards. The only possible error is that the queue is full, as

sock_head moves backwards?  Or tap_head moves backwards?

> virtio-net does not report success on packet sending.
>
> Starting as simple as possible, and only implementing the count
> variables in this patch so it keeps working as previously. The circular
> behavior will be added on top.
>
> From ~16BGbit/s to ~13Gbit/s compared with write(2) to the tap.

I don't really understand what you're comparing here.
> Signed-off-by: Eugenio Pérez
> ---
>  tcp_buf.c | 63 +++++++++++++++++++++++++++++++++++--------------------
>  1 file changed, 40 insertions(+), 23 deletions(-)
>
> diff --git a/tcp_buf.c b/tcp_buf.c
> index 242086d..0437120 100644
> --- a/tcp_buf.c
> +++ b/tcp_buf.c
> @@ -53,7 +53,12 @@ static_assert(MSS6 <= sizeof(tcp_payload[0].data), "MSS6 is greater than 65516")
>
>  /* References tracking the owner connection of frames in the tap outqueue */
>  static struct tcp_tap_conn *tcp_frame_conns[TCP_FRAMES_MEM];
> -static unsigned int tcp_payload_used;
> +static unsigned int tcp_payload_sock_used, tcp_payload_tap_used;

I think the "payload" here is a hangover from when we had separate
queues for flags-only and data-containing packets.  We can probably drop
it and make a bunch of names shorter.

> +static void tcp_payload_sock_produce(size_t n)
> +{
> +	tcp_payload_sock_used += n;
> +}
>
>  static struct iovec tcp_l2_iov[TCP_FRAMES_MEM][TCP_NUM_IOVS];
>
> @@ -132,6 +137,16 @@ static void tcp_revert_seq(const struct ctx *c, struct tcp_tap_conn **conns,
>  	}
>  }
>
> +static void tcp_buf_free_old_tap_xmit(void)
> +{
> +	while (tcp_payload_tap_used) {
> +		tap_free_old_xmit(tcp_payload_tap_used);
> +
> +		tcp_payload_tap_used = 0;
> +		tcp_payload_sock_used = 0;
> +	}
> +}
> +
>  /**
>   * tcp_payload_flush() - Send out buffers for segments with data or flags
>   * @c:		Execution context
> @@ -141,12 +156,13 @@ void tcp_payload_flush(const struct ctx *c)
>  	size_t m;
>
>  	m = tap_send_frames(c, &tcp_l2_iov[0][0], TCP_NUM_IOVS,
> -			    tcp_payload_used, false);
> -	if (m != tcp_payload_used) {
> +			    tcp_payload_sock_used, true);
> +	if (m != tcp_payload_sock_used) {
>  		tcp_revert_seq(c, &tcp_frame_conns[m], &tcp_l2_iov[m],
> -			       tcp_payload_used - m);
> +			       tcp_payload_sock_used - m);
>  	}
> -	tcp_payload_used = 0;
> +	tcp_payload_tap_used += m;
> +	tcp_buf_free_old_tap_xmit();
>  }
>
>  /**
> @@ -195,12 +211,12 @@ int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
>  	uint32_t seq;
>  	int ret;
>
> -	iov = tcp_l2_iov[tcp_payload_used];
> +	iov = tcp_l2_iov[tcp_payload_sock_used];
>  	if (CONN_V4(conn)) {
> -		iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_payload_ip[tcp_payload_used]);
> +		iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_payload_ip[tcp_payload_sock_used]);
>  		iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src;
>  	} else {
> -		iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_payload_ip[tcp_payload_used]);
> +		iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_payload_ip[tcp_payload_sock_used]);
>  		iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src;
>  	}
>
> @@ -211,13 +227,14 @@ int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
>  	if (ret <= 0)
>  		return ret;
>
> -	tcp_payload_used++;
> +	tcp_payload_sock_produce(1);
>  	l4len = optlen + sizeof(struct tcphdr);
>  	iov[TCP_IOV_PAYLOAD].iov_len = l4len;
>  	tcp_l2_buf_fill_headers(conn, iov, NULL, seq, false);
>
>  	if (flags & DUP_ACK) {
> -		struct iovec *dup_iov = tcp_l2_iov[tcp_payload_used++];
> +		struct iovec *dup_iov = tcp_l2_iov[tcp_payload_sock_used];
> +		tcp_payload_sock_produce(1);
>
>  		memcpy(dup_iov[TCP_IOV_TAP].iov_base, iov[TCP_IOV_TAP].iov_base,
>  		       iov[TCP_IOV_TAP].iov_len);
> @@ -228,8 +245,9 @@ int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
>  		dup_iov[TCP_IOV_PAYLOAD].iov_len = l4len;
>  	}
>
> -	if (tcp_payload_used > TCP_FRAMES_MEM - 2)
> +	if (tcp_payload_sock_used > TCP_FRAMES_MEM - 2) {
>  		tcp_payload_flush(c);
> +	}

No { } here in passt style.
>
>  	return 0;
>  }
> @@ -251,19 +269,19 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
>  	struct iovec *iov;
>
>  	conn->seq_to_tap = seq + dlen;
> -	tcp_frame_conns[tcp_payload_used] = conn;
> -	iov = tcp_l2_iov[tcp_payload_used];
> +	tcp_frame_conns[tcp_payload_sock_used] = conn;
> +	iov = tcp_l2_iov[tcp_payload_sock_used];
>  	if (CONN_V4(conn)) {
>  		if (no_csum) {
> -			struct iovec *iov_prev = tcp_l2_iov[tcp_payload_used - 1];
> +			struct iovec *iov_prev = tcp_l2_iov[tcp_payload_sock_used - 1];
>  			struct iphdr *iph = iov_prev[TCP_IOV_IP].iov_base;
>
>  			check = &iph->check;
>  		}
> -		iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_payload_ip[tcp_payload_used]);
> +		iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_payload_ip[tcp_payload_sock_used]);
>  		iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src;
>  	} else if (CONN_V6(conn)) {
> -		iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_payload_ip[tcp_payload_used]);
> +		iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_payload_ip[tcp_payload_sock_used]);
>  		iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src;
>  	}
>  	payload = iov[TCP_IOV_PAYLOAD].iov_base;
> @@ -274,8 +292,10 @@
>  	payload->th.psh = push;
>  	iov[TCP_IOV_PAYLOAD].iov_len = dlen + sizeof(struct tcphdr);
>  	tcp_l2_buf_fill_headers(conn, iov, check, seq, false);
> -	if (++tcp_payload_used > TCP_FRAMES_MEM - 1)
> +	tcp_payload_sock_produce(1);
> +	if (tcp_payload_sock_used > TCP_FRAMES_MEM - 1) {
>  		tcp_payload_flush(c);
> +	}
>  }
>
>  /**
> @@ -341,15 +361,12 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
>  		mh_sock.msg_iovlen = fill_bufs;
>  	}
>
> -	if (tcp_payload_used + fill_bufs > TCP_FRAMES_MEM) {
> +	if (tcp_payload_sock_used + fill_bufs > TCP_FRAMES_MEM) {
>  		tcp_payload_flush(c);
> -
> -		/* Silence Coverity CWE-125 false positive */
> -		tcp_payload_used = 0;
>  	}
>
>  	for (i = 0, iov = iov_sock + 1; i < fill_bufs; i++, iov++) {
> -		iov->iov_base = &tcp_payload[tcp_payload_used + i].data;
> +
iov->iov_base = &tcp_payload[tcp_payload_sock_used + i].data;
>  		iov->iov_len = mss;
>  	}
>  	if (iov_rem)
> @@ -407,7 +424,7 @@ int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
>  		dlen = mss;
>  	seq = conn->seq_to_tap;
>  	for (i = 0; i < send_bufs; i++) {
> -		int no_csum = i && i != send_bufs - 1 && tcp_payload_used;
> +		int no_csum = i && i != send_bufs - 1 && tcp_payload_sock_used;
>  		bool push = false;
>
>  		if (i == send_bufs - 1) {

-- 
David Gibson (he or they)	| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you, not the other way
				| around.
http://www.ozlabs.org/~dgibson