On Fri, Sep 29, 2023 at 05:04:46PM +0200, Stefano Brivio wrote: > ...so that we'll retry sending them, instead of more-or-less silently > dropping them. This happens quite frequently if our sending buffer on > the UNIX domain socket is heavily constrained (for instance, by the > 208 KiB default memory limit). > > It might be argued that dropping frames is part of the expected TCP > flow: we don't dequeue those from the socket anyway, so we'll > eventually retransmit them. > > But we don't need the receiver to tell us (by the way of duplicate or > missing ACKs) that we couldn't send them: we already know as > sendmsg() reports that. This seems to considerably increase > throughput stability and throughput itself for TCP connections with > default wmem_max values. > > Unfortunately, the 16 bits left as padding in the frame descriptors > we use internally aren't enough to uniquely identify for which > connection we should update sequence numbers: create a parallel > array of pointers to sequence numbers and L4 lengths, of > TCP_FRAMES_MEM size, and go through it after calling sendmsg(). > > Signed-off-by: Stefano Brivio Reviewed-by: David Gibson > --- > tap.c | 10 +++++++--- > tap.h | 2 +- > tcp.c | 40 ++++++++++++++++++++++++++++++++++------ > 3 files changed, 42 insertions(+), 10 deletions(-) > > diff --git a/tap.c b/tap.c > index 93db989..b30ff81 100644 > --- a/tap.c > +++ b/tap.c > @@ -413,13 +413,15 @@ static size_t tap_send_frames_passt(const struct ctx *c, > * @c: Execution context > * @iov: Array of buffers, each containing one frame (with L2 headers) > * @n: Number of buffers/frames in @iov > + * > + * Return: number of frames actually sent > */ > -void tap_send_frames(struct ctx *c, const struct iovec *iov, size_t n) > +size_t tap_send_frames(struct ctx *c, const struct iovec *iov, size_t n) > { > size_t m; > > if (!n) > - return; > + return 0; > > if (c->mode == MODE_PASST) > m = tap_send_frames_passt(c, iov, n); > @@ -427,9 +429,11 @@ void tap_send_frames(struct ctx *c, const struct iovec *iov, size_t n) > m = tap_send_frames_pasta(c, iov, n); > > if (m < n) > - debug("tap: dropped %lu frames of %lu due to short send", n - m, n); > + debug("tap: failed to send %lu frames of %lu", n - m, n); > > pcap_multiple(iov, m, c->mode == MODE_PASST ? sizeof(uint32_t) : 0); > + > + return m; > } > > /** > diff --git a/tap.h b/tap.h > index 021fb7c..952fafc 100644 > --- a/tap.h > +++ b/tap.h > @@ -73,7 +73,7 @@ void tap_icmp6_send(const struct ctx *c, > const struct in6_addr *src, const struct in6_addr *dst, > void *in, size_t len); > int tap_send(const struct ctx *c, const void *data, size_t len); > -void tap_send_frames(struct ctx *c, const struct iovec *iov, size_t n); > +size_t tap_send_frames(struct ctx *c, const struct iovec *iov, size_t n); > void tap_update_mac(struct tap_hdr *taph, > const unsigned char *eth_d, const unsigned char *eth_s); > void tap_listen_handler(struct ctx *c, uint32_t events); > diff --git a/tcp.c b/tcp.c > index 32917c8..3313d72 100644 > --- a/tcp.c > +++ b/tcp.c > @@ -434,6 +434,16 @@ static int tcp_sock_ns [NUM_PORTS][IP_VERSIONS]; > */ > static union inany_addr low_rtt_dst[LOW_RTT_TABLE_SIZE]; > > +/** > + * tcp_buf_seq_update - Sequences to update with length of frames once sent > + * @seq: Pointer to sequence number sent to tap-side, to be updated > + * @len: TCP payload length > + */ > +struct tcp_buf_seq_update { > + uint32_t *seq; > + uint16_t len; > +}; > + > /* Static buffers */ > > /** > @@ -462,6 +472,8 @@ static struct tcp4_l2_buf_t { > #endif > tcp4_l2_buf[TCP_FRAMES_MEM]; > > +static struct tcp_buf_seq_update tcp4_l2_buf_seq_update[TCP_FRAMES_MEM]; > + > static unsigned int tcp4_l2_buf_used; > > /** > @@ -490,6 +502,8 @@ struct tcp6_l2_buf_t { > #endif > tcp6_l2_buf[TCP_FRAMES_MEM]; > > +static struct tcp_buf_seq_update tcp6_l2_buf_seq_update[TCP_FRAMES_MEM]; > + > static unsigned int tcp6_l2_buf_used; > > /* recvmsg()/sendmsg() data for tap */ > @@ -1369,10 +1383,17 @@ static void tcp_l2_flags_buf_flush(struct ctx *c) > */ > static void tcp_l2_data_buf_flush(struct ctx *c) > { > - tap_send_frames(c, tcp6_l2_iov, tcp6_l2_buf_used); > + unsigned i; > + size_t m; > + > + m = tap_send_frames(c, tcp6_l2_iov, tcp6_l2_buf_used); > + for (i = 0; i < m; i++) > + *tcp6_l2_buf_seq_update[i].seq += tcp6_l2_buf_seq_update[i].len; > tcp6_l2_buf_used = 0; > > - tap_send_frames(c, tcp4_l2_iov, tcp4_l2_buf_used); > + m = tap_send_frames(c, tcp4_l2_iov, tcp4_l2_buf_used); > + for (i = 0; i < m; i++) > + *tcp4_l2_buf_seq_update[i].seq += tcp4_l2_buf_seq_update[i].len; > tcp4_l2_buf_used = 0; > } > > @@ -2149,17 +2170,20 @@ static int tcp_sock_consume(struct tcp_tap_conn *conn, uint32_t ack_seq) > * @plen: Payload length at L4 > * @no_csum: Don't compute IPv4 checksum, use the one from previous buffer > * @seq: Sequence number to be sent > - * @now: Current timestamp > */ > static void tcp_data_to_tap(struct ctx *c, struct tcp_tap_conn *conn, > ssize_t plen, int no_csum, uint32_t seq) > { > + uint32_t *seq_update = &conn->seq_to_tap; > struct iovec *iov; > > if (CONN_V4(conn)) { > struct tcp4_l2_buf_t *b = &tcp4_l2_buf[tcp4_l2_buf_used]; > uint16_t *check = no_csum ? &(b - 1)->iph.check : NULL; > > + tcp4_l2_buf_seq_update[tcp4_l2_buf_used].seq = seq_update; > + tcp4_l2_buf_seq_update[tcp4_l2_buf_used].len = plen; > + > iov = tcp4_l2_iov + tcp4_l2_buf_used++; > iov->iov_len = tcp_l2_buf_fill_headers(c, conn, b, plen, > check, seq); > @@ -2168,6 +2192,9 @@ static void tcp_data_to_tap(struct ctx *c, struct tcp_tap_conn *conn, > } else if (CONN_V6(conn)) { > struct tcp6_l2_buf_t *b = &tcp6_l2_buf[tcp6_l2_buf_used]; > > + tcp6_l2_buf_seq_update[tcp6_l2_buf_used].seq = seq_update; > + tcp6_l2_buf_seq_update[tcp6_l2_buf_used].len = plen; > + > iov = tcp6_l2_iov + tcp6_l2_buf_used++; > iov->iov_len = tcp_l2_buf_fill_headers(c, conn, b, plen, > NULL, seq); > @@ -2193,7 +2220,7 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) > int s = conn->sock, i, ret = 0; > struct msghdr mh_sock = { 0 }; > uint16_t mss = MSS_GET(conn); > - uint32_t already_sent; > + uint32_t already_sent, seq; > struct iovec *iov; > > already_sent = conn->seq_to_tap - conn->seq_ack_from_tap; > @@ -2282,14 +2309,15 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) > > /* Finally, queue to tap */ > plen = mss; > + seq = conn->seq_to_tap; > for (i = 0; i < send_bufs; i++) { > int no_csum = i && i != send_bufs - 1 && tcp4_l2_buf_used; > > if (i == send_bufs - 1) > plen = last_len; > > - tcp_data_to_tap(c, conn, plen, no_csum, conn->seq_to_tap); > - conn->seq_to_tap += plen; > + tcp_data_to_tap(c, conn, plen, no_csum, seq); > + seq += plen; > } > > conn_flag(c, conn, ACK_FROM_TAP_DUE); -- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson