On Thu, Feb 01, 2024 at 05:12:11PM -0500, Jon Maloy wrote:
> The kernel may support recvmsg(MSG_PEEK), starting reading data from a
> given offset set by the SO_PEEK_OFF socket option. This makes it
> possible to avoid repeated reading of already read initial bytes of a
> received message, hence saving read cycles when forwarding TCP messages
> in the host->name space direction.
>
> In this commit, we add functionality to leverage this feature when available,
> while we fall back to the previous behavior when not.
>
> Measurements with iperf3 shows that throughput increases with 15-20 percent
> in the host->namespace direction when this feature is used.
>
> Signed-off-by: Jon Maloy
> ---
>  tcp.c | 35 +++++++++++++++++++++++++++++++++--
>  1 file changed, 33 insertions(+), 2 deletions(-)
>
> diff --git a/tcp.c b/tcp.c
> index 905d26f..58674eb 100644
> --- a/tcp.c
> +++ b/tcp.c
> @@ -505,6 +505,7 @@ static struct tcp_buf_seq_update tcp6_l2_buf_seq_update[TCP_FRAMES_MEM];
>  static unsigned int tcp6_l2_buf_used;
>
>  /* recvmsg()/sendmsg() data for tap */
> +static bool peek_offset_cap = false;
>  static char tcp_buf_discard	[MAX_WINDOW];
>  static struct iovec iov_sock	[TCP_FRAMES_MEM + 1];
>
> @@ -582,6 +583,14 @@ static_assert(ARRAY_SIZE(tc_hash) >= FLOW_MAX,
>  int init_sock_pool4	[TCP_SOCK_POOL_SIZE];
>  int init_sock_pool6	[TCP_SOCK_POOL_SIZE];
>

A function comment would be nice.

> +static void set_peek_offset(int s, int offset)
> +{
> +	if (!peek_offset_cap)
> +		return;
> +	if (setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &offset, sizeof(offset)))
> +		perror("Failed to set SO_PEEK_OFF\n");
> +}
> +
>  /**
>   * tcp_conn_epoll_events() - epoll events mask for given connection state
>   * @events:	Current connection events
> @@ -1951,7 +1960,7 @@ static void tcp_conn_from_tap(struct ctx *c,
>  		if (bind(s, (struct sockaddr *)&addr6_ll, sizeof(addr6_ll)))
>  			goto cancel;
>  	}
> -
> +	set_peek_offset(s, 0);
>  	conn = &flow->tcp;
>  	conn->f.type = FLOW_TCP;
>  	conn->sock = s;
> @@ -2174,6 +2183,12 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
>  	if (iov_rem)
>  		iov_sock[fill_bufs].iov_len = iov_rem;
>
> +	/* Don't use discard buffer if SO_PEEK_OFF is supported. */
> +	if (peek_offset_cap) {
> +		mh_sock.msg_iov = &iov_sock[1];
> +		mh_sock.msg_iovlen -= 1;
> +	}
> +
>  	/* Receive into buffers, don't dequeue until acknowledged by guest. */
>  	do
>  		len = recvmsg(s, &mh_sock, MSG_PEEK);
> @@ -2195,7 +2210,9 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
>  		return 0;
>  	}
>
> -	sendlen = len - already_sent;
> +	sendlen = len;
> +	if (!peek_offset_cap)
> +		sendlen -= already_sent;
>  	if (sendlen <= 0) {
>  		conn_flag(c, conn, STALLED);
>  		return 0;
> @@ -2367,6 +2384,7 @@ static int tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn,
>  		      max_ack_seq, conn->seq_to_tap);
>  		conn->seq_ack_from_tap = max_ack_seq;
>  		conn->seq_to_tap = max_ack_seq;
> +		set_peek_offset(conn->sock, 0);

I don't think a zero offset is entirely correct here. Instead I think it
should be (max_ack_seq - conn->seq_ack_from_tap). We're retransmitting
from the segment immediately after max_ack_seq, so we want that offset,
minus the offset of the "TRUNC" pointer in the stream, which is
conn->seq_ack_from_tap.

Now, usually that will be 0, because tcp_sock_consume() just above will
have advanced the TRUNC pointer up to max_ack_seq, so
tcp_update_seqack_from_tap() will have set conn->seq_ack_from_tap equal
to max_ack_seq. However, tcp_sock_consume() can fail, in which case
neither the TRUNC pointer nor conn->seq_ack_from_tap will be advanced.
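Something like this is what I have in mind (a sketch only, not tested):
take the offset against seq_ack_from_tap before it's overwritten, so it
stays correct in the case where tcp_sock_consume() failed:

		/* Peek offset is relative to the TRUNC pointer, i.e. the
		 * old seq_ack_from_tap, which may still lag behind
		 * max_ack_seq at this point */
		set_peek_offset(conn->sock, max_ack_seq - conn->seq_ack_from_tap);
		conn->seq_ack_from_tap = max_ack_seq;
		conn->seq_to_tap = max_ack_seq;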
>  		tcp_data_from_sock(c, conn);
>  	}
>
> @@ -2718,6 +2736,7 @@ void tcp_listen_handler(struct ctx *c, union epoll_ref ref,
>  	    tcp_splice_conn_from_sock(c, ref.tcp_listen, &flow->tcp_splice,
>  				      s, (struct sockaddr *)&sa))
>  		return;
> +	set_peek_offset(s, 0);

Since this is specific to "tap" connections, I think this would be
better moved into tcp_tap_conn_from_sock().

>  	tcp_tap_conn_from_sock(c, ref.tcp_listen, &flow->tcp, s,
>  			       (struct sockaddr *)&sa, now);
> @@ -3042,6 +3061,7 @@ static void tcp_sock_refill_init(const struct ctx *c)
>  int tcp_init(struct ctx *c)
>  {
>  	unsigned b;
> +	int s;
>
>  	for (b = 0; b < TCP_HASH_TABLE_SIZE; b++)
>  		tc_hash[b] = FLOW_SIDX_NONE;
>
> @@ -3065,6 +3085,17 @@ int tcp_init(struct ctx *c)
>  		NS_CALL(tcp_ns_socks_init, c);
>  	}
>
> +	/* Probe for SO_PEEK_OFF support */
> +	s = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
> +	if (s < 0) {
> +		perror("Temporary tcp socket creation failed\n");
> +	} else {
> +		if (!setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &(int){0}, sizeof(int))) {
> +			peek_offset_cap = true;
> +		}
> +		close(s);
> +	}
> +	printf("SO_PEEK_OFF%ssupported\n", peek_offset_cap ? " " : " not ");
>  	return 0;
>  }

I think you're also missing one call to set_peek_offset(). You've got a
call for the fast re-transmit case (duplicate ack), but not for the
"slow retransmit" case (ack timeout), which is in tcp_timer_handler().

*thinks* The peek pointer basically always needs to be kept in sync with
conn->seq_to_tap. When seq_to_tap is advanced because we've peeked more
data, the kernel should automatically update the peek pointer to match.
But any other place we adjust seq_to_tap probably needs a setsockopt().
That looks to be the "slow" retransmit case in tcp_timer_handler(), but
also the "ack sequence gap" bit at the top of tcp_data_from_sock(). I'm
not really sure what that case is about, but I think we need to adjust
SO_PEEK_OFF there as well.

I wonder if it would be safe to have a helper function that adjusts both
seq_to_tap and SO_PEEK_OFF (a rough sketch of what I mean is at the end
of this mail).

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson
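Rough sketch of the kind of helper I mean (hypothetical name, not
tested; it assumes the set_peek_offset() and peek_offset_cap from this
patch, and that the peek offset is always counted from
conn->seq_ack_from_tap):

/* tcp_set_seq_to_tap() - set seq_to_tap and keep SO_PEEK_OFF in sync with it
 * @conn:	Connection pointer
 * @seq:	Sequence number we will transmit from next
 */
static void tcp_set_seq_to_tap(struct tcp_tap_conn *conn, uint32_t seq)
{
	conn->seq_to_tap = seq;
	/* Offset of the next byte to peek, relative to the start of the
	 * not-yet-consumed data in the socket (conn->seq_ack_from_tap) */
	set_peek_offset(conn->sock, seq - conn->seq_ack_from_tap);
}

The fast re-transmit branch, the timer-based re-transmit in
tcp_timer_handler() and the "ack sequence gap" case at the top of
tcp_data_from_sock() could then all call this instead of assigning
seq_to_tap directly.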