On Fri, May 17, 2024 at 11:24:13AM -0400, Jon Maloy wrote: > From linux-6.9.0 the kernel will contain > commit 05ea491641d3 ("tcp: add support for SO_PEEK_OFF socket option"). > > This new feature makes it possible to call recvmsg(MSG_PEEK) and make > it start reading data from a given offset set by the SO_PEEK_OFF socket > option. This way, we can avoid repeated reading of already read bytes of > a received message, hence saving read cycles when forwarding TCP > messages in the host->namespace direction. > > In this commit, we add functionality to leverage this feature when > available, while we fall back to the previous behavior when not. > > Measurements with iperf3 show that throughput increases by 15-20 > percent in the host->namespace direction when this feature is used. > > Signed-off-by: Jon Maloy Reviewed-by: David Gibson > > --- > v2: - Some smaller changes as suggested by David Gibson and Stefano Brivio. > - Moved initial set_peek_offset(0) to only the locations where the socket is set > to ESTABLISHED. > - Removed the per-packet synchronization between sk_peek_off and > already_sent. Instead, it is only done in retransmit situations. > - The problem I found when troubleshooting the occasionally occurring > out-of-sync values between 'already_sent' and 'sk_peek_off' may > have deeper implications that we may need to investigate. > > v3: - Rebased to most recent version of tcp.c, plus the previous > patch in this series. > - Some changes based on feedback from PASST team > > v4: - Some small changes based on feedback from Stefano/David. > > v5: - Re-added accidentally dropped set_peek_offset() line. > Thank you, David. 
> --- > tcp.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++-------- > 1 file changed, 51 insertions(+), 8 deletions(-) > > diff --git a/tcp.c b/tcp.c > index 3a2350a..fa13292 100644 > --- a/tcp.c > +++ b/tcp.c > @@ -511,6 +511,9 @@ static struct iovec tcp6_l2_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS]; > static struct iovec tcp4_l2_flags_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS]; > static struct iovec tcp6_l2_flags_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS]; > > +/* Does the kernel support TCP_PEEK_OFF? */ > +static bool peek_offset_cap; > + > /* sendmsg() to socket */ > static struct iovec tcp_iov [UIO_MAXIOV]; > > @@ -526,6 +529,20 @@ static_assert(ARRAY_SIZE(tc_hash) >= FLOW_MAX, > int init_sock_pool4 [TCP_SOCK_POOL_SIZE]; > int init_sock_pool6 [TCP_SOCK_POOL_SIZE]; > > +/** > + * tcp_set_peek_offset() - Set SO_PEEK_OFF offset on a socket if supported > + * @s: Socket to update > + * @offset: Offset in bytes > + */ > +static void tcp_set_peek_offset(int s, int offset) > +{ > + if (!peek_offset_cap) > + return; > + > + if (setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &offset, sizeof(offset))) > + err("Failed to set SO_PEEK_OFF to %i in socket %i", offset, s); > +} > + > /** > * tcp_conn_epoll_events() - epoll events mask for given connection state > * @events: Current connection events > @@ -1273,6 +1290,7 @@ static void tcp_revert_seq(struct tcp_tap_conn **conns, struct iovec (*frames)[T > continue; > > conn->seq_to_tap = seq; > + tcp_set_peek_offset(conn->sock, seq - conn->seq_ack_from_tap); > } > } > > @@ -2199,14 +2217,15 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) > uint32_t already_sent, seq; > struct iovec *iov; > > + /* How much have we read/sent since last received ack ? */ > already_sent = conn->seq_to_tap - conn->seq_ack_from_tap; > - > if (SEQ_LT(already_sent, 0)) { > /* RFC 761, section 2.1. 
*/ > flow_trace(conn, "ACK sequence gap: ACK for %u, sent: %u", > conn->seq_ack_from_tap, conn->seq_to_tap); > conn->seq_to_tap = conn->seq_ack_from_tap; > already_sent = 0; > + tcp_set_peek_offset(s, 0); > } > > if (!wnd_scaled || already_sent >= wnd_scaled) { > @@ -2224,11 +2243,16 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) > iov_rem = (wnd_scaled - already_sent) % mss; > } > > - mh_sock.msg_iov = iov_sock; > - mh_sock.msg_iovlen = fill_bufs + 1; > - > - iov_sock[0].iov_base = tcp_buf_discard; > - iov_sock[0].iov_len = already_sent; > + /* Prepare iov according to kernel capability */ > + if (!peek_offset_cap) { > + mh_sock.msg_iov = iov_sock; > + iov_sock[0].iov_base = tcp_buf_discard; > + iov_sock[0].iov_len = already_sent; > + mh_sock.msg_iovlen = fill_bufs + 1; > + } else { > + mh_sock.msg_iov = &iov_sock[1]; > + mh_sock.msg_iovlen = fill_bufs; > + } > > if (( v4 && tcp4_payload_used + fill_bufs > TCP_FRAMES_MEM) || > (!v4 && tcp6_payload_used + fill_bufs > TCP_FRAMES_MEM)) { > @@ -2269,7 +2293,10 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) > return 0; > } > > - sendlen = len - already_sent; > + sendlen = len; > + if (!peek_offset_cap) > + sendlen -= already_sent; > + > if (sendlen <= 0) { > conn_flag(c, conn, STALLED); > return 0; > @@ -2440,6 +2467,7 @@ static int tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn, > "fast re-transmit, ACK: %u, previous sequence: %u", > max_ack_seq, conn->seq_to_tap); > conn->seq_to_tap = max_ack_seq; > + tcp_set_peek_offset(conn->sock, 0); > tcp_data_from_sock(c, conn); > } > > @@ -2532,6 +2560,7 @@ static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_tap_conn *conn, > conn->seq_ack_to_tap = conn->seq_from_tap; > > conn_event(c, conn, ESTABLISHED); > + tcp_set_peek_offset(conn->sock, 0); > > /* The client might have sent data already, which we didn't > * dequeue waiting for SYN,ACK from tap -- check now. 
> @@ -2612,6 +2641,7 @@ int tcp_tap_handler(struct ctx *c, uint8_t pif, sa_family_t af, > goto reset; > > conn_event(c, conn, ESTABLISHED); > + tcp_set_peek_offset(conn->sock, 0); > > if (th->fin) { > conn->seq_from_tap++; > @@ -2865,6 +2895,7 @@ void tcp_timer_handler(struct ctx *c, union epoll_ref ref) > flow_dbg(conn, "ACK timeout, retry"); > conn->retrans++; > conn->seq_to_tap = conn->seq_ack_from_tap; > + tcp_set_peek_offset(conn->sock, 0); > tcp_data_from_sock(c, conn); > tcp_timer_ctl(c, conn); > } > @@ -3156,7 +3187,8 @@ static void tcp_sock_refill_init(const struct ctx *c) > */ > int tcp_init(struct ctx *c) > { > - unsigned b; > + unsigned int b, optv = 0; > + int s; > > for (b = 0; b < TCP_HASH_TABLE_SIZE; b++) > tc_hash[b] = FLOW_SIDX_NONE; > @@ -3180,6 +3212,17 @@ int tcp_init(struct ctx *c) > NS_CALL(tcp_ns_socks_init, c); > } > > + /* Probe for SO_PEEK_OFF support */ > + s = socket(AF_INET, SOCK_STREAM | SOCK_CLOEXEC, IPPROTO_TCP); > + if (s < 0) { > + warn("Temporary TCP socket creation failed"); > + } else { > + if (!setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &optv, sizeof(int))) > + peek_offset_cap = true; > + close(s); > + } > + info("SO_PEEK_OFF%ssupported", peek_offset_cap ? " " : " not "); > + > return 0; > } > -- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson