On Sat, May 11, 2024 at 11:20:07AM -0400, Jon Maloy wrote: > >From linux-6.9.0 the kernel will contain > commit 05ea491641d3 ("tcp: add support for SO_PEEK_OFF socket option"). > > This new feature makes is possible to call recv_msg(MSG_PEEK) and make > it start reading data from a given offset set by the SO_PEEK_OFF socket > option. This way, we can avoid repeated reading of already read bytes of > a received message, hence saving read cycles when forwarding TCP > messages in the host->name space direction. > > In this commit, we add functionality to leverage this feature when > available, while we fall back to the previous behavior when not. > > Measurements with iperf3 shows that throughput increases with 15-20 > percent in the host->namespace direction when this feature is used. > > Signed-off-by: Jon Maloy > > --- > v2: - Some smaller changes as suggested by David Gibson and Stefano Brivio. > - Moved initial set_peek_offset(0) to only the locations where the socket is set > to ESTABLISHED. > - Removed the per-packet synchronization between sk_peek_off and > already_sent. Instead only doing it in retransmit situations. > - The problem I found when trouble shooting the occasionally occurring > out of synch values between 'already_sent' and 'sk_peek_offset' may > have deeper implications that we may need to be investigate. > > v3: - Rebased to most recent version of tcp.c, plus the previous > patch in this series. > - Some changes based on feedback from PASST team > --- > tcp.c | 77 ++++++++++++++++++++++++++++++++++++++++++++++++----------- > 1 file changed, 63 insertions(+), 14 deletions(-) > > diff --git a/tcp.c b/tcp.c > index 21cbfba..8297812 100644 > --- a/tcp.c > +++ b/tcp.c > @@ -520,6 +520,9 @@ static struct iovec tcp6_l2_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS]; > static struct iovec tcp4_l2_flags_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS]; > static struct iovec tcp6_l2_flags_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS]; > > +/* Does the kernel support TCP_PEEK_OFF? */ > +static bool peek_offset_cap; > + > /* sendmsg() to socket */ > static struct iovec tcp_iov [UIO_MAXIOV]; > > @@ -535,6 +538,20 @@ static_assert(ARRAY_SIZE(tc_hash) >= FLOW_MAX, > int init_sock_pool4 [TCP_SOCK_POOL_SIZE]; > int init_sock_pool6 [TCP_SOCK_POOL_SIZE]; > > +/** > + * tcp_set_peek_offset() - Set SO_PEEK_OFF offset on a socket if supported > + * @s: Socket to update > + * @offset: Offset in bytes > + */ > +static void tcp_set_peek_offset(int s, int offset) > +{ > + if (!peek_offset_cap) > + return; > + > + if (setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &offset, sizeof(offset))) > + err("Failed to set SO_PEEK_OFF"); > +} > + > /** > * tcp_conn_epoll_events() - epoll events mask for given connection state > * @events: Current connection events > @@ -1280,11 +1297,14 @@ static void tcp_revert_seq(struct tcp_frame_ref *frame_ref, int first, int last) > continue; > > conn->seq_to_tap = frame_ref[i].seq; > + tcp_set_peek_offset(conn->sock, > + conn->seq_to_tap - conn->seq_ack_from_tap); > > if (SEQ_GE(conn->seq_to_tap, conn->seq_ack_from_tap)) > continue; > > conn->seq_to_tap = conn->seq_ack_from_tap; > + tcp_set_peek_offset(conn->sock, 0); > } > } > > @@ -2203,42 +2223,52 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) > uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap; > int fill_bufs, send_bufs = 0, last_len, iov_rem = 0; > int sendlen, len, dlen, v4 = CONN_V4(conn); > + uint32_t max_send, seq, already_sent; > int s = conn->sock, i, ret = 0; > struct msghdr mh_sock = { 0 }; > uint16_t mss = MSS_GET(conn); > - uint32_t already_sent, seq; > struct iovec *iov; > > + /* How much have we read/sent since last received ack ? */ > already_sent = conn->seq_to_tap - conn->seq_ack_from_tap; > - > if (SEQ_LT(already_sent, 0)) { > /* RFC 761, section 2.1. */ > flow_trace(conn, "ACK sequence gap: ACK for %u, sent: %u", > conn->seq_ack_from_tap, conn->seq_to_tap); > conn->seq_to_tap = conn->seq_ack_from_tap; > already_sent = 0; > + tcp_set_peek_offset(s, 0); > } > > - if (!wnd_scaled || already_sent >= wnd_scaled) { > + /* How much are we still allowed to send within current window ? */ Does this hunk actually related to the SO_PEEK_OFF change, or does it belong with the zero window fix in the next patch? > + max_send = conn->seq_ack_from_tap + wnd_scaled - conn->seq_to_tap; > + if (SEQ_LE(max_send, 0)) { IIUC, 'max_send' is an difference in sequence numbers, not an absolute sequence number, and should therefore be compared with a regular <= rather than SEQ_LE. Although it looks like you'd also need to invert the sign to make that work, > + flow_trace(conn, "Empty window: win: %u, sent: %u", > + wnd_scaled, conn->seq_to_tap); > conn_flag(c, conn, STALLED); > conn_flag(c, conn, ACK_FROM_TAP_DUE); > return 0; > } > > - /* Set up buffer descriptors we'll fill completely and partially. */ > - fill_bufs = DIV_ROUND_UP(wnd_scaled - already_sent, mss); > + /* Set up buffer descriptors to fill completely or partially. */ > + fill_bufs = DIV_ROUND_UP(max_send, mss); > if (fill_bufs > TCP_FRAMES) { > fill_bufs = TCP_FRAMES; > iov_rem = 0; > } else { > - iov_rem = (wnd_scaled - already_sent) % mss; > + iov_rem = max_send % mss; > } > > - mh_sock.msg_iov = iov_sock; > - mh_sock.msg_iovlen = fill_bufs + 1; > - > - iov_sock[0].iov_base = tcp_buf_discard; > - iov_sock[0].iov_len = already_sent; > + /* Prepare iov according to kernel capability */ > + if (!peek_offset_cap) { > + mh_sock.msg_iov = iov_sock; > + iov_sock[0].iov_base = tcp_buf_discard; > + iov_sock[0].iov_len = already_sent; > + mh_sock.msg_iovlen = fill_bufs + 1; > + } else { > + mh_sock.msg_iov = &iov_sock[1]; > + mh_sock.msg_iovlen = fill_bufs; > + } > > if (( v4 && tcp4_payload_used + fill_bufs > TCP_FRAMES_MEM) || > (!v4 && tcp6_payload_used + fill_bufs > TCP_FRAMES_MEM)) { > @@ -2279,7 +2309,10 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) > return 0; > } > > - sendlen = len - already_sent; > + sendlen = len; > + if (!peek_offset_cap) > + sendlen -= already_sent; > + > if (sendlen <= 0) { > conn_flag(c, conn, STALLED); > return 0; > @@ -2449,7 +2482,9 @@ static int tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn, > flow_trace(conn, > "fast re-transmit, ACK: %u, previous sequence: %u", > max_ack_seq, conn->seq_to_tap); > + > conn->seq_to_tap = max_ack_seq; > + tcp_set_peek_offset(conn->sock, 0); > tcp_data_from_sock(c, conn); > } > > @@ -2542,6 +2577,7 @@ static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_tap_conn *conn, > conn->seq_ack_to_tap = conn->seq_from_tap; > > conn_event(c, conn, ESTABLISHED); > + tcp_set_peek_offset(conn->sock, 0); > > /* The client might have sent data already, which we didn't > * dequeue waiting for SYN,ACK from tap -- check now. > @@ -2622,6 +2658,7 @@ int tcp_tap_handler(struct ctx *c, uint8_t pif, sa_family_t af, > goto reset; > > conn_event(c, conn, ESTABLISHED); > + tcp_set_peek_offset(conn->sock, 0); > > if (th->fin) { > conn->seq_from_tap++; > @@ -2788,7 +2825,7 @@ void tcp_listen_handler(struct ctx *c, union epoll_ref ref, > union sockaddr_inany sa; > socklen_t sl = sizeof(sa); > union flow *flow; > - int s; > + int s = 0; This appears unrelated to the rest. Also wrong, since stdin is not a reasonable initial value for this fd. > if (c->no_tcp || !(flow = flow_alloc())) > return; > @@ -2875,6 +2912,7 @@ void tcp_timer_handler(struct ctx *c, union epoll_ref ref) > flow_dbg(conn, "ACK timeout, retry"); > conn->retrans++; > conn->seq_to_tap = conn->seq_ack_from_tap; > + tcp_set_peek_offset(conn->sock, 0); > tcp_data_from_sock(c, conn); > tcp_timer_ctl(c, conn); > } > @@ -3166,7 +3204,8 @@ static void tcp_sock_refill_init(const struct ctx *c) > */ > int tcp_init(struct ctx *c) > { > - unsigned b; > + unsigned int b, optv = 0; > + int s; > > for (b = 0; b < TCP_HASH_TABLE_SIZE; b++) > tc_hash[b] = FLOW_SIDX_NONE; > @@ -3190,6 +3229,16 @@ int tcp_init(struct ctx *c) > NS_CALL(tcp_ns_socks_init, c); > } > > + /* Probe for SO_PEEK_OFF support */ > + s = socket(AF_INET, SOCK_STREAM | SOCK_CLOEXEC, IPPROTO_TCP); > + if (s < 0) { > + warn("Temporary TCP socket creation failed"); > + } else { > + if (!setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &optv, sizeof(int))) > + peek_offset_cap = true; > + close(s); > + } > + info("SO_PEEK_OFF%ssupported", peek_offset_cap ? " " : " not "); > return 0; > } > -- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson