On Tue, Sep 17, 2024 at 11:54:28PM +0200, Stefano Brivio wrote: > On Fri, 13 Sep 2024 14:32:06 +1000 > David Gibson wrote: > > > When available, we want to retrieve our socket peer's advertised window and > > forward that to the guest. That information has been available from the > > kernel via the TCP_INFO getsockopt() since kernel commit 8f7baad7f035. > > > > Currently our probing for this is a bit odd. The HAS_SND_WND define > > determines if our headers include the tcp_snd_wnd field, but that doesn't > > necessarily mean the running kernel supports it. Currently we start by > > assuming it's _not_ available, but mark it as available if we ever see > > a non-zero value in the field. This is a bit hit and miss in two ways: > > * Zero is perfectly possible window the peer could report, so we can > > get false negatives > > Kind of: one non-zero result was enough to set tcp.kernel_snd_wnd to > one. Right, but that in turn means it could change at some later point rather than being correct from the beginning, which (slightly) complicates everything. > The reason why I implemented it that way was to account for possible > kernel backports of that option. On the other hand, any kernel backport > would need to preserve the position of tcpi_snd_wnd, regardless of > whether preceding fields are missing, so checking the size as you're > doing also looks robust, and avoids these two issues altogether. Right. My understanding is that the ability to check the returned size is exactly why adding more fields to a getsockopt() is considered a backwards compatible change. > > > * We're reading TCP_INFO into a local variable, which might not be zero > > initialised, so if the kernel _doesn't_ write it it could have non-zero > > garbage, giving us false positives. > > > > We can use a more direct way of probing for this: getsockopt() reports the > > length of the information retreived. So, check whether that's long enough > > to include the field. This lets us probe the availability of the field > > once and for all during initialisation. That in turn allows ctx to become > > a const pointer to tcp_prepare_flags() which cascades through many other > > functions. > > > > We also move the flag for the probe result from the ctx structure to a > > global, to match peek_offset_cap. > > > > Signed-off-by: David Gibson > > --- > > tcp.c | 93 ++++++++++++++++++++++++++++++++++++-------------- > > tcp.h | 13 +++---- > > tcp_buf.c | 10 +++--- > > tcp_buf.h | 6 ++-- > > tcp_internal.h | 4 +-- > > 5 files changed, 82 insertions(+), 44 deletions(-) > > > > diff --git a/tcp.c b/tcp.c > > index 14b48a84..cba3f3bd 100644 > > --- a/tcp.c > > +++ b/tcp.c > > @@ -308,11 +308,6 @@ > > /* MSS rounding: see SET_MSS() */ > > #define MSS_DEFAULT 536 > > #define WINDOW_DEFAULT 14600 /* RFC 6928 */ > > -#ifdef HAS_SND_WND > > -# define KERNEL_REPORTS_SND_WND(c) ((c)->tcp.kernel_snd_wnd) > > -#else > > -# define KERNEL_REPORTS_SND_WND(c) (0 && (c)) > > -#endif > > > > #define ACK_INTERVAL 10 /* ms */ > > #define SYN_TIMEOUT 10 /* s */ > > @@ -370,6 +365,14 @@ char tcp_buf_discard [MAX_WINDOW]; > > > > /* Does the kernel support TCP_PEEK_OFF? */ > > bool peek_offset_cap; > > +#ifdef HAS_SND_WND > > +/* Does the kernel report sending window in TCP_INFO (kernel commit > > + * 8f7baad7f035) > > + */ > > +bool snd_wnd_cap; > > +#else > > +#define snd_wnd_cap (false) > > +#endif > > > > /* sendmsg() to socket */ > > static struct iovec tcp_iov [UIO_MAXIOV]; > > @@ -1052,7 +1055,7 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn, > > } > > #endif /* !HAS_BYTES_ACKED */ > > > > - if (!KERNEL_REPORTS_SND_WND(c)) { > > + if (!snd_wnd_cap) { > > tcp_get_sndbuf(conn); > > new_wnd_to_tap = MIN(SNDBUF_GET(conn), MAX_WINDOW); > > conn->wnd_to_tap = MIN(new_wnd_to_tap >> conn->ws_to_tap, > > @@ -1136,7 +1139,7 @@ static void tcp_update_seqack_from_tap(const struct ctx *c, > > * 0 if there is no flag to send > > * 1 otherwise > > */ > > -int tcp_prepare_flags(struct ctx *c, struct tcp_tap_conn *conn, > > +int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn, > > int flags, struct tcphdr *th, char *data, > > size_t *optlen) > > { > > @@ -1153,11 +1156,6 @@ int tcp_prepare_flags(struct ctx *c, struct tcp_tap_conn *conn, > > return -ECONNRESET; > > } > > > > -#ifdef HAS_SND_WND > > - if (!c->tcp.kernel_snd_wnd && tinfo.tcpi_snd_wnd) > > - c->tcp.kernel_snd_wnd = 1; > > -#endif > > - > > if (!(conn->flags & LOCAL)) > > tcp_rtt_dst_check(conn, &tinfo); > > > > @@ -1235,7 +1233,8 @@ int tcp_prepare_flags(struct ctx *c, struct tcp_tap_conn *conn, > > * > > * Return: negative error code on connection reset, 0 otherwise > > */ > > -static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) > > +static int tcp_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, > > + int flags) > > { > > return tcp_buf_send_flag(c, conn, flags); > > } > > @@ -1245,7 +1244,7 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) > > * @c: Execution context > > * @conn: Connection pointer > > */ > > -void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn) > > +void tcp_rst_do(const struct ctx *c, struct tcp_tap_conn *conn) > > { > > if (conn->events == CLOSED) > > return; > > @@ -1463,7 +1462,7 @@ static void tcp_bind_outbound(const struct ctx *c, > > * @optlen: Bytes in options: caller MUST ensure available length > > * @now: Current timestamp > > */ > > -static void tcp_conn_from_tap(struct ctx *c, sa_family_t af, > > +static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af, > > const void *saddr, const void *daddr, > > const struct tcphdr *th, const char *opts, > > size_t optlen, const struct timespec *now) > > @@ -1628,7 +1627,7 @@ static int tcp_sock_consume(const struct tcp_tap_conn *conn, uint32_t ack_seq) > > * > > * #syscalls recvmsg > > */ > > -static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) > > +static int tcp_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) > > { > > return tcp_buf_data_from_sock(c, conn); > > } > > @@ -1644,8 +1643,8 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) > > * > > * Return: count of consumed packets > > */ > > -static int tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn, > > - const struct pool *p, int idx) > > +static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn, > > + const struct pool *p, int idx) > > { > > int i, iov_i, ack = 0, fin = 0, retr = 0, keep = -1, partial_send = 0; > > uint16_t max_ack_seq_wnd = conn->wnd_from_tap; > > @@ -1842,7 +1841,8 @@ out: > > * @opts: Pointer to start of options > > * @optlen: Bytes in options: caller MUST ensure available length > > */ > > -static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_tap_conn *conn, > > +static void tcp_conn_from_sock_finish(const struct ctx *c, > > + struct tcp_tap_conn *conn, > > const struct tcphdr *th, > > const char *opts, size_t optlen) > > { > > @@ -1885,7 +1885,7 @@ static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_tap_conn *conn, > > * > > * Return: count of consumed packets > > */ > > -int tcp_tap_handler(struct ctx *c, uint8_t pif, sa_family_t af, > > +int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, > > const void *saddr, const void *daddr, > > const struct pool *p, int idx, const struct timespec *now) > > { > > @@ -2023,7 +2023,7 @@ reset: > > * @c: Execution context > > * @conn: Connection pointer > > */ > > -static void tcp_connect_finish(struct ctx *c, struct tcp_tap_conn *conn) > > +static void tcp_connect_finish(const struct ctx *c, struct tcp_tap_conn *conn) > > { > > socklen_t sl; > > int so; > > @@ -2049,8 +2049,8 @@ static void tcp_connect_finish(struct ctx *c, struct tcp_tap_conn *conn) > > * @sa: Peer socket address (from accept()) > > * @now: Current timestamp > > */ > > -static void tcp_tap_conn_from_sock(struct ctx *c, union flow *flow, int s, > > - const struct timespec *now) > > +static void tcp_tap_conn_from_sock(const struct ctx *c, union flow *flow, > > + int s, const struct timespec *now) > > { > > struct tcp_tap_conn *conn = FLOW_SET_TYPE(flow, FLOW_TCP, tcp); > > uint64_t hash; > > @@ -2081,7 +2081,7 @@ static void tcp_tap_conn_from_sock(struct ctx *c, union flow *flow, int s, > > * @ref: epoll reference of listening socket > > * @now: Current timestamp > > */ > > -void tcp_listen_handler(struct ctx *c, union epoll_ref ref, > > +void tcp_listen_handler(const struct ctx *c, union epoll_ref ref, > > const struct timespec *now) > > { > > const struct flowside *ini; > > @@ -2146,7 +2146,7 @@ cancel: > > * > > * #syscalls timerfd_gettime arm:timerfd_gettime64 i686:timerfd_gettime64 > > */ > > -void tcp_timer_handler(struct ctx *c, union epoll_ref ref) > > +void tcp_timer_handler(const struct ctx *c, union epoll_ref ref) > > { > > struct itimerspec check_armed = { { 0 }, { 0 } }; > > struct tcp_tap_conn *conn = &FLOW(ref.flow)->tcp; > > @@ -2210,7 +2210,8 @@ void tcp_timer_handler(struct ctx *c, union epoll_ref ref) > > * @ref: epoll reference > > * @events: epoll events bitmap > > */ > > -void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events) > > +void tcp_sock_handler(const struct ctx *c, union epoll_ref ref, > > + uint32_t events) > > { > > struct tcp_tap_conn *conn = conn_at_sidx(ref.flowside); > > > > @@ -2494,6 +2495,40 @@ static bool tcp_probe_peek_offset_cap(sa_family_t af) > > return ret; > > } > > > > +#ifdef HAS_SND_WND > > +/** > > + * tcp_probe_snd_wnd_cap() - Check if TCP_INFO reports tcpi_snd_wnd > > + * > > + * Return: true if supported, false otherwise > > + */ > > +static bool tcp_probe_snd_wnd_cap(void) > > +{ > > + struct tcp_info tinfo; > > + socklen_t sl = sizeof(tinfo); > > + int s; > > + > > + s = socket(AF_INET, SOCK_STREAM | SOCK_CLOEXEC, IPPROTO_TCP); > > + if (s < 0) { > > + warn_perror("Temporary TCP socket creation failed"); > > + return false; > > + } > > + > > + if (getsockopt(s, SOL_TCP, TCP_INFO, &tinfo, &sl)) { > > + warn_perror("Failed to get TCP_INFO on temporary socket"); > > + close(s); > > + return false; > > + } > > + > > + close(s); > > + > > + if (sl < (offsetof(struct tcp_info, tcpi_snd_wnd) + > > + sizeof(tinfo.tcpi_snd_wnd))) > > + return false; > > + > > + return true; > > +} > > +#endif /* HAS_SND_WND */ > > + > > /** > > * tcp_init() - Get initial sequence, hash secret, initialise per-socket data > > * @c: Execution context > > @@ -2527,6 +2562,12 @@ int tcp_init(struct ctx *c) > > (!c->ifi6 || tcp_probe_peek_offset_cap(AF_INET6)); > > debug("SO_PEEK_OFF%ssupported", peek_offset_cap ? " " : " not "); > > > > +#ifdef HAS_SND_WND > > + snd_wnd_cap = tcp_probe_snd_wnd_cap(); > > +#endif > > + debug("TCP_INFO tcpi_snd_wnd field%ssupported", > > + snd_wnd_cap ? " " : " not "); > > + > > return 0; > > } > > > > diff --git a/tcp.h b/tcp.h > > index e9ff0191..5585924f 100644 > > --- a/tcp.h > > +++ b/tcp.h > > @@ -10,11 +10,12 @@ > > > > struct ctx; > > > > -void tcp_timer_handler(struct ctx *c, union epoll_ref ref); > > -void tcp_listen_handler(struct ctx *c, union epoll_ref ref, > > +void tcp_timer_handler(const struct ctx *c, union epoll_ref ref); > > +void tcp_listen_handler(const struct ctx *c, union epoll_ref ref, > > const struct timespec *now); > > -void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events); > > -int tcp_tap_handler(struct ctx *c, uint8_t pif, sa_family_t af, > > +void tcp_sock_handler(const struct ctx *c, union epoll_ref ref, > > + uint32_t events); > > +int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, > > const void *saddr, const void *daddr, > > const struct pool *p, int idx, const struct timespec *now); > > int tcp_sock_init(const struct ctx *c, sa_family_t af, const void *addr, > > @@ -58,16 +59,12 @@ union tcp_listen_epoll_ref { > > * @fwd_in: Port forwarding configuration for inbound packets > > * @fwd_out: Port forwarding configuration for outbound packets > > * @timer_run: Timestamp of most recent timer run > > - * @kernel_snd_wnd: Kernel reports sending window (with commit 8f7baad7f035) > > * @pipe_size: Size of pipes for spliced connections > > */ > > struct tcp_ctx { > > struct fwd_ports fwd_in; > > struct fwd_ports fwd_out; > > struct timespec timer_run; > > -#ifdef HAS_SND_WND > > - int kernel_snd_wnd; > > -#endif > > size_t pipe_size; > > }; > > > > diff --git a/tcp_buf.c b/tcp_buf.c > > index 1a398461..c886c92b 100644 > > --- a/tcp_buf.c > > +++ b/tcp_buf.c > > @@ -239,7 +239,7 @@ void tcp_flags_flush(const struct ctx *c) > > * @frames: Two-dimensional array containing queued frames with sub-iovs > > * @num_frames: Number of entries in the two arrays to be compared > > */ > > -static void tcp_revert_seq(struct ctx *c, struct tcp_tap_conn **conns, > > +static void tcp_revert_seq(const struct ctx *c, struct tcp_tap_conn **conns, > > struct iovec (*frames)[TCP_NUM_IOVS], int num_frames) > > { > > int i; > > @@ -264,7 +264,7 @@ static void tcp_revert_seq(struct ctx *c, struct tcp_tap_conn **conns, > > * tcp_payload_flush() - Send out buffers for segments with data > > * @c: Execution context > > */ > > -void tcp_payload_flush(struct ctx *c) > > +void tcp_payload_flush(const struct ctx *c) > > { > > size_t m; > > > > @@ -293,7 +293,7 @@ void tcp_payload_flush(struct ctx *c) > > * > > * Return: negative error code on connection reset, 0 otherwise > > */ > > -int tcp_buf_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) > > +int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags) > > { > > struct tcp_flags_t *payload; > > struct iovec *iov; > > @@ -361,7 +361,7 @@ int tcp_buf_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) > > * @no_csum: Don't compute IPv4 checksum, use the one from previous buffer > > * @seq: Sequence number to be sent > > */ > > -static void tcp_data_to_tap(struct ctx *c, struct tcp_tap_conn *conn, > > +static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn, > > ssize_t dlen, int no_csum, uint32_t seq) > > { > > struct iovec *iov; > > @@ -405,7 +405,7 @@ static void tcp_data_to_tap(struct ctx *c, struct tcp_tap_conn *conn, > > * > > * #syscalls recvmsg > > */ > > -int tcp_buf_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) > > +int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn) > > { > > uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap; > > int fill_bufs, send_bufs = 0, last_len, iov_rem = 0; > > diff --git a/tcp_buf.h b/tcp_buf.h > > index 3db4c56e..8d4b615a 100644 > > --- a/tcp_buf.h > > +++ b/tcp_buf.h > > @@ -9,8 +9,8 @@ > > void tcp_sock4_iov_init(const struct ctx *c); > > void tcp_sock6_iov_init(const struct ctx *c); > > void tcp_flags_flush(const struct ctx *c); > > -void tcp_payload_flush(struct ctx *c); > > -int tcp_buf_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn); > > -int tcp_buf_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags); > > +void tcp_payload_flush(const struct ctx *c); > > +int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn); > > +int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags); > > > > #endif /*TCP_BUF_H */ > > diff --git a/tcp_internal.h b/tcp_internal.h > > index aa8bb64f..bd634be1 100644 > > --- a/tcp_internal.h > > +++ b/tcp_internal.h > > @@ -82,7 +82,7 @@ void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn, > > conn_event_do(c, conn, event); \ > > } while (0) > > > > -void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn); > > +void tcp_rst_do(const struct ctx *c, struct tcp_tap_conn *conn); > > #define tcp_rst(c, conn) \ > > do { \ > > flow_dbg((conn), "TCP reset at %s:%i", __func__, __LINE__); \ > > @@ -94,7 +94,7 @@ size_t tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn, > > const uint16_t *check, uint32_t seq); > > int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn, > > int force_seq, struct tcp_info *tinfo); > > -int tcp_prepare_flags(struct ctx *c, struct tcp_tap_conn *conn, int flags, > > +int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn, int flags, > > struct tcphdr *th, char *data, size_t *optlen); > > > > #endif /* TCP_INTERNAL_H */ > -- David Gibson (he or they) | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you, not the other way | around. http://www.ozlabs.org/~dgibson