From: David Gibson <david@gibson.dropbear.id.au>
To: passt-dev@passt.top, Stefano Brivio <sbrivio@redhat.com>
Cc: David Gibson <david@gibson.dropbear.id.au>
Subject: [PATCH 2/4] tcp: Clean up tcpi_snd_wnd probing
Date: Wed, 18 Sep 2024 11:53:05 +1000 [thread overview]
Message-ID: <20240918015307.1492846-3-david@gibson.dropbear.id.au> (raw)
In-Reply-To: <20240918015307.1492846-1-david@gibson.dropbear.id.au>
When available, we want to retrieve our socket peer's advertised window and
forward that to the guest. That information has been available from the
kernel via the TCP_INFO getsockopt() since kernel commit 8f7baad7f035.
Currently our probing for this is a bit odd. The HAS_SND_WND define
determines if our headers include the tcp_snd_wnd field, but that doesn't
necessarily mean the running kernel supports it. Currently we start by
assuming it's _not_ available, but mark it as available if we ever see
a non-zero value in the field. This is a bit hit and miss in two ways:
* Zero is perfectly possible window the peer could report, so we can
get false negatives
* We're reading TCP_INFO into a local variable, which might not be zero
initialised, so if the kernel _doesn't_ write it it could have non-zero
garbage, giving us false positives.
We can use a more direct way of probing for this: getsockopt() reports the
length of the information retreived. So, check whether that's long enough
to include the field. This lets us probe the availability of the field
once and for all during initialisation. That in turn allows ctx to become
a const pointer to tcp_prepare_flags() which cascades through many other
functions.
We also move the flag for the probe result from the ctx structure to a
global, to match peek_offset_cap.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
tcp.c | 93 ++++++++++++++++++++++++++++++++++++--------------
tcp.h | 13 +++----
tcp_buf.c | 10 +++---
tcp_buf.h | 6 ++--
tcp_internal.h | 4 +--
5 files changed, 82 insertions(+), 44 deletions(-)
diff --git a/tcp.c b/tcp.c
index 14b48a84..cba3f3bd 100644
--- a/tcp.c
+++ b/tcp.c
@@ -308,11 +308,6 @@
/* MSS rounding: see SET_MSS() */
#define MSS_DEFAULT 536
#define WINDOW_DEFAULT 14600 /* RFC 6928 */
-#ifdef HAS_SND_WND
-# define KERNEL_REPORTS_SND_WND(c) ((c)->tcp.kernel_snd_wnd)
-#else
-# define KERNEL_REPORTS_SND_WND(c) (0 && (c))
-#endif
#define ACK_INTERVAL 10 /* ms */
#define SYN_TIMEOUT 10 /* s */
@@ -370,6 +365,14 @@ char tcp_buf_discard [MAX_WINDOW];
/* Does the kernel support TCP_PEEK_OFF? */
bool peek_offset_cap;
+#ifdef HAS_SND_WND
+/* Does the kernel report sending window in TCP_INFO (kernel commit
+ * 8f7baad7f035)
+ */
+bool snd_wnd_cap;
+#else
+#define snd_wnd_cap (false)
+#endif
/* sendmsg() to socket */
static struct iovec tcp_iov [UIO_MAXIOV];
@@ -1052,7 +1055,7 @@ int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
}
#endif /* !HAS_BYTES_ACKED */
- if (!KERNEL_REPORTS_SND_WND(c)) {
+ if (!snd_wnd_cap) {
tcp_get_sndbuf(conn);
new_wnd_to_tap = MIN(SNDBUF_GET(conn), MAX_WINDOW);
conn->wnd_to_tap = MIN(new_wnd_to_tap >> conn->ws_to_tap,
@@ -1136,7 +1139,7 @@ static void tcp_update_seqack_from_tap(const struct ctx *c,
* 0 if there is no flag to send
* 1 otherwise
*/
-int tcp_prepare_flags(struct ctx *c, struct tcp_tap_conn *conn,
+int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn,
int flags, struct tcphdr *th, char *data,
size_t *optlen)
{
@@ -1153,11 +1156,6 @@ int tcp_prepare_flags(struct ctx *c, struct tcp_tap_conn *conn,
return -ECONNRESET;
}
-#ifdef HAS_SND_WND
- if (!c->tcp.kernel_snd_wnd && tinfo.tcpi_snd_wnd)
- c->tcp.kernel_snd_wnd = 1;
-#endif
-
if (!(conn->flags & LOCAL))
tcp_rtt_dst_check(conn, &tinfo);
@@ -1235,7 +1233,8 @@ int tcp_prepare_flags(struct ctx *c, struct tcp_tap_conn *conn,
*
* Return: negative error code on connection reset, 0 otherwise
*/
-static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
+static int tcp_send_flag(const struct ctx *c, struct tcp_tap_conn *conn,
+ int flags)
{
return tcp_buf_send_flag(c, conn, flags);
}
@@ -1245,7 +1244,7 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
* @c: Execution context
* @conn: Connection pointer
*/
-void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn)
+void tcp_rst_do(const struct ctx *c, struct tcp_tap_conn *conn)
{
if (conn->events == CLOSED)
return;
@@ -1463,7 +1462,7 @@ static void tcp_bind_outbound(const struct ctx *c,
* @optlen: Bytes in options: caller MUST ensure available length
* @now: Current timestamp
*/
-static void tcp_conn_from_tap(struct ctx *c, sa_family_t af,
+static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af,
const void *saddr, const void *daddr,
const struct tcphdr *th, const char *opts,
size_t optlen, const struct timespec *now)
@@ -1628,7 +1627,7 @@ static int tcp_sock_consume(const struct tcp_tap_conn *conn, uint32_t ack_seq)
*
* #syscalls recvmsg
*/
-static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
+static int tcp_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
{
return tcp_buf_data_from_sock(c, conn);
}
@@ -1644,8 +1643,8 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
*
* Return: count of consumed packets
*/
-static int tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn,
- const struct pool *p, int idx)
+static int tcp_data_from_tap(const struct ctx *c, struct tcp_tap_conn *conn,
+ const struct pool *p, int idx)
{
int i, iov_i, ack = 0, fin = 0, retr = 0, keep = -1, partial_send = 0;
uint16_t max_ack_seq_wnd = conn->wnd_from_tap;
@@ -1842,7 +1841,8 @@ out:
* @opts: Pointer to start of options
* @optlen: Bytes in options: caller MUST ensure available length
*/
-static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_tap_conn *conn,
+static void tcp_conn_from_sock_finish(const struct ctx *c,
+ struct tcp_tap_conn *conn,
const struct tcphdr *th,
const char *opts, size_t optlen)
{
@@ -1885,7 +1885,7 @@ static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_tap_conn *conn,
*
* Return: count of consumed packets
*/
-int tcp_tap_handler(struct ctx *c, uint8_t pif, sa_family_t af,
+int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
const void *saddr, const void *daddr,
const struct pool *p, int idx, const struct timespec *now)
{
@@ -2023,7 +2023,7 @@ reset:
* @c: Execution context
* @conn: Connection pointer
*/
-static void tcp_connect_finish(struct ctx *c, struct tcp_tap_conn *conn)
+static void tcp_connect_finish(const struct ctx *c, struct tcp_tap_conn *conn)
{
socklen_t sl;
int so;
@@ -2049,8 +2049,8 @@ static void tcp_connect_finish(struct ctx *c, struct tcp_tap_conn *conn)
* @sa: Peer socket address (from accept())
* @now: Current timestamp
*/
-static void tcp_tap_conn_from_sock(struct ctx *c, union flow *flow, int s,
- const struct timespec *now)
+static void tcp_tap_conn_from_sock(const struct ctx *c, union flow *flow,
+ int s, const struct timespec *now)
{
struct tcp_tap_conn *conn = FLOW_SET_TYPE(flow, FLOW_TCP, tcp);
uint64_t hash;
@@ -2081,7 +2081,7 @@ static void tcp_tap_conn_from_sock(struct ctx *c, union flow *flow, int s,
* @ref: epoll reference of listening socket
* @now: Current timestamp
*/
-void tcp_listen_handler(struct ctx *c, union epoll_ref ref,
+void tcp_listen_handler(const struct ctx *c, union epoll_ref ref,
const struct timespec *now)
{
const struct flowside *ini;
@@ -2146,7 +2146,7 @@ cancel:
*
* #syscalls timerfd_gettime arm:timerfd_gettime64 i686:timerfd_gettime64
*/
-void tcp_timer_handler(struct ctx *c, union epoll_ref ref)
+void tcp_timer_handler(const struct ctx *c, union epoll_ref ref)
{
struct itimerspec check_armed = { { 0 }, { 0 } };
struct tcp_tap_conn *conn = &FLOW(ref.flow)->tcp;
@@ -2210,7 +2210,8 @@ void tcp_timer_handler(struct ctx *c, union epoll_ref ref)
* @ref: epoll reference
* @events: epoll events bitmap
*/
-void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events)
+void tcp_sock_handler(const struct ctx *c, union epoll_ref ref,
+ uint32_t events)
{
struct tcp_tap_conn *conn = conn_at_sidx(ref.flowside);
@@ -2494,6 +2495,40 @@ static bool tcp_probe_peek_offset_cap(sa_family_t af)
return ret;
}
+#ifdef HAS_SND_WND
+/**
+ * tcp_probe_snd_wnd_cap() - Check if TCP_INFO reports tcpi_snd_wnd
+ *
+ * Return: true if supported, false otherwise
+ */
+static bool tcp_probe_snd_wnd_cap(void)
+{
+ struct tcp_info tinfo;
+ socklen_t sl = sizeof(tinfo);
+ int s;
+
+ s = socket(AF_INET, SOCK_STREAM | SOCK_CLOEXEC, IPPROTO_TCP);
+ if (s < 0) {
+ warn_perror("Temporary TCP socket creation failed");
+ return false;
+ }
+
+ if (getsockopt(s, SOL_TCP, TCP_INFO, &tinfo, &sl)) {
+ warn_perror("Failed to get TCP_INFO on temporary socket");
+ close(s);
+ return false;
+ }
+
+ close(s);
+
+ if (sl < (offsetof(struct tcp_info, tcpi_snd_wnd) +
+ sizeof(tinfo.tcpi_snd_wnd)))
+ return false;
+
+ return true;
+}
+#endif /* HAS_SND_WND */
+
/**
* tcp_init() - Get initial sequence, hash secret, initialise per-socket data
* @c: Execution context
@@ -2527,6 +2562,12 @@ int tcp_init(struct ctx *c)
(!c->ifi6 || tcp_probe_peek_offset_cap(AF_INET6));
debug("SO_PEEK_OFF%ssupported", peek_offset_cap ? " " : " not ");
+#ifdef HAS_SND_WND
+ snd_wnd_cap = tcp_probe_snd_wnd_cap();
+#endif
+ debug("TCP_INFO tcpi_snd_wnd field%ssupported",
+ snd_wnd_cap ? " " : " not ");
+
return 0;
}
diff --git a/tcp.h b/tcp.h
index e9ff0191..5585924f 100644
--- a/tcp.h
+++ b/tcp.h
@@ -10,11 +10,12 @@
struct ctx;
-void tcp_timer_handler(struct ctx *c, union epoll_ref ref);
-void tcp_listen_handler(struct ctx *c, union epoll_ref ref,
+void tcp_timer_handler(const struct ctx *c, union epoll_ref ref);
+void tcp_listen_handler(const struct ctx *c, union epoll_ref ref,
const struct timespec *now);
-void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events);
-int tcp_tap_handler(struct ctx *c, uint8_t pif, sa_family_t af,
+void tcp_sock_handler(const struct ctx *c, union epoll_ref ref,
+ uint32_t events);
+int tcp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
const void *saddr, const void *daddr,
const struct pool *p, int idx, const struct timespec *now);
int tcp_sock_init(const struct ctx *c, sa_family_t af, const void *addr,
@@ -58,16 +59,12 @@ union tcp_listen_epoll_ref {
* @fwd_in: Port forwarding configuration for inbound packets
* @fwd_out: Port forwarding configuration for outbound packets
* @timer_run: Timestamp of most recent timer run
- * @kernel_snd_wnd: Kernel reports sending window (with commit 8f7baad7f035)
* @pipe_size: Size of pipes for spliced connections
*/
struct tcp_ctx {
struct fwd_ports fwd_in;
struct fwd_ports fwd_out;
struct timespec timer_run;
-#ifdef HAS_SND_WND
- int kernel_snd_wnd;
-#endif
size_t pipe_size;
};
diff --git a/tcp_buf.c b/tcp_buf.c
index 1a398461..c886c92b 100644
--- a/tcp_buf.c
+++ b/tcp_buf.c
@@ -239,7 +239,7 @@ void tcp_flags_flush(const struct ctx *c)
* @frames: Two-dimensional array containing queued frames with sub-iovs
* @num_frames: Number of entries in the two arrays to be compared
*/
-static void tcp_revert_seq(struct ctx *c, struct tcp_tap_conn **conns,
+static void tcp_revert_seq(const struct ctx *c, struct tcp_tap_conn **conns,
struct iovec (*frames)[TCP_NUM_IOVS], int num_frames)
{
int i;
@@ -264,7 +264,7 @@ static void tcp_revert_seq(struct ctx *c, struct tcp_tap_conn **conns,
* tcp_payload_flush() - Send out buffers for segments with data
* @c: Execution context
*/
-void tcp_payload_flush(struct ctx *c)
+void tcp_payload_flush(const struct ctx *c)
{
size_t m;
@@ -293,7 +293,7 @@ void tcp_payload_flush(struct ctx *c)
*
* Return: negative error code on connection reset, 0 otherwise
*/
-int tcp_buf_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
+int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
{
struct tcp_flags_t *payload;
struct iovec *iov;
@@ -361,7 +361,7 @@ int tcp_buf_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
* @no_csum: Don't compute IPv4 checksum, use the one from previous buffer
* @seq: Sequence number to be sent
*/
-static void tcp_data_to_tap(struct ctx *c, struct tcp_tap_conn *conn,
+static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
ssize_t dlen, int no_csum, uint32_t seq)
{
struct iovec *iov;
@@ -405,7 +405,7 @@ static void tcp_data_to_tap(struct ctx *c, struct tcp_tap_conn *conn,
*
* #syscalls recvmsg
*/
-int tcp_buf_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
+int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn)
{
uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap;
int fill_bufs, send_bufs = 0, last_len, iov_rem = 0;
diff --git a/tcp_buf.h b/tcp_buf.h
index 3db4c56e..8d4b615a 100644
--- a/tcp_buf.h
+++ b/tcp_buf.h
@@ -9,8 +9,8 @@
void tcp_sock4_iov_init(const struct ctx *c);
void tcp_sock6_iov_init(const struct ctx *c);
void tcp_flags_flush(const struct ctx *c);
-void tcp_payload_flush(struct ctx *c);
-int tcp_buf_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn);
-int tcp_buf_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags);
+void tcp_payload_flush(const struct ctx *c);
+int tcp_buf_data_from_sock(const struct ctx *c, struct tcp_tap_conn *conn);
+int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags);
#endif /*TCP_BUF_H */
diff --git a/tcp_internal.h b/tcp_internal.h
index aa8bb64f..bd634be1 100644
--- a/tcp_internal.h
+++ b/tcp_internal.h
@@ -82,7 +82,7 @@ void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn,
conn_event_do(c, conn, event); \
} while (0)
-void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn);
+void tcp_rst_do(const struct ctx *c, struct tcp_tap_conn *conn);
#define tcp_rst(c, conn) \
do { \
flow_dbg((conn), "TCP reset at %s:%i", __func__, __LINE__); \
@@ -94,7 +94,7 @@ size_t tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn,
const uint16_t *check, uint32_t seq);
int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
int force_seq, struct tcp_info *tinfo);
-int tcp_prepare_flags(struct ctx *c, struct tcp_tap_conn *conn, int flags,
+int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn, int flags,
struct tcphdr *th, char *data, size_t *optlen);
#endif /* TCP_INTERNAL_H */
--
@@ -82,7 +82,7 @@ void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn,
conn_event_do(c, conn, event); \
} while (0)
-void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn);
+void tcp_rst_do(const struct ctx *c, struct tcp_tap_conn *conn);
#define tcp_rst(c, conn) \
do { \
flow_dbg((conn), "TCP reset at %s:%i", __func__, __LINE__); \
@@ -94,7 +94,7 @@ size_t tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn,
const uint16_t *check, uint32_t seq);
int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
int force_seq, struct tcp_info *tinfo);
-int tcp_prepare_flags(struct ctx *c, struct tcp_tap_conn *conn, int flags,
+int tcp_prepare_flags(const struct ctx *c, struct tcp_tap_conn *conn, int flags,
struct tcphdr *th, char *data, size_t *optlen);
#endif /* TCP_INTERNAL_H */
--
2.46.0
next prev parent reply other threads:[~2024-09-18 1:53 UTC|newest]
Thread overview: 6+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-09-18 1:53 [PATCH 0/4] Some small TCP cleanups David Gibson
2024-09-18 1:53 ` [PATCH 1/4] tcp: Make some extra functions private David Gibson
2024-09-18 1:53 ` David Gibson [this message]
2024-09-18 1:53 ` [PATCH 3/4] tcp: Simplify ifdef logic in tcp_update_seqack_wnd() David Gibson
2024-09-18 1:53 ` [PATCH 4/4] tcp: Make tcp_update_seqack_wnd()s force_seq parameter explicitly boolean David Gibson
2024-09-18 18:13 ` [PATCH 0/4] Some small TCP cleanups Stefano Brivio
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20240918015307.1492846-3-david@gibson.dropbear.id.au \
--to=david@gibson.dropbear.id.au \
--cc=passt-dev@passt.top \
--cc=sbrivio@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
Code repositories for project(s) associated with this public inbox
https://passt.top/passt
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for IMAP folder(s).