* [PATCH 0/2] Support for SO_PEEK_OFF when available @ 2024-04-20 19:19 Jon Maloy 2024-04-20 19:19 ` [PATCH 1/2] tcp: leverage support of SO_PEEK_OFF socket option when available Jon Maloy 2024-04-20 19:19 ` [PATCH 2/2] tcp: allow retransmit when peer receive window is zero Jon Maloy 0 siblings, 2 replies; 15+ messages in thread From: Jon Maloy @ 2024-04-20 19:19 UTC (permalink / raw) To: passt-dev, sbrivio, lvivier, dgibson, jmaloy First commit introduces support for SO_PEEK_OFF socket option. Second commit adds a workaround for a kernel TCP bug. Jon Maloy (2): tcp: leverage support of SO_PEEK_OFF socket option when available tcp: allow retransmit when peer receive window is zero tcp.c | 63 +++++++++++++++++++++++++++++++++++++++++++----------- tcp_conn.h | 2 ++ 2 files changed, 53 insertions(+), 12 deletions(-) -- 2.42.0 ^ permalink raw reply [flat|nested] 15+ messages in thread
* [PATCH 1/2] tcp: leverage support of SO_PEEK_OFF socket option when available 2024-04-20 19:19 [PATCH 0/2] Support for SO_PEEK_OFF when available Jon Maloy @ 2024-04-20 19:19 ` Jon Maloy 2024-04-23 17:50 ` Stefano Brivio 2024-04-24 0:44 ` David Gibson 2024-04-20 19:19 ` [PATCH 2/2] tcp: allow retransmit when peer receive window is zero Jon Maloy 1 sibling, 2 replies; 15+ messages in thread From: Jon Maloy @ 2024-04-20 19:19 UTC (permalink / raw) To: passt-dev, sbrivio, lvivier, dgibson, jmaloy The kernel may support recvmsg(MSG_PEEK), starting reading data from a given offset set by the SO_PEEK_OFF socket option. This makes it possible to avoid repeated reading of already read initial bytes of a received message, hence saving read cycles when forwarding TCP messages in the host->namespace direction. In this commit, we add functionality to leverage this feature when available, while we fall back to the previous behavior when not. Measurements with iperf3 show that throughput increases with 15-20 percent in the host->namespace direction when this feature is used. 
Signed-off-by: Jon Maloy <jmaloy@redhat.com> --- tcp.c | 37 +++++++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/tcp.c b/tcp.c index 905d26f..95d400a 100644 --- a/tcp.c +++ b/tcp.c @@ -505,6 +505,7 @@ static struct tcp_buf_seq_update tcp6_l2_buf_seq_update[TCP_FRAMES_MEM]; static unsigned int tcp6_l2_buf_used; /* recvmsg()/sendmsg() data for tap */ +static bool peek_offset_cap = false; static char tcp_buf_discard [MAX_WINDOW]; static struct iovec iov_sock [TCP_FRAMES_MEM + 1]; @@ -582,6 +583,14 @@ static_assert(ARRAY_SIZE(tc_hash) >= FLOW_MAX, int init_sock_pool4 [TCP_SOCK_POOL_SIZE]; int init_sock_pool6 [TCP_SOCK_POOL_SIZE]; +static void set_peek_offset(int s, int offset) +{ + if (!peek_offset_cap) + return; + if (setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &offset, sizeof(offset))) + perror("Failed to set SO_PEEK_OFF\n"); +} + /** * tcp_conn_epoll_events() - epoll events mask for given connection state * @events: Current connection events @@ -1951,7 +1960,7 @@ static void tcp_conn_from_tap(struct ctx *c, if (bind(s, (struct sockaddr *)&addr6_ll, sizeof(addr6_ll))) goto cancel; } - + set_peek_offset(s, 0); conn = &flow->tcp; conn->f.type = FLOW_TCP; conn->sock = s; @@ -2174,6 +2183,15 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) if (iov_rem) iov_sock[fill_bufs].iov_len = iov_rem; + if (peek_offset_cap) { + /* Don't use discard buffer */ + mh_sock.msg_iov = &iov_sock[1]; + mh_sock.msg_iovlen -= 1; + + /* Keep kernel sk_peek_off in synch */ + set_peek_offset(s, already_sent); + } + /* Receive into buffers, don't dequeue until acknowledged by guest. 
*/ do len = recvmsg(s, &mh_sock, MSG_PEEK); @@ -2195,7 +2213,9 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) return 0; } - sendlen = len - already_sent; + sendlen = len; + if (!peek_offset_cap) + sendlen -= already_sent; if (sendlen <= 0) { conn_flag(c, conn, STALLED); return 0; @@ -2718,6 +2738,7 @@ void tcp_listen_handler(struct ctx *c, union epoll_ref ref, tcp_splice_conn_from_sock(c, ref.tcp_listen, &flow->tcp_splice, s, (struct sockaddr *)&sa)) return; + set_peek_offset(s, 0); tcp_tap_conn_from_sock(c, ref.tcp_listen, &flow->tcp, s, (struct sockaddr *)&sa, now); @@ -3042,6 +3063,7 @@ static void tcp_sock_refill_init(const struct ctx *c) int tcp_init(struct ctx *c) { unsigned b; + int s; for (b = 0; b < TCP_HASH_TABLE_SIZE; b++) tc_hash[b] = FLOW_SIDX_NONE; @@ -3065,6 +3087,17 @@ int tcp_init(struct ctx *c) NS_CALL(tcp_ns_socks_init, c); } + /* Probe for SO_PEEK_OFF support */ + s = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); + if (s < 0) { + perror("Temporary tcp socket creation failed\n"); + } else { + if (!setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &(int){0}, sizeof(int))) { + peek_offset_cap = true; + } + close(s); + } + printf("SO_PEEK_OFF%ssupported\n", peek_offset_cap ? 
" " : " not "); return 0; } -- @@ -505,6 +505,7 @@ static struct tcp_buf_seq_update tcp6_l2_buf_seq_update[TCP_FRAMES_MEM]; static unsigned int tcp6_l2_buf_used; /* recvmsg()/sendmsg() data for tap */ +static bool peek_offset_cap = false; static char tcp_buf_discard [MAX_WINDOW]; static struct iovec iov_sock [TCP_FRAMES_MEM + 1]; @@ -582,6 +583,14 @@ static_assert(ARRAY_SIZE(tc_hash) >= FLOW_MAX, int init_sock_pool4 [TCP_SOCK_POOL_SIZE]; int init_sock_pool6 [TCP_SOCK_POOL_SIZE]; +static void set_peek_offset(int s, int offset) +{ + if (!peek_offset_cap) + return; + if (setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &offset, sizeof(offset))) + perror("Failed to set SO_PEEK_OFF\n"); +} + /** * tcp_conn_epoll_events() - epoll events mask for given connection state * @events: Current connection events @@ -1951,7 +1960,7 @@ static void tcp_conn_from_tap(struct ctx *c, if (bind(s, (struct sockaddr *)&addr6_ll, sizeof(addr6_ll))) goto cancel; } - + set_peek_offset(s, 0); conn = &flow->tcp; conn->f.type = FLOW_TCP; conn->sock = s; @@ -2174,6 +2183,15 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) if (iov_rem) iov_sock[fill_bufs].iov_len = iov_rem; + if (peek_offset_cap) { + /* Don't use discard buffer */ + mh_sock.msg_iov = &iov_sock[1]; + mh_sock.msg_iovlen -= 1; + + /* Keep kernel sk_peek_off in synch */ + set_peek_offset(s, already_sent); + } + /* Receive into buffers, don't dequeue until acknowledged by guest. 
*/ do len = recvmsg(s, &mh_sock, MSG_PEEK); @@ -2195,7 +2213,9 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) return 0; } - sendlen = len - already_sent; + sendlen = len; + if (!peek_offset_cap) + sendlen -= already_sent; if (sendlen <= 0) { conn_flag(c, conn, STALLED); return 0; @@ -2718,6 +2738,7 @@ void tcp_listen_handler(struct ctx *c, union epoll_ref ref, tcp_splice_conn_from_sock(c, ref.tcp_listen, &flow->tcp_splice, s, (struct sockaddr *)&sa)) return; + set_peek_offset(s, 0); tcp_tap_conn_from_sock(c, ref.tcp_listen, &flow->tcp, s, (struct sockaddr *)&sa, now); @@ -3042,6 +3063,7 @@ static void tcp_sock_refill_init(const struct ctx *c) int tcp_init(struct ctx *c) { unsigned b; + int s; for (b = 0; b < TCP_HASH_TABLE_SIZE; b++) tc_hash[b] = FLOW_SIDX_NONE; @@ -3065,6 +3087,17 @@ int tcp_init(struct ctx *c) NS_CALL(tcp_ns_socks_init, c); } + /* Probe for SO_PEEK_OFF support */ + s = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); + if (s < 0) { + perror("Temporary tcp socket creation failed\n"); + } else { + if (!setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &(int){0}, sizeof(int))) { + peek_offset_cap = true; + } + close(s); + } + printf("SO_PEEK_OFF%ssupported\n", peek_offset_cap ? " " : " not "); return 0; } -- 2.42.0 ^ permalink raw reply related [flat|nested] 15+ messages in thread
* Re: [PATCH 1/2] tcp: leverage support of SO_PEEK_OFF socket option when available 2024-04-20 19:19 ` [PATCH 1/2] tcp: leverage support of SO_PEEK_OFF socket option when available Jon Maloy @ 2024-04-23 17:50 ` Stefano Brivio 2024-04-24 0:48 ` David Gibson 2024-04-24 0:44 ` David Gibson 1 sibling, 1 reply; 15+ messages in thread From: Stefano Brivio @ 2024-04-23 17:50 UTC (permalink / raw) To: Jon Maloy; +Cc: passt-dev, lvivier, dgibson On Sat, 20 Apr 2024 15:19:19 -0400 Jon Maloy <jmaloy@redhat.com> wrote: > The kernel may support recvmsg(MSG_PEEK), starting reading data from a > given offset set by the SO_PEEK_OFF socket option. It would be practical to mention in commit message and in a code comment which kernel commit introduced the feature, that is, your commit 05ea491641d3 ("tcp: add support for SO_PEEK_OFF socket option") -- even if it's on net-next, better than nothing (net-next might be rebased but it's quite rare). Also the (forecast, at this point) kernel version where this will be introduced would be nice to have. > This makes it > possible to avoid repeated reading of already read initial bytes of a > received message, hence saving read cycles when forwarding TCP messages > in the host->name space direction. > > In this commit, we add functionality to leverage this feature when available, > while we fall back to the previous behavior when not. > > Measurements with iperf3 shows that throughput increases with 15-20 percent > in the host->namespace direction when this feature is used. 
> > Signed-off-by: Jon Maloy <jmaloy@redhat.com> > --- > tcp.c | 37 +++++++++++++++++++++++++++++++++++-- > 1 file changed, 35 insertions(+), 2 deletions(-) > > diff --git a/tcp.c b/tcp.c > index 905d26f..95d400a 100644 > --- a/tcp.c > +++ b/tcp.c > @@ -505,6 +505,7 @@ static struct tcp_buf_seq_update tcp6_l2_buf_seq_update[TCP_FRAMES_MEM]; > static unsigned int tcp6_l2_buf_used; > > /* recvmsg()/sendmsg() data for tap */ > +static bool peek_offset_cap = false; No need to initialise, it's static. The comment just above referred to tcp_buf_discard and iov_sock ("data"), now you're adding a flag just under it, which is a bit confusing. I would rather leave the original comment for tcp_buf_discard and iov_sock, and add another one, say...: > static char tcp_buf_discard [MAX_WINDOW]; > static struct iovec iov_sock [TCP_FRAMES_MEM + 1]; /* Does the kernel support TCP_PEEK_OFF? */ static bool peek_offset_cap; > @@ -582,6 +583,14 @@ static_assert(ARRAY_SIZE(tc_hash) >= FLOW_MAX, > int init_sock_pool4 [TCP_SOCK_POOL_SIZE]; > int init_sock_pool6 [TCP_SOCK_POOL_SIZE]; > > +static void set_peek_offset(int s, int offset) A kerneldoc-style function comment would be nice, like all other functions in this file: /** * set_peek_offset() - Set SO_PEEK_OFF offset on a socket if supported * @s Socket * @offset Offset in bytes */ > +{ > + if (!peek_offset_cap) > + return; > + if (setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &offset, sizeof(offset))) gcc ('make') says: In function ‘set_peek_offset’, inlined from ‘tcp_listen_handler’ at tcp.c:2819:2: tcp.c:592:13: warning: ‘s’ may be used uninitialized [-Wmaybe-uninitialized] 592 | if (setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &offset, sizeof(offset))) | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ tcp.c: In function ‘tcp_listen_handler’: tcp.c:2815:13: note: ‘s’ was declared here 2815 | int s; | ^ clang-tidy ('make clang-tidy'): /home/sbrivio/passt/tcp.c:2819:2: error: 1st function call argument is an uninitialized value 
[clang-analyzer-core.CallAndMessage,-warnings-as-errors] set_peek_offset(s, 0); ^ ~ /home/sbrivio/passt/tcp.c:2815:2: note: 's' declared without an initial value int s; ^~~~~ /home/sbrivio/passt/tcp.c:2817:6: note: Assuming field 'no_tcp' is 0 if (c->no_tcp || !(flow = flow_alloc())) ^~~~~~~~~ /home/sbrivio/passt/tcp.c:2817:6: note: Left side of '||' is false /home/sbrivio/passt/tcp.c:2817:21: note: Assuming 'flow' is non-null if (c->no_tcp || !(flow = flow_alloc())) ^~~~ /home/sbrivio/passt/tcp.c:2817:2: note: Taking false branch if (c->no_tcp || !(flow = flow_alloc())) ^ /home/sbrivio/passt/tcp.c:2819:2: note: 1st function call argument is an uninitialized value set_peek_offset(s, 0); ^ ~ /home/sbrivio/passt/tcp.c:2819:18: error: variable 's' is uninitialized when used here [clang-diagnostic-uninitialized,-warnings-as-errors] set_peek_offset(s, 0); ^ /home/sbrivio/passt/tcp.c:2815:7: note: initialize the variable 's' to silence this warning int s; ^ = 0 and cppcheck ('make cppcheck'): tcp.c:2819:18: error: Uninitialized variable: s [uninitvar] set_peek_offset(s, 0); ^ tcp.c:2817:16: note: Assuming condition is false if (c->no_tcp || !(flow = flow_alloc())) ^ tcp.c:2819:18: note: Uninitialized variable: s set_peek_offset(s, 0); ^ make: *** [Makefile:296: cppcheck] Error 1 > + perror("Failed to set SO_PEEK_OFF\n"); > +} > + > /** > * tcp_conn_epoll_events() - epoll events mask for given connection state > * @events: Current connection events > @@ -1951,7 +1960,7 @@ static void tcp_conn_from_tap(struct ctx *c, > if (bind(s, (struct sockaddr *)&addr6_ll, sizeof(addr6_ll))) > goto cancel; > } > - Spurious change. > + set_peek_offset(s, 0); Do we really need to initialise it to zero on a new connection? Extra system calls on this path matter for latency of connection establishment. 
> conn = &flow->tcp; > conn->f.type = FLOW_TCP; > conn->sock = s; > @@ -2174,6 +2183,15 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) > if (iov_rem) > iov_sock[fill_bufs].iov_len = iov_rem; > > + if (peek_offset_cap) { > + /* Don't use discard buffer */ > + mh_sock.msg_iov = &iov_sock[1]; > + mh_sock.msg_iovlen -= 1; > + > + /* Keep kernel sk_peek_off in synch */ While Wiktionary lists "synch" as "Alternative spelling of sync", I would argue that "sync" is much more common and less surprising. > + set_peek_offset(s, already_sent); Note that there's an early return before this point where already_sent is set to 0. Don't you need to set_peek_offset() also in that case? I guess this would be easier to follow if both assignments of already_sent, earlier in this function, were followed by a set_peek_offset() call. Or do we risk calling it spuriously? > + } > + > /* Receive into buffers, don't dequeue until acknowledged by guest. */ > do > len = recvmsg(s, &mh_sock, MSG_PEEK); > @@ -2195,7 +2213,9 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) > return 0; > } > > - sendlen = len - already_sent; > + sendlen = len; > + if (!peek_offset_cap) > + sendlen -= already_sent; > if (sendlen <= 0) { > conn_flag(c, conn, STALLED); > return 0; > @@ -2718,6 +2738,7 @@ void tcp_listen_handler(struct ctx *c, union epoll_ref ref, > tcp_splice_conn_from_sock(c, ref.tcp_listen, &flow->tcp_splice, > s, (struct sockaddr *)&sa)) > return; > + set_peek_offset(s, 0); Same here, do we really need to initialise it to zero? If yes, it would be nice to leave a blank line after that 'return' as it was before. 
> > tcp_tap_conn_from_sock(c, ref.tcp_listen, &flow->tcp, s, > (struct sockaddr *)&sa, now); > @@ -3042,6 +3063,7 @@ static void tcp_sock_refill_init(const struct ctx *c) > int tcp_init(struct ctx *c) > { > unsigned b; > + int s; > > for (b = 0; b < TCP_HASH_TABLE_SIZE; b++) > tc_hash[b] = FLOW_SIDX_NONE; > @@ -3065,6 +3087,17 @@ int tcp_init(struct ctx *c) > NS_CALL(tcp_ns_socks_init, c); > } > > + /* Probe for SO_PEEK_OFF support */ > + s = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); By default, we should always pass SOCK_CLOEXEC to socket(), just in case we miss closing one. > + if (s < 0) { > + perror("Temporary tcp socket creation failed\n"); This should be a warn() call, and s/tcp/TCP/. > + } else { > + if (!setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &(int){0}, sizeof(int))) { No particular reason to exceed 80 columns here, and curly brackets aren't needed (coding style is the same as kernel). For consistency with the rest of the codebase: &((int) { 0 }). > + peek_offset_cap = true; > + } > + close(s); > + } > + printf("SO_PEEK_OFF%ssupported\n", peek_offset_cap ? " " : " not "); This should be a debug() call, a bit too much for info(). > return 0; > } > ...I'm still reviewing 2/2, sorry for the delay. -- Stefano ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH 1/2] tcp: leverage support of SO_PEEK_OFF socket option when available 2024-04-23 17:50 ` Stefano Brivio @ 2024-04-24 0:48 ` David Gibson 2024-04-24 18:30 ` Stefano Brivio 2024-04-25 23:06 ` Jon Maloy 0 siblings, 2 replies; 15+ messages in thread From: David Gibson @ 2024-04-24 0:48 UTC (permalink / raw) To: Stefano Brivio; +Cc: Jon Maloy, passt-dev, lvivier, dgibson [-- Attachment #1: Type: text/plain, Size: 1032 bytes --] On Tue, Apr 23, 2024 at 07:50:10PM +0200, Stefano Brivio wrote: > On Sat, 20 Apr 2024 15:19:19 -0400 > Jon Maloy <jmaloy@redhat.com> wrote: [snip] > > + set_peek_offset(s, 0); > > Do we really need to initialise it to zero on a new connection? Extra > system calls on this path matter for latency of connection > establishment. Sort of, yes: we need to enable the SO_PEEK_OFF behaviour by setting it to 0, rather than the default -1. We could lazily enable it, but we'd need either to a) do it later in the handshake (maybe when we set ESTABLISHED), but we'd need to be careful it is always set before the first MSG_PEEK or b) keep track of whether it's set on a per-socket basis (this would have the advantage of robustness if we ever encountered a kernel that weirdly allows it for some but not all TCP sockets). -- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson [-- Attachment #2: signature.asc --] [-- Type: application/pgp-signature, Size: 833 bytes --] ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH 1/2] tcp: leverage support of SO_PEEK_OFF socket option when available 2024-04-24 0:48 ` David Gibson @ 2024-04-24 18:30 ` Stefano Brivio 2024-04-26 3:27 ` David Gibson 2024-04-25 23:06 ` Jon Maloy 1 sibling, 1 reply; 15+ messages in thread From: Stefano Brivio @ 2024-04-24 18:30 UTC (permalink / raw) To: David Gibson; +Cc: Jon Maloy, passt-dev, lvivier, dgibson On Wed, 24 Apr 2024 10:48:05 +1000 David Gibson <david@gibson.dropbear.id.au> wrote: > On Tue, Apr 23, 2024 at 07:50:10PM +0200, Stefano Brivio wrote: > > On Sat, 20 Apr 2024 15:19:19 -0400 > > Jon Maloy <jmaloy@redhat.com> wrote: > [snip] > > > + set_peek_offset(s, 0); > > > > Do we really need to initialise it to zero on a new connection? Extra > > system calls on this path matter for latency of connection > > establishment. > > Sort of, yes: we need to enable the SO_PEEK_OFF behaviour by setting > it to 0, rather than the default -1. By the way of which, this is not documented at this point -- a man page patch (linux-man and linux-api lists) would be nice. > We could lazily enable it, but > we'd need either to a) do it later in the handshake (maybe when we set > ESTABLISHED), but we'd need to be careful it is always set before the > first MSG_PEEK I was actually thinking that we could set it only as we receive data (not every connection will receive data), and keep this out of the handshake (which we want to keep "faster", I think). And setting it as we mark a connection as ESTABLISHED should have the same effect on latency as setting it on a new connection -- that's not really lazy. So, actually: > or b) keep track of whether it's set on a per-socket > basis (this would have the advantage of robustness if we ever > encountered a kernel that weirdly allows it for some but not all TCP > sockets). ...this could be done as we receive data in tcp_data_from_sock(), with a new flag in tcp_tap_conn::flags, to avoid adding latency to the handshake. 
It also looks more robust to me, and done/checked in a single place where we need it. We have just three bits left there which isn't great, but if we need to save one at a later point, we can drop this new flag easily. -- Stefano ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH 1/2] tcp: leverage support of SO_PEEK_OFF socket option when available 2024-04-24 18:30 ` Stefano Brivio @ 2024-04-26 3:27 ` David Gibson 2024-04-26 5:58 ` Stefano Brivio 0 siblings, 1 reply; 15+ messages in thread From: David Gibson @ 2024-04-26 3:27 UTC (permalink / raw) To: Stefano Brivio; +Cc: Jon Maloy, passt-dev, lvivier, dgibson [-- Attachment #1: Type: text/plain, Size: 2598 bytes --] On Wed, Apr 24, 2024 at 08:30:44PM +0200, Stefano Brivio wrote: > On Wed, 24 Apr 2024 10:48:05 +1000 > David Gibson <david@gibson.dropbear.id.au> wrote: > > > On Tue, Apr 23, 2024 at 07:50:10PM +0200, Stefano Brivio wrote: > > > On Sat, 20 Apr 2024 15:19:19 -0400 > > > Jon Maloy <jmaloy@redhat.com> wrote: > > [snip] > > > > + set_peek_offset(s, 0); > > > > > > Do we really need to initialise it to zero on a new connection? Extra > > > system calls on this path matter for latency of connection > > > establishment. > > > > Sort of, yes: we need to enable the SO_PEEK_OFF behaviour by setting > > it to 0, rather than the default -1. > > By the way of which, this is not documented at this point -- a man page > patch (linux-man and linux-api lists) would be nice. > > > We could lazily enable it, but > > we'd need either to a) do it later in the handshake (maybe when we set > > ESTABLISHED), but we'd need to be careful it is always set before the > > first MSG_PEEK > > I was actually thinking that we could set it only as we receive data > (not every connection will receive data), and keep this out of the > handshake (which we want to keep "faster", I think). That makes sense, but I think it would need a per-connection flag. > And setting it as we mark a connection as ESTABLISHED should have the > same effect on latency as setting it on a new connection -- that's not > really lazy. So, actually: Good point. 
> > or b) keep track of whether it's set on a per-socket > > basis (this would have the advantage of robustness if we ever > > encountered a kernel that weirdly allows it for some but not all TCP > > sockets). > > ...this could be done as we receive data in tcp_data_from_sock(), with > a new flag in tcp_tap_conn::flags, to avoid adding latency to the > handshake. It also looks more robust to me, and done/checked in a > single place where we need it. > > We have just three bits left there which isn't great, but if we need to > save one at a later point, we can drop this new flag easily. I just realised that folding the feature detection into this is a bit costlier than I thought. If we globally probe the feature we just need one bit per connection: is SO_PEEK_OFF set yet or not. If we tried to probe per-connection we'd need a tristate: haven't tried / SO_PEEK_OFF enabled / tried and failed. -- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson [-- Attachment #2: signature.asc --] [-- Type: application/pgp-signature, Size: 833 bytes --] ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH 1/2] tcp: leverage support of SO_PEEK_OFF socket option when available 2024-04-26 3:27 ` David Gibson @ 2024-04-26 5:58 ` Stefano Brivio 2024-04-29 1:46 ` David Gibson 0 siblings, 1 reply; 15+ messages in thread From: Stefano Brivio @ 2024-04-26 5:58 UTC (permalink / raw) To: David Gibson; +Cc: Jon Maloy, passt-dev, lvivier, dgibson On Fri, 26 Apr 2024 13:27:11 +1000 David Gibson <david@gibson.dropbear.id.au> wrote: > On Wed, Apr 24, 2024 at 08:30:44PM +0200, Stefano Brivio wrote: > > On Wed, 24 Apr 2024 10:48:05 +1000 > > David Gibson <david@gibson.dropbear.id.au> wrote: > > > > > On Tue, Apr 23, 2024 at 07:50:10PM +0200, Stefano Brivio wrote: > > > > On Sat, 20 Apr 2024 15:19:19 -0400 > > > > Jon Maloy <jmaloy@redhat.com> wrote: > > > [snip] > > > > > + set_peek_offset(s, 0); > > > > > > > > Do we really need to initialise it to zero on a new connection? Extra > > > > system calls on this path matter for latency of connection > > > > establishment. > > > > > > Sort of, yes: we need to enable the SO_PEEK_OFF behaviour by setting > > > it to 0, rather than the default -1. > > > > By the way of which, this is not documented at this point -- a man page > > patch (linux-man and linux-api lists) would be nice. > > > > > We could lazily enable it, but > > > we'd need either to a) do it later in the handshake (maybe when we set > > > ESTABLISHED), but we'd need to be careful it is always set before the > > > first MSG_PEEK > > > > I was actually thinking that we could set it only as we receive data > > (not every connection will receive data), and keep this out of the > > handshake (which we want to keep "faster", I think). > > That makes sense, but I think it would need a per-connection flag. Definitely. > > And setting it as we mark a connection as ESTABLISHED should have the > > same effect on latency as setting it on a new connection -- that's not > > really lazy. So, actually: > > Good point. 
> > > > or b) keep track of whether it's set on a per-socket > > > basis (this would have the advantage of robustness if we ever > > > encountered a kernel that weirdly allows it for some but not all TCP > > > sockets). > > > > ...this could be done as we receive data in tcp_data_from_sock(), with > > a new flag in tcp_tap_conn::flags, to avoid adding latency to the > > handshake. It also looks more robust to me, and done/checked in a > > single place where we need it. > > > > We have just three bits left there which isn't great, but if we need to > > save one at a later point, we can drop this new flag easily. > > I just realised that folding the feature detection into this is a bit > costlier than I thought. If we globally probe the feature we just > need one bit per connection: is SO_PEEK_OFF set yet or not. If we > tried to probe per-connection we'd need a tristate: haven't tried / > SO_PEEK_OFF enabled / tried and failed. I forgot to mention this part: what I wanted to propose was actually still a global probe, so that we don't waste one system call per connection on kernels not supporting this (a substantial use case for a couple of years from now?), which probably outweighs the advantage of the weird, purely theoretical kernel not supporting the feature for some sockets only. And then something like PEEK_OFFSET_SET (SO_PEEK_OFF_SET sounds awkward to me) on top. Another advantage is avoiding the tristate you described. -- Stefano ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH 1/2] tcp: leverage support of SO_PEEK_OFF socket option when available 2024-04-26 5:58 ` Stefano Brivio @ 2024-04-29 1:46 ` David Gibson 0 siblings, 0 replies; 15+ messages in thread From: David Gibson @ 2024-04-29 1:46 UTC (permalink / raw) To: Stefano Brivio; +Cc: Jon Maloy, passt-dev, lvivier, dgibson [-- Attachment #1: Type: text/plain, Size: 3630 bytes --] On Fri, Apr 26, 2024 at 07:58:32AM +0200, Stefano Brivio wrote: > On Fri, 26 Apr 2024 13:27:11 +1000 > David Gibson <david@gibson.dropbear.id.au> wrote: > > > On Wed, Apr 24, 2024 at 08:30:44PM +0200, Stefano Brivio wrote: > > > On Wed, 24 Apr 2024 10:48:05 +1000 > > > David Gibson <david@gibson.dropbear.id.au> wrote: > > > > > > > On Tue, Apr 23, 2024 at 07:50:10PM +0200, Stefano Brivio wrote: > > > > > On Sat, 20 Apr 2024 15:19:19 -0400 > > > > > Jon Maloy <jmaloy@redhat.com> wrote: > > > > [snip] > > > > > > + set_peek_offset(s, 0); > > > > > > > > > > Do we really need to initialise it to zero on a new connection? Extra > > > > > system calls on this path matter for latency of connection > > > > > establishment. > > > > > > > > Sort of, yes: we need to enable the SO_PEEK_OFF behaviour by setting > > > > it to 0, rather than the default -1. > > > > > > By the way of which, this is not documented at this point -- a man page > > > patch (linux-man and linux-api lists) would be nice. > > > > > > > We could lazily enable it, but > > > > we'd need either to a) do it later in the handshake (maybe when we set > > > > ESTABLISHED), but we'd need to be careful it is always set before the > > > > first MSG_PEEK > > > > > > I was actually thinking that we could set it only as we receive data > > > (not every connection will receive data), and keep this out of the > > > handshake (which we want to keep "faster", I think). > > > > That makes sense, but I think it would need a per-connection flag. > > Definitely. 
> > > > And setting it as we mark a connection as ESTABLISHED should have the > > > same effect on latency as setting it on a new connection -- that's not > > > really lazy. So, actually: > > > > Good point. > > > > > > or b) keep track of whether it's set on a per-socket > > > > basis (this would have the advantage of robustness if we ever > > > > encountered a kernel that weirdly allows it for some but not all TCP > > > > sockets). > > > > > > ...this could be done as we receive data in tcp_data_from_sock(), with > > > a new flag in tcp_tap_conn::flags, to avoid adding latency to the > > > handshake. It also looks more robust to me, and done/checked in a > > > single place where we need it. > > > > > > We have just three bits left there which isn't great, but if we need to > > > save one at a later point, we can drop this new flag easily. > > > > I just realised that folding the feature detection into this is a bit > > costlier than I thought. If we globally probe the feature we just > > need one bit per connection: is SO_PEEK_OFF set yet or not. If we > > tried to probe per-connection we'd need a tristate: haven't tried / > > SO_PEEK_OFF enabled / tried and failed. > > I forgot to mention this part: what I wanted to propose was actually > still a global probe, so that we don't waste one system call per > connection on kernels not supporting this (a substantial use case for a > couple of years from now?), which probably outweighs the advantage of > the weird, purely theoretical kernel not supporting the feature for > some sockets only. > And then something like PEEK_OFFSET_SET (SO_PEEK_OFF_SET sounds awkward > to me) on top. Another advantage is avoiding the tristate you described. Right, having thought it through I agree this is a better approach. -- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! 
http://www.ozlabs.org/~dgibson [-- Attachment #2: signature.asc --] [-- Type: application/pgp-signature, Size: 833 bytes --] ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH 1/2] tcp: leverage support of SO_PEEK_OFF socket option when available 2024-04-24 0:48 ` David Gibson 2024-04-24 18:30 ` Stefano Brivio @ 2024-04-25 23:06 ` Jon Maloy 1 sibling, 0 replies; 15+ messages in thread From: Jon Maloy @ 2024-04-25 23:06 UTC (permalink / raw) To: David Gibson, Stefano Brivio; +Cc: passt-dev, lvivier, dgibson On 2024-04-23 20:48, David Gibson wrote: > On Tue, Apr 23, 2024 at 07:50:10PM +0200, Stefano Brivio wrote: >> On Sat, 20 Apr 2024 15:19:19 -0400 >> Jon Maloy <jmaloy@redhat.com> wrote: > [snip] >>> + set_peek_offset(s, 0); >> Do we really need to initialise it to zero on a new connection? Extra >> system calls on this path matter for latency of connection >> establishment. Yes. Although I moved it to tcp_conn_from_tap() in the next version, to make it more symmetric with the one in tcp_tap_conn_from_sock() > Sort of, yes: we need to enable the SO_PEEK_OFF behaviour by setting > it to 0, rather than the default -1. We could lazily enable it, but > we'd need either to a) do it later in the handshake (maybe when we set > ESTABLISHED), but we'd need to be careful it is always set before the > first MSG_PEEK or b) keep track of whether it's set on a per-socket > basis (this would have the advantage of robustness if we ever > encountered a kernel that weirdly allows it for some but not all TCP > sockets). > ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH 1/2] tcp: leverage support of SO_PEEK_OFF socket option when available 2024-04-20 19:19 ` [PATCH 1/2] tcp: leverage support of SO_PEEK_OFF socket option when available Jon Maloy 2024-04-23 17:50 ` Stefano Brivio @ 2024-04-24 0:44 ` David Gibson 2024-04-25 23:23 ` Jon Maloy 1 sibling, 1 reply; 15+ messages in thread From: David Gibson @ 2024-04-24 0:44 UTC (permalink / raw) To: Jon Maloy; +Cc: passt-dev, sbrivio, lvivier, dgibson [-- Attachment #1: Type: text/plain, Size: 4751 bytes --] On Sat, Apr 20, 2024 at 03:19:19PM -0400, Jon Maloy wrote: > The kernel may support recvmsg(MSG_PEEK), starting reading data from a Not worth a respin on its own, but I think the comma above is misplaced, and for me makes the sentence much harder to read. > given offset set by the SO_PEEK_OFF socket option. This makes it > possible to avoid repeated reading of already read initial bytes of a > received message, hence saving read cycles when forwarding TCP messages > in the host->name space direction. > > In this commit, we add functionality to leverage this feature when available, > while we fall back to the previous behavior when not. > > Measurements with iperf3 shows that throughput increases with 15-20 percent > in the host->namespace direction when this feature is used. 
> > Signed-off-by: Jon Maloy <jmaloy@redhat.com> > --- > tcp.c | 37 +++++++++++++++++++++++++++++++++++-- > 1 file changed, 35 insertions(+), 2 deletions(-) > > diff --git a/tcp.c b/tcp.c > index 905d26f..95d400a 100644 > --- a/tcp.c > +++ b/tcp.c > @@ -505,6 +505,7 @@ static struct tcp_buf_seq_update tcp6_l2_buf_seq_update[TCP_FRAMES_MEM]; > static unsigned int tcp6_l2_buf_used; > > /* recvmsg()/sendmsg() data for tap */ > +static bool peek_offset_cap = false; > static char tcp_buf_discard [MAX_WINDOW]; > static struct iovec iov_sock [TCP_FRAMES_MEM + 1]; > > @@ -582,6 +583,14 @@ static_assert(ARRAY_SIZE(tc_hash) >= FLOW_MAX, > int init_sock_pool4 [TCP_SOCK_POOL_SIZE]; > int init_sock_pool6 [TCP_SOCK_POOL_SIZE]; > > +static void set_peek_offset(int s, int offset) > +{ > + if (!peek_offset_cap) > + return; > + if (setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &offset, sizeof(offset))) > + perror("Failed to set SO_PEEK_OFF\n"); > +} > + > /** > * tcp_conn_epoll_events() - epoll events mask for given connection state > * @events: Current connection events > @@ -1951,7 +1960,7 @@ static void tcp_conn_from_tap(struct ctx *c, > if (bind(s, (struct sockaddr *)&addr6_ll, sizeof(addr6_ll))) > goto cancel; > } > - > + set_peek_offset(s, 0); > conn = &flow->tcp; > conn->f.type = FLOW_TCP; > conn->sock = s; > @@ -2174,6 +2183,15 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) > if (iov_rem) > iov_sock[fill_bufs].iov_len = iov_rem; > > + if (peek_offset_cap) { > + /* Don't use discard buffer */ > + mh_sock.msg_iov = &iov_sock[1]; > + mh_sock.msg_iovlen -= 1; > + > + /* Keep kernel sk_peek_off in synch */ > + set_peek_offset(s, already_sent); I thought we didn't need to set SO_PEEK_OFF here - that it would track on its own, and we only needed to change it for retransmits. I don't think we even need to calculate 'already_sent' when we have SO_PEEK_OFF. 
In fact - if we set already_sent to 0 here, it might make things a bit cleaner than having to have special cases for adjusting the iov and sendlen. > + } > + > /* Receive into buffers, don't dequeue until acknowledged by guest. */ > do > len = recvmsg(s, &mh_sock, MSG_PEEK); > @@ -2195,7 +2213,9 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) > return 0; > } > > - sendlen = len - already_sent; > + sendlen = len; > + if (!peek_offset_cap) > + sendlen -= already_sent; > if (sendlen <= 0) { > conn_flag(c, conn, STALLED); > return 0; > @@ -2718,6 +2738,7 @@ void tcp_listen_handler(struct ctx *c, union epoll_ref ref, > tcp_splice_conn_from_sock(c, ref.tcp_listen, &flow->tcp_splice, > s, (struct sockaddr *)&sa)) > return; > + set_peek_offset(s, 0); > > tcp_tap_conn_from_sock(c, ref.tcp_listen, &flow->tcp, s, > (struct sockaddr *)&sa, now); > @@ -3042,6 +3063,7 @@ static void tcp_sock_refill_init(const struct ctx *c) > int tcp_init(struct ctx *c) > { > unsigned b; > + int s; > > for (b = 0; b < TCP_HASH_TABLE_SIZE; b++) > tc_hash[b] = FLOW_SIDX_NONE; > @@ -3065,6 +3087,17 @@ int tcp_init(struct ctx *c) > NS_CALL(tcp_ns_socks_init, c); > } > > + /* Probe for SO_PEEK_OFF support */ > + s = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); > + if (s < 0) { > + perror("Temporary tcp socket creation failed\n"); > + } else { > + if (!setsockopt(s, SOL_SOCKET, SO_PEEK_OFF, &(int){0}, sizeof(int))) { > + peek_offset_cap = true; > + } > + close(s); > + } > + printf("SO_PEEK_OFF%ssupported\n", peek_offset_cap ? " " : " not "); Should be an info(). > return 0; > } > -- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson [-- Attachment #2: signature.asc --] [-- Type: application/pgp-signature, Size: 833 bytes --] ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH 1/2] tcp: leverage support of SO_PEEK_OFF socket option when available 2024-04-24 0:44 ` David Gibson @ 2024-04-25 23:23 ` Jon Maloy 2024-04-26 3:29 ` David Gibson 0 siblings, 1 reply; 15+ messages in thread From: Jon Maloy @ 2024-04-25 23:23 UTC (permalink / raw) To: David Gibson; +Cc: passt-dev, sbrivio, lvivier, dgibson On 2024-04-23 20:44, David Gibson wrote: > On Sat, Apr 20, 2024 at 03:19:19PM -0400, Jon Maloy wrote: >> The kernel may support recvmsg(MSG_PEEK), starting reading data from a > Not worth a respin on its own, but I think the comma above is > misplaced, and for me makes the sentence much harder to read. > >> given offset set by the SO_PEEK_OFF socket option. This makes it >> possible to avoid repeated reading of already read initial bytes of a >> received message, hence saving read cycles when forwarding TCP messages >> in the host->name space direction. >> >> In this commit, we add functionality to leverage this feature when available, [...] >> @@ -2174,6 +2183,15 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) >> if (iov_rem) >> iov_sock[fill_bufs].iov_len = iov_rem; >> >> + if (peek_offset_cap) { >> + /* Don't use discard buffer */ >> + mh_sock.msg_iov = &iov_sock[1]; >> + mh_sock.msg_iovlen -= 1; >> + >> + /* Keep kernel sk_peek_off in synch */ >> + set_peek_offset(s, already_sent); > I thought we didn't need to set SO_PEEK_OFF here - that it would track > on its own, and we only needed to change it for retransmits. I don't > think we even need to calculate 'already_sent' when we have > SO_PEEK_OFF. In fact - if we set already_sent to 0 here, it might > make things a bit cleaner than having to have special cases for > adjusting the iov and sendlen. In theory yes. I tried it for a while, using SEQ_GE(max_ack_seq, ack_seq) as criteria for retransmission. 
I observed some strange behavior, like retransmits that seemingly did not come from fast retransmit or timer retransmit, and that the kernel 'sk_peek_off' didn't always have the expected value when comparing with 'already_sent'. Since my focus was on the zero-window issue I decided to skip this for now and take the safe option. I may revisit this later. > >> + } >> + >> /* Receive into buffers, don't dequeue until acknowledged by guest. */ >> do >> len = recvmsg(s, &mh_sock, MSG_PEEK); >> @@ -2195,7 +2213,9 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) >> return 0; >> } >> [...] >> + peek_offset_cap = true; >> + } >> + close(s); >> + } >> + printf("SO_PEEK_OFF%ssupported\n", peek_offset_cap ? " " : " not ") > Should be an info(). Made it a debug() as suggested by Stefano. > >> return 0; >> } >> ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH 1/2] tcp: leverage support of SO_PEEK_OFF socket option when available 2024-04-25 23:23 ` Jon Maloy @ 2024-04-26 3:29 ` David Gibson 0 siblings, 0 replies; 15+ messages in thread From: David Gibson @ 2024-04-26 3:29 UTC (permalink / raw) To: Jon Maloy; +Cc: passt-dev, sbrivio, lvivier, dgibson [-- Attachment #1: Type: text/plain, Size: 2910 bytes --] On Thu, Apr 25, 2024 at 07:23:28PM -0400, Jon Maloy wrote: > > > On 2024-04-23 20:44, David Gibson wrote: > > On Sat, Apr 20, 2024 at 03:19:19PM -0400, Jon Maloy wrote: > > > The kernel may support recvmsg(MSG_PEEK), starting reading data from a > > Not worth a respin on its own, but I think the comma above is > > misplaced, and for me makes the sentence much harder to read. > > > > > given offset set by the SO_PEEK_OFF socket option. This makes it > > > possible to avoid repeated reading of already read initial bytes of a > > > received message, hence saving read cycles when forwarding TCP messages > > > in the host->name space direction. > > > > > > In this commit, we add functionality to leverage this feature when available, > [...] > > > @@ -2174,6 +2183,15 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) > > > if (iov_rem) > > > iov_sock[fill_bufs].iov_len = iov_rem; > > > + if (peek_offset_cap) { > > > + /* Don't use discard buffer */ > > > + mh_sock.msg_iov = &iov_sock[1]; > > > + mh_sock.msg_iovlen -= 1; > > > + > > > + /* Keep kernel sk_peek_off in synch */ > > > + set_peek_offset(s, already_sent); > > I thought we didn't need to set SO_PEEK_OFF here - that it would track > > on its own, and we only needed to change it for retransmits. I don't > > think we even need to calculate 'already_sent' when we have > > SO_PEEK_OFF. In fact - if we set already_sent to 0 here, it might > > make things a bit cleaner than having to have special cases for > > adjusting the iov and sendlen. > In theory yes. 
> I tried it for a while, using SEQ_GE(max_ack_seq, ack_seq) as criteria for > retransmission. > I observed some strange behavior, like retransmits that seemingly did not > come from fast retransmit or timer retransmit, and that the kernel > 'sk_peek_off' > didn't always have the expected value when comparing with 'already_sent'. Ouch, that sounds bad. I'm pretty sure that means there's a bug on one side or the other. > Since my focus was on the zero-window issue I decided to skip this for now > and take the safe option. > I may revisit this later. > > > > > + } > > > + > > > /* Receive into buffers, don't dequeue until acknowledged by guest. */ > > > do > > > len = recvmsg(s, &mh_sock, MSG_PEEK); > > > @@ -2195,7 +2213,9 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) > > > return 0; > > > } > [...] > > > + peek_offset_cap = true; > > > + } > > > + close(s); > > > + } > > > + printf("SO_PEEK_OFF%ssupported\n", peek_offset_cap ? " " : " not ") > > Should be an info(). > Made it a debug() as suggested by Stefano. -- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson [-- Attachment #2: signature.asc --] [-- Type: application/pgp-signature, Size: 833 bytes --] ^ permalink raw reply [flat|nested] 15+ messages in thread
* [PATCH 2/2] tcp: allow retransmit when peer receive window is zero 2024-04-20 19:19 [PATCH 0/2] Support for SO_PEEK_OFF when a available Jon Maloy 2024-04-20 19:19 ` [PATCH 1/2] tcp: leverage support of SO_PEEK_OFF socket option when available Jon Maloy @ 2024-04-20 19:19 ` Jon Maloy 2024-04-24 1:04 ` David Gibson 1 sibling, 1 reply; 15+ messages in thread From: Jon Maloy @ 2024-04-20 19:19 UTC (permalink / raw) To: passt-dev, sbrivio, lvivier, dgibson, jmaloy A bug in kernel TCP may lead to a deadlock where a zero window is sent from the peer, while buffer reads doesn't lead to it being updated. At the same time, the zero window stops this side from sending out more data to trigger new advertisements to be sent from the peer. RFC 793 states that it always is permitted for a sender to send one byte of data even when the window is zero. This resolves the deadlock described above, so we choose to introduce it here as a last resort. We allow it both during fast and as keep-alives when the timer sees no activity on the connection. However, we notice that this solution doesn´t work well. Traffic sometimes goes to zero, and onley recovers after the timer has resolved the situation. Because of this, we chose to improve it slightly: The deadlock happens when a packet has been dropped at the peer end because of memory squeeze. We therefore consider it legitimate to retransmit that packet while considering the window size that was valid at the moment it was first transmitted. This works much better. It should be noted that although this solves the problem we have at hand, it is not a genuine solution to the kernel bug. There may well be TCP stacks around in other OS-es which don't do this probing. 
Signed-off-by: Jon Maloy <jmaloy@redhat.com> --- tcp.c | 26 ++++++++++++++++---------- tcp_conn.h | 2 ++ 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/tcp.c b/tcp.c index 95d400a..9dea151 100644 --- a/tcp.c +++ b/tcp.c @@ -1774,6 +1774,7 @@ static void tcp_seq_init(const struct ctx *c, struct tcp_tap_conn *conn, ns = (now->tv_sec * 1000000000 + now->tv_nsec) >> 5; conn->seq_to_tap = ((uint32_t)(hash >> 32) ^ (uint32_t)hash) + ns; + conn->max_seq_to_tap = conn->seq_to_tap; } /** @@ -2123,9 +2124,8 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn, * * #syscalls recvmsg */ -static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) +static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn, uint32_t wnd_scaled) { - uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap; int fill_bufs, send_bufs = 0, last_len, iov_rem = 0; int sendlen, len, plen, v4 = CONN_V4(conn); int s = conn->sock, i, ret = 0; @@ -2212,6 +2212,9 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) return 0; } + sendlen = len; + if (!peek_offset_cap) + sendlen -= already_sent; sendlen = len; if (!peek_offset_cap) @@ -2241,7 +2244,9 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) tcp_data_to_tap(c, conn, plen, no_csum, seq); seq += plen; } - + /* We need this to know this during retransmission: */ + if (SEQ_GT(seq, conn->max_seq_to_tap)) + conn->max_seq_to_tap = seq; conn_flag(c, conn, ACK_FROM_TAP_DUE); return 0; @@ -2317,8 +2322,7 @@ static int tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn, SEQ_GE(ack_seq, max_ack_seq)) { /* Fast re-transmit */ retr = !len && !th->fin && - ack_seq == max_ack_seq && - ntohs(th->window) == max_ack_seq_wnd; + ack_seq == max_ack_seq; max_ack_seq_wnd = ntohs(th->window); max_ack_seq = ack_seq; @@ -2385,9 +2389,10 @@ static int tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn, flow_trace(conn, "fast re-transmit, ACK: %u, 
previous sequence: %u", max_ack_seq, conn->seq_to_tap); + conn->seq_ack_from_tap = max_ack_seq; conn->seq_to_tap = max_ack_seq; - tcp_data_from_sock(c, conn); + tcp_data_from_sock(c, conn, MAX(1, conn->max_seq_to_tap - conn->seq_ack_from_tap)); } if (!iov_i) @@ -2483,7 +2488,7 @@ static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_tap_conn *conn, /* The client might have sent data already, which we didn't * dequeue waiting for SYN,ACK from tap -- check now. */ - tcp_data_from_sock(c, conn); + tcp_data_from_sock(c, conn, conn->wnd_from_tap << conn->ws_from_tap); tcp_send_flag(c, conn, ACK); } @@ -2575,7 +2580,7 @@ int tcp_tap_handler(struct ctx *c, uint8_t pif, int af, tcp_tap_window_update(conn, ntohs(th->window)); - tcp_data_from_sock(c, conn); + tcp_data_from_sock(c, conn, conn->wnd_from_tap << conn->ws_from_tap); if (p->count - idx == 1) return 1; @@ -2788,7 +2793,7 @@ void tcp_timer_handler(struct ctx *c, union epoll_ref ref) flow_dbg(conn, "ACK timeout, retry"); conn->retrans++; conn->seq_to_tap = conn->seq_ack_from_tap; - tcp_data_from_sock(c, conn); + tcp_data_from_sock(c, conn, MAX(1, conn->max_seq_to_tap - conn->seq_ack_from_tap)); tcp_timer_ctl(c, conn); } } else { @@ -2807,6 +2812,7 @@ void tcp_timer_handler(struct ctx *c, union epoll_ref ref) tcp_rst(c, conn); } } + } /** @@ -2843,7 +2849,7 @@ void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events) conn_event(c, conn, SOCK_FIN_RCVD); if (events & EPOLLIN) - tcp_data_from_sock(c, conn); + tcp_data_from_sock(c, conn, conn->wnd_from_tap << conn->ws_from_tap); if (events & EPOLLOUT) tcp_update_seqack_wnd(c, conn, 0, NULL); diff --git a/tcp_conn.h b/tcp_conn.h index a5f5cfe..afcdec9 100644 --- a/tcp_conn.h +++ b/tcp_conn.h @@ -29,6 +29,7 @@ * @wnd_from_tap: Last window size from tap, unscaled (as received) * @wnd_to_tap: Sending window advertised to tap, unscaled (as sent) * @seq_to_tap: Next sequence for packets to tap + * @max_seq_to_tap: Next seq after highest ever sent. 
Needeed during retransmit * @seq_ack_from_tap: Last ACK number received from tap * @seq_from_tap: Next sequence for packets from tap (not actually sent) * @seq_ack_to_tap: Last ACK number sent to tap @@ -100,6 +101,7 @@ struct tcp_tap_conn { uint16_t wnd_to_tap; uint32_t seq_to_tap; + uint32_t max_seq_to_tap; uint32_t seq_ack_from_tap; uint32_t seq_from_tap; uint32_t seq_ack_to_tap; -- @@ -29,6 +29,7 @@ * @wnd_from_tap: Last window size from tap, unscaled (as received) * @wnd_to_tap: Sending window advertised to tap, unscaled (as sent) * @seq_to_tap: Next sequence for packets to tap + * @max_seq_to_tap: Next seq after highest ever sent. Needeed during retransmit * @seq_ack_from_tap: Last ACK number received from tap * @seq_from_tap: Next sequence for packets from tap (not actually sent) * @seq_ack_to_tap: Last ACK number sent to tap @@ -100,6 +101,7 @@ struct tcp_tap_conn { uint16_t wnd_to_tap; uint32_t seq_to_tap; + uint32_t max_seq_to_tap; uint32_t seq_ack_from_tap; uint32_t seq_from_tap; uint32_t seq_ack_to_tap; -- 2.42.0 ^ permalink raw reply related [flat|nested] 15+ messages in thread
* Re: [PATCH 2/2] tcp: allow retransmit when peer receive window is zero 2024-04-20 19:19 ` [PATCH 2/2] tcp: allow retransmit when peer receive window is zero Jon Maloy @ 2024-04-24 1:04 ` David Gibson 2024-04-24 18:31 ` Stefano Brivio 0 siblings, 1 reply; 15+ messages in thread From: David Gibson @ 2024-04-24 1:04 UTC (permalink / raw) To: Jon Maloy; +Cc: passt-dev, sbrivio, lvivier, dgibson [-- Attachment #1: Type: text/plain, Size: 7413 bytes --] On Sat, Apr 20, 2024 at 03:19:20PM -0400, Jon Maloy wrote: > A bug in kernel TCP may lead to a deadlock where a zero window is sent > from the peer, while buffer reads doesn't lead to it being updated. > At the same time, the zero window stops this side from sending out > more data to trigger new advertisements to be sent from the peer. This is a bit confusing to me. I guess the buffer reads must be happening on the peer, not "this side", but that's not very clear. I think "received from the peer" might be better than "sent from the peer" to make it more obvious that the point of view is uniformly from "this side". > RFC 793 states that it always is permitted for a sender to send one byte > of data even when the window is zero. This resolves the deadlock described > above, so we choose to introduce it here as a last resort. We allow it both > during fast and as keep-alives when the timer sees no activity on > the connection. Looks like there's a missing noun after "during fast". > However, we notice that this solution doesn´t work well. Traffic sometimes > goes to zero, and onley recovers after the timer has resolved the situation. s/onley/only/ > Because of this, we chose to improve it slightly: The deadlock happens when a Is it actually useful to describe the "bad" workaround if we're using a different one? I don't see how the better workaround is an obvious extension of the bad one. > packet has been dropped at the peer end because of memory squeeze. 
We therefore > consider it legitimate to retransmit that packet while considering the window > size that was valid at the moment it was first transmitted. This works > much better. > > It should be noted that although this solves the problem we have at hand, > it is not a genuine solution to the kernel bug. There may well be TCP stacks > around in other OS-es which don't do this probing. > > Signed-off-by: Jon Maloy <jmaloy@redhat.com> > --- > tcp.c | 26 ++++++++++++++++---------- > tcp_conn.h | 2 ++ > 2 files changed, 18 insertions(+), 10 deletions(-) > > diff --git a/tcp.c b/tcp.c > index 95d400a..9dea151 100644 > --- a/tcp.c > +++ b/tcp.c > @@ -1774,6 +1774,7 @@ static void tcp_seq_init(const struct ctx *c, struct tcp_tap_conn *conn, > ns = (now->tv_sec * 1000000000 + now->tv_nsec) >> 5; > > conn->seq_to_tap = ((uint32_t)(hash >> 32) ^ (uint32_t)hash) + ns; > + conn->max_seq_to_tap = conn->seq_to_tap; > } > > /** > @@ -2123,9 +2124,8 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn, > * > * #syscalls recvmsg > */ > -static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) > +static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn, uint32_t wnd_scaled) > { > - uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap; > int fill_bufs, send_bufs = 0, last_len, iov_rem = 0; > int sendlen, len, plen, v4 = CONN_V4(conn); > int s = conn->sock, i, ret = 0; > @@ -2212,6 +2212,9 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) > > return 0; > } > + sendlen = len; > + if (!peek_offset_cap) > + sendlen -= already_sent; > > sendlen = len; > if (!peek_offset_cap) Looks like some duplicated lines from a bad rebase. 
> @@ -2241,7 +2244,9 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) > tcp_data_to_tap(c, conn, plen, no_csum, seq); > seq += plen; > } > - > + /* We need this to know this during retransmission: */ > + if (SEQ_GT(seq, conn->max_seq_to_tap)) > + conn->max_seq_to_tap = seq; > conn_flag(c, conn, ACK_FROM_TAP_DUE); > > return 0; > @@ -2317,8 +2322,7 @@ static int tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn, > SEQ_GE(ack_seq, max_ack_seq)) { > /* Fast re-transmit */ > retr = !len && !th->fin && > - ack_seq == max_ack_seq && > - ntohs(th->window) == max_ack_seq_wnd; > + ack_seq == max_ack_seq; > > max_ack_seq_wnd = ntohs(th->window); > max_ack_seq = ack_seq; > @@ -2385,9 +2389,10 @@ static int tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn, > flow_trace(conn, > "fast re-transmit, ACK: %u, previous sequence: %u", > max_ack_seq, conn->seq_to_tap); > + > conn->seq_ack_from_tap = max_ack_seq; > conn->seq_to_tap = max_ack_seq; > - tcp_data_from_sock(c, conn); > + tcp_data_from_sock(c, conn, MAX(1, conn->max_seq_to_tap - conn->seq_ack_from_tap)); Does the MAX do anything: if max_seq_to_tap equals seq_ack_from_tap then, by definition, there is nothing to retransmit - everything has been acked. > } > > if (!iov_i) > @@ -2483,7 +2488,7 @@ static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_tap_conn *conn, > /* The client might have sent data already, which we didn't > * dequeue waiting for SYN,ACK from tap -- check now. 
> */ > - tcp_data_from_sock(c, conn); > + tcp_data_from_sock(c, conn, conn->wnd_from_tap << conn->ws_from_tap); > tcp_send_flag(c, conn, ACK); > } > > @@ -2575,7 +2580,7 @@ int tcp_tap_handler(struct ctx *c, uint8_t pif, int af, > > tcp_tap_window_update(conn, ntohs(th->window)); > > - tcp_data_from_sock(c, conn); > + tcp_data_from_sock(c, conn, conn->wnd_from_tap << conn->ws_from_tap); > > if (p->count - idx == 1) > return 1; > @@ -2788,7 +2793,7 @@ void tcp_timer_handler(struct ctx *c, union epoll_ref ref) > flow_dbg(conn, "ACK timeout, retry"); > conn->retrans++; > conn->seq_to_tap = conn->seq_ack_from_tap; > - tcp_data_from_sock(c, conn); > + tcp_data_from_sock(c, conn, MAX(1, conn->max_seq_to_tap - conn->seq_ack_from_tap)); > tcp_timer_ctl(c, conn); > } > } else { > @@ -2807,6 +2812,7 @@ void tcp_timer_handler(struct ctx *c, union epoll_ref ref) > tcp_rst(c, conn); > } > } > + Spurious change > } > > /** > @@ -2843,7 +2849,7 @@ void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events) > conn_event(c, conn, SOCK_FIN_RCVD); > > if (events & EPOLLIN) > - tcp_data_from_sock(c, conn); > + tcp_data_from_sock(c, conn, conn->wnd_from_tap << conn->ws_from_tap); > > if (events & EPOLLOUT) > tcp_update_seqack_wnd(c, conn, 0, NULL); > diff --git a/tcp_conn.h b/tcp_conn.h > index a5f5cfe..afcdec9 100644 > --- a/tcp_conn.h > +++ b/tcp_conn.h > @@ -29,6 +29,7 @@ > * @wnd_from_tap: Last window size from tap, unscaled (as received) > * @wnd_to_tap: Sending window advertised to tap, unscaled (as sent) > * @seq_to_tap: Next sequence for packets to tap > + * @max_seq_to_tap: Next seq after highest ever sent. 
Needeed during retransmit > * @seq_ack_from_tap: Last ACK number received from tap > * @seq_from_tap: Next sequence for packets from tap (not actually sent) > * @seq_ack_to_tap: Last ACK number sent to tap > @@ -100,6 +101,7 @@ struct tcp_tap_conn { > uint16_t wnd_to_tap; > > uint32_t seq_to_tap; > + uint32_t max_seq_to_tap; > uint32_t seq_ack_from_tap; > uint32_t seq_from_tap; > uint32_t seq_ack_to_tap; -- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson [-- Attachment #2: signature.asc --] [-- Type: application/pgp-signature, Size: 833 bytes --] ^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH 2/2] tcp: allow retransmit when peer receive window is zero 2024-04-24 1:04 ` David Gibson @ 2024-04-24 18:31 ` Stefano Brivio 0 siblings, 0 replies; 15+ messages in thread From: Stefano Brivio @ 2024-04-24 18:31 UTC (permalink / raw) To: Jon Maloy; +Cc: David Gibson, passt-dev, lvivier, dgibson On top of David's comments: On Wed, 24 Apr 2024 11:04:51 +1000 David Gibson <david@gibson.dropbear.id.au> wrote: > On Sat, Apr 20, 2024 at 03:19:20PM -0400, Jon Maloy wrote: > > A bug in kernel TCP may lead to a deadlock where a zero window is sent > > from the peer, while buffer reads doesn't lead to it being updated. > > At the same time, the zero window stops this side from sending out > > more data to trigger new advertisements to be sent from the peer. > > This is a bit confusing to me. I guess the buffer reads must be > happening on the peer, not "this side", but that's not very clear. I > think "received from the peer" might be better than "sent from the > peer" to make it more obvious that the point of view is uniformly from > "this side". > > > RFC 793 states that it always is permitted for a sender to send one byte > > of data even when the window is zero. As incredible as it might sound, RFC 793 is now obsolete :) -- you should refer to RFC 9293 instead. Further, RFC 9293 3.8.6.1 (just like RFC 793) says that we *must* (albeit not MUST) "regularly" send that octet -- as long as we have one available. Not that it's permitted. Hence, a general comment on this patch: my understanding is that we want to implement zero-window probing, instead of triggering a fast re-transmit whenever we get a keep-alive packet from the peer, because we don't know if we'll get one. RFC 9293 3.8.6.1 goes on saying: The transmitting host SHOULD send the first zero-window probe when a zero window has existed for the retransmission timeout period (SHLD-29) (Section 3.8.1), and SHOULD increase exponentially the interval between successive probes (SHLD-30). 
but we currently don't compute the retransmission timeout according to RFC 6298 (yeah, I know...). Given that it's a SHOULD, and complying with that is clearly beyond the scope of this change, we can just use one of the existing timeouts and timers (ACK_TIMEOUT would do, for the moment). > > This resolves the deadlock described > > above, so we choose to introduce it here as a last resort. We allow it both > > during fast and as keep-alives when the timer sees no activity on > > the connection. > > Looks like there's a missing noun after "during fast". > > > However, we notice that this solution doesn´t work well. Traffic sometimes > > goes to zero, and onley recovers after the timer has resolved the situation. > > s/onley/only/ > > > Because of this, we chose to improve it slightly: The deadlock happens when a > > Is it actually useful to describe the "bad" workaround if we're using > a different one? I don't see how the better workaround is an obvious > extension of the bad one. > > > packet has been dropped at the peer end because of memory squeeze. We therefore > > consider it legitimate to retransmit that packet while considering the window > > size that was valid at the moment it was first transmitted. This works > > much better. > > > > It should be noted that although this solves the problem we have at hand, > > it is not a genuine solution to the kernel bug. There may well be TCP stacks > > around in other OS-es which don't do this probing. 
> > > > Signed-off-by: Jon Maloy <jmaloy@redhat.com> > > --- > > tcp.c | 26 ++++++++++++++++---------- > > tcp_conn.h | 2 ++ > > 2 files changed, 18 insertions(+), 10 deletions(-) > > > > diff --git a/tcp.c b/tcp.c > > index 95d400a..9dea151 100644 > > --- a/tcp.c > > +++ b/tcp.c > > @@ -1774,6 +1774,7 @@ static void tcp_seq_init(const struct ctx *c, struct tcp_tap_conn *conn, > > ns = (now->tv_sec * 1000000000 + now->tv_nsec) >> 5; > > > > conn->seq_to_tap = ((uint32_t)(hash >> 32) ^ (uint32_t)hash) + ns; > > + conn->max_seq_to_tap = conn->seq_to_tap; > > } > > > > /** > > @@ -2123,9 +2124,8 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn, > > * > > * #syscalls recvmsg > > */ > > -static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) > > +static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn, uint32_t wnd_scaled) This exceeds 80 columns for no particularly good reason, and the function comment should be updated. > > { > > - uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap; > > int fill_bufs, send_bufs = 0, last_len, iov_rem = 0; > > int sendlen, len, plen, v4 = CONN_V4(conn); > > int s = conn->sock, i, ret = 0; > > @@ -2212,6 +2212,9 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) > > > > return 0; > > } > > + sendlen = len; > > + if (!peek_offset_cap) > > + sendlen -= already_sent; > > > > sendlen = len; > > if (!peek_offset_cap) > > Looks like some duplicated lines from a bad rebase. 
> > > @@ -2241,7 +2244,9 @@ static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) > > tcp_data_to_tap(c, conn, plen, no_csum, seq); > > seq += plen; > > } > > - > > + /* We need this to know this during retransmission: */ s/this to know this/to know this/ (I guess) > > + if (SEQ_GT(seq, conn->max_seq_to_tap)) > > + conn->max_seq_to_tap = seq; > > conn_flag(c, conn, ACK_FROM_TAP_DUE); > > > > return 0; > > @@ -2317,8 +2322,7 @@ static int tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn, > > SEQ_GE(ack_seq, max_ack_seq)) { > > /* Fast re-transmit */ > > retr = !len && !th->fin && > > - ack_seq == max_ack_seq && > > - ntohs(th->window) == max_ack_seq_wnd; > > + ack_seq == max_ack_seq; This matches any keep-alive (that is, a segment without data and without a window update) we get: I think you should replace the existing condition with a check that the window is zero, instead. > > > > max_ack_seq_wnd = ntohs(th->window); > > max_ack_seq = ack_seq; > > @@ -2385,9 +2389,10 @@ static int tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn, > > flow_trace(conn, > > "fast re-transmit, ACK: %u, previous sequence: %u", > > max_ack_seq, conn->seq_to_tap); > > + Spurious change. > > conn->seq_ack_from_tap = max_ack_seq; > > conn->seq_to_tap = max_ack_seq; > > - tcp_data_from_sock(c, conn); > > + tcp_data_from_sock(c, conn, MAX(1, conn->max_seq_to_tap - conn->seq_ack_from_tap)); > > Does the MAX do anything: if max_seq_to_tap equals seq_ack_from_tap > then, by definition, there is nothing to retransmit - everything has > been acked. > > > } > > > > if (!iov_i) > > @@ -2483,7 +2488,7 @@ static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_tap_conn *conn, > > /* The client might have sent data already, which we didn't > > * dequeue waiting for SYN,ACK from tap -- check now. 
> > */ > > - tcp_data_from_sock(c, conn); > > + tcp_data_from_sock(c, conn, conn->wnd_from_tap << conn->ws_from_tap); > > tcp_send_flag(c, conn, ACK); > > } > > > > @@ -2575,7 +2580,7 @@ int tcp_tap_handler(struct ctx *c, uint8_t pif, int af, > > > > tcp_tap_window_update(conn, ntohs(th->window)); > > > > - tcp_data_from_sock(c, conn); > > + tcp_data_from_sock(c, conn, conn->wnd_from_tap << conn->ws_from_tap); > > > > if (p->count - idx == 1) > > return 1; > > @@ -2788,7 +2793,7 @@ void tcp_timer_handler(struct ctx *c, union epoll_ref ref) > > flow_dbg(conn, "ACK timeout, retry"); > > conn->retrans++; > > conn->seq_to_tap = conn->seq_ack_from_tap; > > - tcp_data_from_sock(c, conn); > > + tcp_data_from_sock(c, conn, MAX(1, conn->max_seq_to_tap - conn->seq_ack_from_tap)); > > tcp_timer_ctl(c, conn); > > } > > } else { > > @@ -2807,6 +2812,7 @@ void tcp_timer_handler(struct ctx *c, union epoll_ref ref) > > tcp_rst(c, conn); > > } > > } > > + > > Spurious change > > > } > > > > /** > > @@ -2843,7 +2849,7 @@ void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events) > > conn_event(c, conn, SOCK_FIN_RCVD); > > > > if (events & EPOLLIN) > > - tcp_data_from_sock(c, conn); > > + tcp_data_from_sock(c, conn, conn->wnd_from_tap << conn->ws_from_tap); > > > > if (events & EPOLLOUT) > > tcp_update_seqack_wnd(c, conn, 0, NULL); > > diff --git a/tcp_conn.h b/tcp_conn.h > > index a5f5cfe..afcdec9 100644 > > --- a/tcp_conn.h > > +++ b/tcp_conn.h > > @@ -29,6 +29,7 @@ > > * @wnd_from_tap: Last window size from tap, unscaled (as received) > > * @wnd_to_tap: Sending window advertised to tap, unscaled (as sent) > > * @seq_to_tap: Next sequence for packets to tap > > + * @max_seq_to_tap: Next seq after highest ever sent. Needeed during retransmit Sequence numbers regularly wrap around (they are 32 bits), so you can't really define this. I'm not entirely sure how you use it, though -- I have the same question about the usage of MAX() above. 
> > * @seq_ack_from_tap: Last ACK number received from tap > > * @seq_from_tap: Next sequence for packets from tap (not actually sent) > > * @seq_ack_to_tap: Last ACK number sent to tap > > @@ -100,6 +101,7 @@ struct tcp_tap_conn { > > uint16_t wnd_to_tap; > > > > uint32_t seq_to_tap; > > + uint32_t max_seq_to_tap; > > uint32_t seq_ack_from_tap; > > uint32_t seq_from_tap; > > uint32_t seq_ack_to_tap; > -- Stefano ^ permalink raw reply [flat|nested] 15+ messages in thread
end of thread, other threads:[~2024-04-29 1:46 UTC | newest] Thread overview: 15+ messages (download: mbox.gz / follow: Atom feed) -- links below jump to the message on this page -- 2024-04-20 19:19 [PATCH 0/2] Support for SO_PEEK_OFF when a available Jon Maloy 2024-04-20 19:19 ` [PATCH 1/2] tcp: leverage support of SO_PEEK_OFF socket option when available Jon Maloy 2024-04-23 17:50 ` Stefano Brivio 2024-04-24 0:48 ` David Gibson 2024-04-24 18:30 ` Stefano Brivio 2024-04-26 3:27 ` David Gibson 2024-04-26 5:58 ` Stefano Brivio 2024-04-29 1:46 ` David Gibson 2024-04-25 23:06 ` Jon Maloy 2024-04-24 0:44 ` David Gibson 2024-04-25 23:23 ` Jon Maloy 2024-04-26 3:29 ` David Gibson 2024-04-20 19:19 ` [PATCH 2/2] tcp: allow retransmit when peer receive window is zero Jon Maloy 2024-04-24 1:04 ` David Gibson 2024-04-24 18:31 ` Stefano Brivio
Code repositories for project(s) associated with this public inbox https://passt.top/passt This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for IMAP folder(s).