On Fri, Feb 02, 2024 at 03:11:35PM +0100, Laurent Vivier wrote: This definitely needs a commit message to explain what you're trying to achieve here. Without further context "buffer management" suggests to me allocation / freeing / placement of buffers, whereas what's actually being moved here is code related to the construction of headers within the buffers. > Signed-off-by: Laurent Vivier > --- > tcp.c | 224 +++++++++++++++++++++++++++++++++------------------------- > 1 file changed, 129 insertions(+), 95 deletions(-) > > diff --git a/tcp.c b/tcp.c > index 2fd6bc2eda53..20ad8a4e5271 100644 > --- a/tcp.c > +++ b/tcp.c > @@ -1320,87 +1320,98 @@ void tcp_defer_handler(struct ctx *c) > tcp_l2_data_buf_flush(c); > } > Function comment. > +static void tcp_set_tcp_header(struct tcphdr *th, > + const struct tcp_tap_conn *conn, uint32_t seq) > +{ > + th->source = htons(conn->fport); > + th->dest = htons(conn->eport); > + th->seq = htonl(seq); > + th->ack_seq = htonl(conn->seq_ack_to_tap); > + if (conn->events & ESTABLISHED) { > + th->window = htons(conn->wnd_to_tap); > + } else { > + unsigned wnd = conn->wnd_to_tap << conn->ws_to_tap; > + > + th->window = htons(MIN(wnd, USHRT_MAX)); > + } > +} > + > /** > - * tcp_l2_buf_fill_headers() - Fill 802.3, IP, TCP headers in pre-cooked buffers > + * ipv4_fill_headers() - Fill 802.3, IPv4, TCP headers in pre-cooked buffers > * @c: Execution context > * @conn: Connection pointer > - * @p: Pointer to any type of TCP pre-cooked buffer > + * @iph: Pointer to IPv4 header, immediately followed by a TCP header > * @plen: Payload length (including TCP header options) > * @check: Checksum, if already known > * @seq: Sequence number for this segment > * > - * Return: frame length including L2 headers, host order > + * Return: IP frame length including L2 headers, host order This is an odd case where adding an extra descriptor is making it less clear to me. Without context, "frame" suggests to me an entire L2 frame containing whatever inner protocol. But I'm not immediately sure what "IP frame" means: is it an L2 frame which happens to have IP inside, or does it mean just the IP packet without the L2 headers. The rest of the sentence clarifies it's the first, but it still throws me for a moment. > */ > -static size_t tcp_l2_buf_fill_headers(const struct ctx *c, > - const struct tcp_tap_conn *conn, > - void *p, size_t plen, > - const uint16_t *check, uint32_t seq) > + > +static size_t ipv4_fill_headers(const struct ctx *c, > + const struct tcp_tap_conn *conn, > + struct iphdr *iph, size_t plen, > + const uint16_t *check, uint32_t seq) I really like this re-organization of the header filling code. I wonder if separating it into a separate patch might make the remainder patch easier to follow. > { > + struct tcphdr *th = (void *)(iph + 1); > const struct in_addr *a4 = inany_v4(&conn->faddr); > - size_t ip_len, tlen; > - > -#define SET_TCP_HEADER_COMMON_V4_V6(b, conn, seq) \ > -do { \ > - b->th.source = htons(conn->fport); \ > - b->th.dest = htons(conn->eport); \ > - b->th.seq = htonl(seq); \ > - b->th.ack_seq = htonl(conn->seq_ack_to_tap); \ > - if (conn->events & ESTABLISHED) { \ > - b->th.window = htons(conn->wnd_to_tap); \ > - } else { \ > - unsigned wnd = conn->wnd_to_tap << conn->ws_to_tap; \ > - \ > - b->th.window = htons(MIN(wnd, USHRT_MAX)); \ > - } \ > -} while (0) > - > - if (a4) { > - struct tcp4_l2_buf_t *b = (struct tcp4_l2_buf_t *)p; > - > - ip_len = plen + sizeof(struct iphdr) + sizeof(struct tcphdr); > - b->iph.tot_len = htons(ip_len); > - b->iph.saddr = a4->s_addr; > - b->iph.daddr = c->ip4.addr_seen.s_addr; > - > - b->iph.check = check ? *check : > - ipv4_hdr_checksum(&b->iph, IPPROTO_TCP); > - > - SET_TCP_HEADER_COMMON_V4_V6(b, conn, seq); > - > - b->th.check = tcp_update_check_tcp4(&b->iph); > - > - tlen = tap_iov_len(c, &b->taph, ip_len); > - } else { > - struct tcp6_l2_buf_t *b = (struct tcp6_l2_buf_t *)p; > + size_t ip_len = plen + sizeof(struct iphdr) + sizeof(struct tcphdr); > > - ip_len = plen + sizeof(struct ipv6hdr) + sizeof(struct tcphdr); > + iph->tot_len = htons(ip_len); > + iph->saddr = a4->s_addr; > + iph->daddr = c->ip4.addr_seen.s_addr; > > - b->ip6h.payload_len = htons(plen + sizeof(struct tcphdr)); > - b->ip6h.saddr = conn->faddr.a6; > - if (IN6_IS_ADDR_LINKLOCAL(&b->ip6h.saddr)) > - b->ip6h.daddr = c->ip6.addr_ll_seen; > - else > - b->ip6h.daddr = c->ip6.addr_seen; > + iph->check = check ? *check : ipv4_hdr_checksum(iph, IPPROTO_TCP); > + > + tcp_set_tcp_header(th, conn, seq); > + > + th->check = tcp_update_check_tcp4(iph); > + > + return ip_len; > +} > + > +/** > + * ipv6_fill_headers() - Fill 802.3, IPv6, TCP headers in pre-cooked buffers > + * @c: Execution context > + * @conn: Connection pointer > + * @ip6h: Pointer to IPv6 header, immediately followed by a TCP header > + * @plen: Payload length (including TCP header options) > + * @check: Checksum, if already known > + * @seq: Sequence number for this segment > + * > + * Return: IP frame length including L2 headers, host order > + */ > + > +static size_t ipv6_fill_headers(const struct ctx *c, > + const struct tcp_tap_conn *conn, > + struct ipv6hdr *ip6h, size_t plen, > + uint32_t seq) > +{ > + struct tcphdr *th = (void *)(ip6h + 1); > + size_t ip_len = plen + sizeof(struct ipv6hdr) + sizeof(struct tcphdr); > > - memset(b->ip6h.flow_lbl, 0, 3); > + ip6h->payload_len = htons(plen + sizeof(struct tcphdr)); > + ip6h->saddr = conn->faddr.a6; > + if (IN6_IS_ADDR_LINKLOCAL(&ip6h->saddr)) > + ip6h->daddr = c->ip6.addr_ll_seen; > + else > + ip6h->daddr = c->ip6.addr_seen; > > - SET_TCP_HEADER_COMMON_V4_V6(b, conn, seq); > + memset(ip6h->flow_lbl, 0, 3); > > - b->th.check = tcp_update_check_tcp6(&b->ip6h); > + tcp_set_tcp_header(th, conn, seq); > > - b->ip6h.hop_limit = 255; > - b->ip6h.version = 6; > - b->ip6h.nexthdr = IPPROTO_TCP; > + th->check = tcp_update_check_tcp6(ip6h); > > - b->ip6h.flow_lbl[0] = (conn->sock >> 16) & 0xf; > - b->ip6h.flow_lbl[1] = (conn->sock >> 8) & 0xff; > - b->ip6h.flow_lbl[2] = (conn->sock >> 0) & 0xff; > + ip6h->hop_limit = 255; > + ip6h->version = 6; > + ip6h->nexthdr = IPPROTO_TCP; > > - tlen = tap_iov_len(c, &b->taph, ip_len); > - } > -#undef SET_TCP_HEADER_COMMON_V4_V6 > + ip6h->flow_lbl[0] = (conn->sock >> 16) & 0xf; > + ip6h->flow_lbl[1] = (conn->sock >> 8) & 0xff; > + ip6h->flow_lbl[2] = (conn->sock >> 0) & 0xff; > > - return tlen; > + return ip_len; > } > > /** > @@ -1520,27 +1531,21 @@ static void tcp_update_seqack_from_tap(const struct ctx *c, > } > > /** > - * tcp_send_flag() - Send segment with flags to tap (no payload) > + * do_tcp_send_flag() - Send segment with flags to tap (no payload) As a rule, I don't love "do_X" as a function name. I particularly dislike it here because AFAICT this function isn't actually the one that "does" the sending of the flag - that happens with the tcp_l2_flags_buf_flush() in the caller. > * @c: Execution context > * @conn: Connection pointer > * @flags: TCP flags: if not set, send segment only if ACK is due > * > * Return: negative error code on connection reset, 0 otherwise > */ > -static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) > + > +static int do_tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags, struct tcphdr *th, char *data, size_t optlen) > { > uint32_t prev_ack_to_tap = conn->seq_ack_to_tap; > uint32_t prev_wnd_to_tap = conn->wnd_to_tap; > - struct tcp4_l2_flags_buf_t *b4 = NULL; > - struct tcp6_l2_flags_buf_t *b6 = NULL; > struct tcp_info tinfo = { 0 }; > socklen_t sl = sizeof(tinfo); > int s = conn->sock; > - size_t optlen = 0; > - struct iovec *iov; > - struct tcphdr *th; > - char *data; > - void *p; > > if (SEQ_GE(conn->seq_ack_to_tap, conn->seq_from_tap) && > !flags && conn->wnd_to_tap) > @@ -1562,26 +1567,9 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) > if (!tcp_update_seqack_wnd(c, conn, flags, &tinfo) && !flags) > return 0; > > - if (CONN_V4(conn)) { > - iov = tcp4_l2_flags_iov + tcp4_l2_flags_buf_used; > - p = b4 = tcp4_l2_flags_buf + tcp4_l2_flags_buf_used++; > - th = &b4->th; > - > - /* gcc 11.2 would complain on data = (char *)(th + 1); */ > - data = b4->opts; > - } else { > - iov = tcp6_l2_flags_iov + tcp6_l2_flags_buf_used; > - p = b6 = tcp6_l2_flags_buf + tcp6_l2_flags_buf_used++; > - th = &b6->th; > - data = b6->opts; > - } > - > if (flags & SYN) { > int mss; > > - /* Options: MSS, NOP and window scale (8 bytes) */ > - optlen = OPT_MSS_LEN + 1 + OPT_WS_LEN; > - > *data++ = OPT_MSS; > *data++ = OPT_MSS_LEN; > > @@ -1624,9 +1612,6 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) > th->syn = !!(flags & SYN); > th->fin = !!(flags & FIN); > > - iov->iov_len = tcp_l2_buf_fill_headers(c, conn, p, optlen, > - NULL, conn->seq_to_tap); > - > if (th->ack) { > if (SEQ_GE(conn->seq_ack_to_tap, conn->seq_from_tap)) > conn_flag(c, conn, ~ACK_TO_TAP_DUE); > @@ -1641,8 +1626,38 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) > if (th->fin || th->syn) > conn->seq_to_tap++; > > + return 1; > +} > + > +static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) > +{ > + size_t optlen = 0; > + struct iovec *iov; > + size_t ip_len; > + int ret; > + > + /* Options: MSS, NOP and window scale (8 bytes) */ > + if (flags & SYN) > + optlen = OPT_MSS_LEN + 1 + OPT_WS_LEN; > + > if (CONN_V4(conn)) { > + struct tcp4_l2_flags_buf_t *b4; > + > + iov = tcp4_l2_flags_iov + tcp4_l2_flags_buf_used; > + b4 = tcp4_l2_flags_buf + tcp4_l2_flags_buf_used++; > + > + ret = do_tcp_send_flag(c, conn, flags, &b4->th, b4->opts, > + optlen); > + if (ret <= 0) > + return ret; > + > + ip_len = ipv4_fill_headers(c, conn, &b4->iph, optlen, > + NULL, conn->seq_to_tap); > + > + iov->iov_len = tap_iov_len(c, &b4->taph, ip_len); > + > if (flags & DUP_ACK) { > + > memcpy(b4 + 1, b4, sizeof(*b4)); > (iov + 1)->iov_len = iov->iov_len; > tcp4_l2_flags_buf_used++; > @@ -1651,6 +1666,21 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) > if (tcp4_l2_flags_buf_used > ARRAY_SIZE(tcp4_l2_flags_buf) - 2) > tcp_l2_flags_buf_flush(c); > } else { > + struct tcp6_l2_flags_buf_t *b6; > + > + iov = tcp6_l2_flags_iov + tcp6_l2_flags_buf_used; > + b6 = tcp6_l2_flags_buf + tcp6_l2_flags_buf_used++; > + > + ret = do_tcp_send_flag(c, conn, flags, &b6->th, b6->opts, > + optlen); > + if (ret <= 0) > + return ret; > + > + ip_len = ipv6_fill_headers(c, conn, &b6->ip6h, optlen, > + conn->seq_to_tap); > + > + iov->iov_len = tap_iov_len(c, &b6->taph, ip_len); > + > if (flags & DUP_ACK) { > memcpy(b6 + 1, b6, sizeof(*b6)); > (iov + 1)->iov_len = iov->iov_len; > @@ -2050,6 +2080,7 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn, > { > uint32_t *seq_update = &conn->seq_to_tap; > struct iovec *iov; > + size_t ip_len; > > if (CONN_V4(conn)) { > struct tcp4_l2_buf_t *b = &tcp4_l2_buf[tcp4_l2_buf_used]; > @@ -2058,9 +2089,11 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn, > tcp4_l2_buf_seq_update[tcp4_l2_buf_used].seq = seq_update; > tcp4_l2_buf_seq_update[tcp4_l2_buf_used].len = plen; > > + ip_len = ipv4_fill_headers(c, conn, &b->iph, plen, > + check, seq); > + > iov = tcp4_l2_iov + tcp4_l2_buf_used++; > - iov->iov_len = tcp_l2_buf_fill_headers(c, conn, b, plen, > - check, seq); > + iov->iov_len = tap_iov_len(c, &b->taph, ip_len); > if (tcp4_l2_buf_used > ARRAY_SIZE(tcp4_l2_buf) - 1) > tcp_l2_data_buf_flush(c); > } else if (CONN_V6(conn)) { > @@ -2069,9 +2102,10 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn, > tcp6_l2_buf_seq_update[tcp6_l2_buf_used].seq = seq_update; > tcp6_l2_buf_seq_update[tcp6_l2_buf_used].len = plen; > > + ip_len = ipv6_fill_headers(c, conn, &b->ip6h, plen, seq); > + > iov = tcp6_l2_iov + tcp6_l2_buf_used++; > - iov->iov_len = tcp_l2_buf_fill_headers(c, conn, b, plen, > - NULL, seq); > + iov->iov_len = tap_iov_len(c, &b->taph, ip_len); > if (tcp6_l2_buf_used > ARRAY_SIZE(tcp6_l2_buf) - 1) > tcp_l2_data_buf_flush(c); > } -- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson