// SPDX-License-Identifier: GPL-2.0-or-later /* PASST - Plug A Simple Socket Transport * for qemu/UNIX domain socket mode * * PASTA - Pack A Subtle Tap Abstraction * for network namespace/tap device mode * * tcp_buf.c - TCP L2-L4 translation state machine * * Copyright (c) 2020-2022 Red Hat GmbH * Author: Stefano Brivio */ #include #include #include #include #include #include #include #include "util.h" #include "ip.h" #include "passt.h" #include "tap.h" #include "siphash.h" #include "inany.h" #include "tcp_conn.h" #include "tcp_internal.h" #include "tcp_buf.h" #define TCP_FRAMES_MEM 128 #define TCP_FRAMES \ (c->mode == MODE_PASTA ? 1 : TCP_FRAMES_MEM) struct tcp4_l2_head { /* For MSS4 macro: keep in sync with tcp4_l2_buf_t */ #ifdef __AVX2__ uint8_t pad[26]; #else uint8_t pad[2]; #endif struct tap_hdr taph; struct iphdr iph; struct tcphdr th; #ifdef __AVX2__ } __attribute__ ((packed, aligned(32))); #else } __attribute__ ((packed, aligned(__alignof__(unsigned int)))); #endif struct tcp6_l2_head { /* For MSS6 macro: keep in sync with tcp6_l2_buf_t */ #ifdef __AVX2__ uint8_t pad[14]; #else uint8_t pad[2]; #endif struct tap_hdr taph; struct ipv6hdr ip6h; struct tcphdr th; #ifdef __AVX2__ } __attribute__ ((packed, aligned(32))); #else } __attribute__ ((packed, aligned(__alignof__(unsigned int)))); #endif #define MSS4 ROUND_DOWN(USHRT_MAX - sizeof(struct tcp4_l2_head), 4) #define MSS6 ROUND_DOWN(USHRT_MAX - sizeof(struct tcp6_l2_head), 4) /** * tcp_buf_seq_update - Sequences to update with length of frames once sent * @seq: Pointer to sequence number sent to tap-side, to be updated * @len: TCP payload length */ struct tcp_buf_seq_update { uint32_t *seq; uint16_t len; }; /* Static buffers */ /** * tcp4_l2_buf_t - Pre-cooked IPv4 packet buffers for tap connections * @pad: Align TCP header to 32 bytes, for AVX2 checksum calculation only * @taph: Tap-level headers (partially pre-filled) * @iph: Pre-filled IP header (except for tot_len and saddr) * @uh: Headroom for TCP 
header
 * @data:	Storage for TCP payload
 */
/* Trailing comments on members give byte offsets: the left column applies
 * with __AVX2__ defined, the right column without it.
 */
static struct tcp4_l2_buf_t {
#ifdef __AVX2__
	uint8_t pad[26];	/* 0, align th to 32 bytes */
#else
	uint8_t pad[2];		/*	align iph to 4 bytes	0 */
#endif
	struct tap_hdr taph;	/* 26				2 */
	struct iphdr iph;	/* 44				20 */
	struct tcphdr th;	/* 64				40 */
	uint8_t data[MSS4];	/* 84				60 */
				/* 65536			65532 */
#ifdef __AVX2__
} __attribute__ ((packed, aligned(32)))
#else
} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
#endif
tcp4_l2_buf[TCP_FRAMES_MEM];

/* Sequence update recorded for each queued IPv4 data frame, indexed like
 * tcp4_l2_buf, and applied by tcp_buf_l2_data_flush() for frames that were
 * sent
 */
static struct tcp_buf_seq_update tcp4_l2_buf_seq_update[TCP_FRAMES_MEM];

/* Number of IPv4 data frames currently queued in tcp4_l2_buf */
static unsigned int tcp4_l2_buf_used;

/**
 * tcp6_l2_buf_t - Pre-cooked IPv6 packet buffers for tap connections
 * @pad:	Align IPv6 header for checksum calculation to 32B (AVX2) or 4B
 * @taph:	Tap-level headers (partially pre-filled)
 * @ip6h:	Pre-filled IP header (except for payload_len and addresses)
 * @th:		Headroom for TCP header
 * @data:	Storage for TCP payload
 *
 * NOTE(review): unlike the other buffer arrays in this file, this one is not
 * declared static, so tcp6_l2_buf has external linkage -- confirm whether
 * that is intended.
 */
struct tcp6_l2_buf_t {
#ifdef __AVX2__
	uint8_t pad[14];	/* 0	align ip6h to 32 bytes */
#else
	uint8_t pad[2];		/*	align ip6h to 4 bytes	0 */
#endif
	struct tap_hdr taph;	/* 14				2 */
	struct ipv6hdr ip6h;	/* 32				20 */
	struct tcphdr th;	/* 72				60 */
	uint8_t data[MSS6];	/* 92				80 */
				/* 65536			65532 */
#ifdef __AVX2__
} __attribute__ ((packed, aligned(32)))
#else
} __attribute__ ((packed, aligned(__alignof__(unsigned int))))
#endif
tcp6_l2_buf[TCP_FRAMES_MEM];

/* Sequence update recorded for each queued IPv6 data frame, indexed like
 * tcp6_l2_buf, and applied by tcp_buf_l2_data_flush() for frames that were
 * sent
 */
static struct tcp_buf_seq_update tcp6_l2_buf_seq_update[TCP_FRAMES_MEM];

/* Number of IPv6 data frames currently queued in tcp6_l2_buf */
static unsigned int tcp6_l2_buf_used;

/* recvmsg()/sendmsg() data for tap */

/* Socket-side vector: one more entry than there are frames, as the first entry
 * is pointed at tcp_buf_discard by tcp_buf_data_from_sock()
 */
static struct iovec	iov_sock		[TCP_FRAMES_MEM + 1];

/* One vector entry per frame buffer, for data frames and for flags frames */
static struct iovec	tcp4_l2_iov		[TCP_FRAMES_MEM];
static struct iovec	tcp6_l2_iov		[TCP_FRAMES_MEM];
static struct iovec	tcp4_l2_flags_iov	[TCP_FRAMES_MEM];
static struct iovec	tcp6_l2_flags_iov	[TCP_FRAMES_MEM];

/**
 * tcp4_l2_flags_buf_t - IPv4 packet buffers for segments without data (flags)
 * @pad:	Align TCP header to 32 bytes, for AVX2 checksum calculation only
 * @taph:	Tap-level headers (partially
pre-filled) * @iph: Pre-filled IP header (except for tot_len and saddr) * @th: Headroom for TCP header * @opts: Headroom for TCP options */ static struct tcp4_l2_flags_buf_t { #ifdef __AVX2__ uint8_t pad[26]; /* 0, align th to 32 bytes */ #else uint8_t pad[2]; /* align iph to 4 bytes 0 */ #endif struct tap_hdr taph; /* 26 2 */ struct iphdr iph; /* 44 20 */ struct tcphdr th; /* 64 40 */ char opts[OPT_MSS_LEN + OPT_WS_LEN + 1]; #ifdef __AVX2__ } __attribute__ ((packed, aligned(32))) #else } __attribute__ ((packed, aligned(__alignof__(unsigned int)))) #endif tcp4_l2_flags_buf[TCP_FRAMES_MEM]; static unsigned int tcp4_l2_flags_buf_used; /** * tcp6_l2_flags_buf_t - IPv6 packet buffers for segments without data (flags) * @pad: Align IPv6 header for checksum calculation to 32B (AVX2) or 4B * @taph: Tap-level headers (partially pre-filled) * @ip6h: Pre-filled IP header (except for payload_len and addresses) * @th: Headroom for TCP header * @opts: Headroom for TCP options */ static struct tcp6_l2_flags_buf_t { #ifdef __AVX2__ uint8_t pad[14]; /* 0 align ip6h to 32 bytes */ #else uint8_t pad[2]; /* align ip6h to 4 bytes 0 */ #endif struct tap_hdr taph; /* 14 2 */ struct ipv6hdr ip6h; /* 32 20 */ struct tcphdr th /* 72 */ __attribute__ ((aligned(4))); /* 60 */ char opts[OPT_MSS_LEN + OPT_WS_LEN + 1]; #ifdef __AVX2__ } __attribute__ ((packed, aligned(32))) #else } __attribute__ ((packed, aligned(__alignof__(unsigned int)))) #endif tcp6_l2_flags_buf[TCP_FRAMES_MEM]; static unsigned int tcp6_l2_flags_buf_used; /** * tcp_buf_update_l2() - Update L2 buffers with Ethernet and IPv4 addresses * @eth_d: Ethernet destination address, NULL if unchanged * @eth_s: Ethernet source address, NULL if unchanged */ void tcp_buf_update_l2(const unsigned char *eth_d, const unsigned char *eth_s) { int i; for (i = 0; i < TCP_FRAMES_MEM; i++) { struct tcp4_l2_flags_buf_t *b4f = &tcp4_l2_flags_buf[i]; struct tcp6_l2_flags_buf_t *b6f = &tcp6_l2_flags_buf[i]; struct tcp4_l2_buf_t *b4 = &tcp4_l2_buf[i]; 
struct tcp6_l2_buf_t *b6 = &tcp6_l2_buf[i]; eth_update_mac(&b4->taph.eh, eth_d, eth_s); eth_update_mac(&b6->taph.eh, eth_d, eth_s); eth_update_mac(&b4f->taph.eh, eth_d, eth_s); eth_update_mac(&b6f->taph.eh, eth_d, eth_s); } } /** * tcp_buf_sock4_iov_init() - Initialise scatter-gather L2 buffers for IPv4 sockets * @c: Execution context */ void tcp_buf_sock4_iov_init(const struct ctx *c) { struct iphdr iph = L2_BUF_IP4_INIT(IPPROTO_TCP); struct iovec *iov; int i; for (i = 0; i < ARRAY_SIZE(tcp4_l2_buf); i++) { tcp4_l2_buf[i] = (struct tcp4_l2_buf_t) { .taph = TAP_HDR_INIT(ETH_P_IP), .iph = iph, .th = { .doff = sizeof(struct tcphdr) / 4, .ack = 1 } }; } for (i = 0; i < ARRAY_SIZE(tcp4_l2_flags_buf); i++) { tcp4_l2_flags_buf[i] = (struct tcp4_l2_flags_buf_t) { .taph = TAP_HDR_INIT(ETH_P_IP), .iph = L2_BUF_IP4_INIT(IPPROTO_TCP) }; } for (i = 0, iov = tcp4_l2_iov; i < TCP_FRAMES_MEM; i++, iov++) iov->iov_base = tap_iov_base(c, &tcp4_l2_buf[i].taph); for (i = 0, iov = tcp4_l2_flags_iov; i < TCP_FRAMES_MEM; i++, iov++) iov->iov_base = tap_iov_base(c, &tcp4_l2_flags_buf[i].taph); } /** * tcp_buf_sock6_iov_init() - Initialise scatter-gather L2 buffers for IPv6 sockets * @c: Execution context */ void tcp_buf_sock6_iov_init(const struct ctx *c) { struct iovec *iov; int i; for (i = 0; i < ARRAY_SIZE(tcp6_l2_buf); i++) { tcp6_l2_buf[i] = (struct tcp6_l2_buf_t) { .taph = TAP_HDR_INIT(ETH_P_IPV6), .ip6h = L2_BUF_IP6_INIT(IPPROTO_TCP), .th = { .doff = sizeof(struct tcphdr) / 4, .ack = 1 } }; } for (i = 0; i < ARRAY_SIZE(tcp6_l2_flags_buf); i++) { tcp6_l2_flags_buf[i] = (struct tcp6_l2_flags_buf_t) { .taph = TAP_HDR_INIT(ETH_P_IPV6), .ip6h = L2_BUF_IP6_INIT(IPPROTO_TCP) }; } for (i = 0, iov = tcp6_l2_iov; i < TCP_FRAMES_MEM; i++, iov++) iov->iov_base = tap_iov_base(c, &tcp6_l2_buf[i].taph); for (i = 0, iov = tcp6_l2_flags_iov; i < TCP_FRAMES_MEM; i++, iov++) iov->iov_base = tap_iov_base(c, &tcp6_l2_flags_buf[i].taph); } /** * tcp_buf_l2_flags_flush() - Send out buffers for segments 
with no data (flags) * @c: Execution context */ void tcp_buf_l2_flags_flush(const struct ctx *c) { tap_send_frames(c, tcp6_l2_flags_iov, tcp6_l2_flags_buf_used); tcp6_l2_flags_buf_used = 0; tap_send_frames(c, tcp4_l2_flags_iov, tcp4_l2_flags_buf_used); tcp4_l2_flags_buf_used = 0; } /** * tcp_buf_l2_data_flush() - Send out buffers for segments with data * @c: Execution context */ void tcp_buf_l2_data_flush(const struct ctx *c) { unsigned i; size_t m; m = tap_send_frames(c, tcp6_l2_iov, tcp6_l2_buf_used); for (i = 0; i < m; i++) *tcp6_l2_buf_seq_update[i].seq += tcp6_l2_buf_seq_update[i].len; tcp6_l2_buf_used = 0; m = tap_send_frames(c, tcp4_l2_iov, tcp4_l2_buf_used); for (i = 0; i < m; i++) *tcp4_l2_buf_seq_update[i].seq += tcp4_l2_buf_seq_update[i].len; tcp4_l2_buf_used = 0; } int tcp_buf_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) { size_t optlen = 0; struct iovec *iov; size_t ip_len; int ret; /* Options: MSS, NOP and window scale (8 bytes) */ if (flags & SYN) optlen = OPT_MSS_LEN + 1 + OPT_WS_LEN; if (CONN_V4(conn)) { struct tcp4_l2_flags_buf_t *b4; iov = tcp4_l2_flags_iov + tcp4_l2_flags_buf_used; b4 = tcp4_l2_flags_buf + tcp4_l2_flags_buf_used++; ret = do_tcp_send_flag(c, conn, flags, &b4->th, b4->opts, optlen); if (ret <= 0) return ret; ip_len = ipv4_fill_headers(c, conn, &b4->iph, optlen, NULL, conn->seq_to_tap); iov->iov_len = tap_iov_len(c, &b4->taph, ip_len); if (flags & DUP_ACK) { memcpy(b4 + 1, b4, sizeof(*b4)); (iov + 1)->iov_len = iov->iov_len; tcp4_l2_flags_buf_used++; } if (tcp4_l2_flags_buf_used > ARRAY_SIZE(tcp4_l2_flags_buf) - 2) tcp_buf_l2_flags_flush(c); } else { struct tcp6_l2_flags_buf_t *b6; iov = tcp6_l2_flags_iov + tcp6_l2_flags_buf_used; b6 = tcp6_l2_flags_buf + tcp6_l2_flags_buf_used++; ret = do_tcp_send_flag(c, conn, flags, &b6->th, b6->opts, optlen); if (ret <= 0) return ret; ip_len = ipv6_fill_headers(c, conn, &b6->ip6h, optlen, conn->seq_to_tap); iov->iov_len = tap_iov_len(c, &b6->taph, ip_len); if (flags & DUP_ACK) 
{ memcpy(b6 + 1, b6, sizeof(*b6)); (iov + 1)->iov_len = iov->iov_len; tcp6_l2_flags_buf_used++; } if (tcp6_l2_flags_buf_used > ARRAY_SIZE(tcp6_l2_flags_buf) - 2) tcp_buf_l2_flags_flush(c); } return 0; } uint16_t tcp_buf_conn_tap_mss(const struct tcp_tap_conn *conn) { if (CONN_V4(conn)) return MSS4; return MSS6; } /** * tcp_data_to_tap() - Finalise (queue) highest-numbered scatter-gather buffer * @c: Execution context * @conn: Connection pointer * @plen: Payload length at L4 * @no_csum: Don't compute IPv4 checksum, use the one from previous buffer * @seq: Sequence number to be sent */ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn, ssize_t plen, int no_csum, uint32_t seq) { uint32_t *seq_update = &conn->seq_to_tap; struct iovec *iov; size_t ip_len; if (CONN_V4(conn)) { struct tcp4_l2_buf_t *b = &tcp4_l2_buf[tcp4_l2_buf_used]; const uint16_t *check = no_csum ? &(b - 1)->iph.check : NULL; tcp4_l2_buf_seq_update[tcp4_l2_buf_used].seq = seq_update; tcp4_l2_buf_seq_update[tcp4_l2_buf_used].len = plen; ip_len = ipv4_fill_headers(c, conn, &b->iph, plen, check, seq); iov = tcp4_l2_iov + tcp4_l2_buf_used++; iov->iov_len = tap_iov_len(c, &b->taph, ip_len); if (tcp4_l2_buf_used > ARRAY_SIZE(tcp4_l2_buf) - 1) tcp_buf_l2_data_flush(c); } else if (CONN_V6(conn)) { struct tcp6_l2_buf_t *b = &tcp6_l2_buf[tcp6_l2_buf_used]; tcp6_l2_buf_seq_update[tcp6_l2_buf_used].seq = seq_update; tcp6_l2_buf_seq_update[tcp6_l2_buf_used].len = plen; ip_len = ipv6_fill_headers(c, conn, &b->ip6h, plen, seq); iov = tcp6_l2_iov + tcp6_l2_buf_used++; iov->iov_len = tap_iov_len(c, &b->taph, ip_len); if (tcp6_l2_buf_used > ARRAY_SIZE(tcp6_l2_buf) - 1) tcp_buf_l2_data_flush(c); } } /** * tcp_buf_data_from_sock() - Handle new data from socket, queue to tap, in window * @c: Execution context * @conn: Connection pointer * * Return: negative on connection reset, 0 otherwise * * #syscalls recvmsg */ int tcp_buf_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) { uint32_t 
wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap; int fill_bufs, send_bufs = 0, last_len, iov_rem = 0; int sendlen, len, plen, v4 = CONN_V4(conn); int s = conn->sock, i, ret = 0; struct msghdr mh_sock = { 0 }; uint16_t mss = MSS_GET(conn); uint32_t already_sent, seq; struct iovec *iov; already_sent = conn->seq_to_tap - conn->seq_ack_from_tap; if (SEQ_LT(already_sent, 0)) { /* RFC 761, section 2.1. */ flow_trace(conn, "ACK sequence gap: ACK for %u, sent: %u", conn->seq_ack_from_tap, conn->seq_to_tap); conn->seq_to_tap = conn->seq_ack_from_tap; already_sent = 0; } if (!wnd_scaled || already_sent >= wnd_scaled) { conn_flag(c, conn, STALLED); conn_flag(c, conn, ACK_FROM_TAP_DUE); return 0; } /* Set up buffer descriptors we'll fill completely and partially. */ fill_bufs = DIV_ROUND_UP(wnd_scaled - already_sent, mss); if (fill_bufs > TCP_FRAMES) { fill_bufs = TCP_FRAMES; iov_rem = 0; } else { iov_rem = (wnd_scaled - already_sent) % mss; } mh_sock.msg_iov = iov_sock; mh_sock.msg_iovlen = fill_bufs + 1; iov_sock[0].iov_base = tcp_buf_discard; iov_sock[0].iov_len = already_sent; if (( v4 && tcp4_l2_buf_used + fill_bufs > ARRAY_SIZE(tcp4_l2_buf)) || (!v4 && tcp6_l2_buf_used + fill_bufs > ARRAY_SIZE(tcp6_l2_buf))) { tcp_buf_l2_data_flush(c); /* Silence Coverity CWE-125 false positive */ tcp4_l2_buf_used = tcp6_l2_buf_used = 0; } for (i = 0, iov = iov_sock + 1; i < fill_bufs; i++, iov++) { if (v4) iov->iov_base = &tcp4_l2_buf[tcp4_l2_buf_used + i].data; else iov->iov_base = &tcp6_l2_buf[tcp6_l2_buf_used + i].data; iov->iov_len = mss; } if (iov_rem) iov_sock[fill_bufs].iov_len = iov_rem; /* Receive into buffers, don't dequeue until acknowledged by guest. 
*/ do len = recvmsg(s, &mh_sock, MSG_PEEK); while (len < 0 && errno == EINTR); if (len < 0) goto err; if (!len) { if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) == SOCK_FIN_RCVD) { if ((ret = tcp_buf_send_flag(c, conn, FIN | ACK))) { tcp_rst(c, conn); return ret; } conn_event(c, conn, TAP_FIN_SENT); } return 0; } sendlen = len - already_sent; if (sendlen <= 0) { conn_flag(c, conn, STALLED); return 0; } conn_flag(c, conn, ~STALLED); send_bufs = DIV_ROUND_UP(sendlen, mss); last_len = sendlen - (send_bufs - 1) * mss; /* Likely, some new data was acked too. */ tcp_update_seqack_wnd(c, conn, 0, NULL); /* Finally, queue to tap */ plen = mss; seq = conn->seq_to_tap; for (i = 0; i < send_bufs; i++) { int no_csum = i && i != send_bufs - 1 && tcp4_l2_buf_used; if (i == send_bufs - 1) plen = last_len; tcp_data_to_tap(c, conn, plen, no_csum, seq); seq += plen; } conn_flag(c, conn, ACK_FROM_TAP_DUE); return 0; err: if (errno != EAGAIN && errno != EWOULDBLOCK) { ret = -errno; tcp_rst(c, conn); } return ret; }