From: Stefano Brivio <sbrivio@redhat.com>
To: Laurent Vivier <lvivier@redhat.com>
Cc: passt-dev@passt.top
Subject: Re: [PATCH v6 2/8] tcp: move buffers management functions to their own file
Date: Wed, 12 Jun 2024 17:54:05 +0200 [thread overview]
Message-ID: <20240612175405.1711bc90@elisabeth> (raw)
In-Reply-To: <20240612154734.1044883-3-lvivier@redhat.com>
On Wed, 12 Jun 2024 17:47:28 +0200
Laurent Vivier <lvivier@redhat.com> wrote:
> Move all the TCP parts using internal buffers to tcp_buf.c
> and keep generic TCP management functions in tcp.c.
> Add tcp_internal.h to export needed functions from tcp.c and
> tcp_buf.h from tcp_buf.c
>
> With this change we can use the existing TCP functions with a
> different kind of memory storage, such as the shared
> memory provided by the guest via vhost-user.
>
> Signed-off-by: Laurent Vivier <lvivier@redhat.com>
> ---
>
> Notes:
> v5:
> - as we now export tcp_l2_buf_fill_headers(), also move
> enum tcp_iov_parts to tcp_internal.h
>
> v4:
> - rename tcp_send_flag() and tcp_data_from_sock() to
> tcp_buf_send_flag() and tcp_buf_data_from_sock()
>
> Makefile | 5 +-
> tcp.c | 562 ++-----------------------------------------------
> tcp_buf.c | 513 ++++++++++++++++++++++++++++++++++++++++++++
> tcp_buf.h | 16 ++
> tcp_internal.h | 96 +++++++++
> 5 files changed, 648 insertions(+), 544 deletions(-)
> create mode 100644 tcp_buf.c
> create mode 100644 tcp_buf.h
> create mode 100644 tcp_internal.h
>
> diff --git a/Makefile b/Makefile
> index e2180b599bdb..09fc461d087e 100644
> --- a/Makefile
> +++ b/Makefile
> @@ -47,7 +47,7 @@ FLAGS += -DDUAL_STACK_SOCKETS=$(DUAL_STACK_SOCKETS)
> PASST_SRCS = arch.c arp.c checksum.c conf.c dhcp.c dhcpv6.c flow.c fwd.c \
> icmp.c igmp.c inany.c iov.c ip.c isolation.c lineread.c log.c mld.c \
> ndp.c netlink.c packet.c passt.c pasta.c pcap.c pif.c tap.c tcp.c \
> - tcp_splice.c udp.c util.c
> + tcp_buf.c tcp_splice.c udp.c util.c
> QRAP_SRCS = qrap.c
> SRCS = $(PASST_SRCS) $(QRAP_SRCS)
>
> @@ -56,7 +56,8 @@ MANPAGES = passt.1 pasta.1 qrap.1
> PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h flow.h fwd.h \
> flow_table.h icmp.h icmp_flow.h inany.h iov.h ip.h isolation.h \
> lineread.h log.h ndp.h netlink.h packet.h passt.h pasta.h pcap.h pif.h \
> - siphash.h tap.h tcp.h tcp_conn.h tcp_splice.h udp.h util.h
> + siphash.h tap.h tcp.h tcp_buf.h tcp_conn.h tcp_internal.h tcp_splice.h \
> + udp.h util.h
> HEADERS = $(PASST_HEADERS) seccomp.h
>
> C := \#include <linux/tcp.h>\nstruct tcp_info x = { .tcpi_snd_wnd = 0 };
> diff --git a/tcp.c b/tcp.c
> index 6800209d4122..875e318c925b 100644
> --- a/tcp.c
> +++ b/tcp.c
> @@ -302,28 +302,14 @@
> #include "flow.h"
>
> #include "flow_table.h"
> -
> -#define TCP_FRAMES_MEM 128
> -#define TCP_FRAMES \
> - (c->mode == MODE_PASST ? TCP_FRAMES_MEM : 1)
> +#include "tcp_internal.h"
> +#include "tcp_buf.h"
>
> #define TCP_HASH_TABLE_LOAD 70 /* % */
> #define TCP_HASH_TABLE_SIZE (FLOW_MAX * 100 / TCP_HASH_TABLE_LOAD)
>
> -#define MAX_WS 8
> -#define MAX_WINDOW (1 << (16 + (MAX_WS)))
> -
> /* MSS rounding: see SET_MSS() */
> #define MSS_DEFAULT 536
> -#define MSS4 ROUND_DOWN(IP_MAX_MTU - \
> - sizeof(struct tcphdr) - \
> - sizeof(struct iphdr), \
> - sizeof(uint32_t))
> -#define MSS6 ROUND_DOWN(IP_MAX_MTU - \
> - sizeof(struct tcphdr) - \
> - sizeof(struct ipv6hdr), \
> - sizeof(uint32_t))
> -
> #define WINDOW_DEFAULT 14600 /* RFC 6928 */
> #ifdef HAS_SND_WND
> # define KERNEL_REPORTS_SND_WND(c) ((c)->tcp.kernel_snd_wnd)
> @@ -345,33 +331,10 @@
> */
> #define SOL_TCP IPPROTO_TCP
>
> -#define SEQ_LE(a, b) ((b) - (a) < MAX_WINDOW)
> -#define SEQ_LT(a, b) ((b) - (a) - 1 < MAX_WINDOW)
> -#define SEQ_GE(a, b) ((a) - (b) < MAX_WINDOW)
> -#define SEQ_GT(a, b) ((a) - (b) - 1 < MAX_WINDOW)
> -
> -#define FIN (1 << 0)
> -#define SYN (1 << 1)
> -#define RST (1 << 2)
> -#define ACK (1 << 4)
> -/* Flags for internal usage */
> -#define DUP_ACK (1 << 5)
> #define ACK_IF_NEEDED 0 /* See tcp_send_flag() */
>
> -#define OPT_EOL 0
> -#define OPT_NOP 1
> -#define OPT_MSS 2
> -#define OPT_MSS_LEN 4
> -#define OPT_WS 3
> -#define OPT_WS_LEN 3
> -#define OPT_SACKP 4
> -#define OPT_SACK 5
> -#define OPT_TS 8
> -
> #define TAPSIDE(conn_) ((conn_)->f.pif[1] == PIF_TAP)
>
> -#define CONN_V4(conn) (!!inany_v4(&(conn)->faddr))
> -#define CONN_V6(conn) (!CONN_V4(conn))
> #define CONN_IS_CLOSING(conn) \
> (((conn)->events & ESTABLISHED) && \
> ((conn)->events & (SOCK_FIN_RCVD | TAP_FIN_RCVD)))
> @@ -408,106 +371,7 @@ static int tcp_sock_ns [NUM_PORTS][IP_VERSIONS];
> */
> static union inany_addr low_rtt_dst[LOW_RTT_TABLE_SIZE];
>
> -/* Static buffers */
> -/**
> - * struct tcp_payload_t - TCP header and data to send segments with payload
> - * @th: TCP header
> - * @data: TCP data
> - */
> -struct tcp_payload_t {
> - struct tcphdr th;
> - uint8_t data[IP_MAX_MTU - sizeof(struct tcphdr)];
> -#ifdef __AVX2__
> -} __attribute__ ((packed, aligned(32))); /* For AVX2 checksum routines */
> -#else
> -} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
> -#endif
> -
> -/**
> - * struct tcp_flags_t - TCP header and data to send zero-length
> - * segments (flags)
> - * @th: TCP header
> - * @opts TCP options
> - */
> -struct tcp_flags_t {
> - struct tcphdr th;
> - char opts[OPT_MSS_LEN + OPT_WS_LEN + 1];
> -#ifdef __AVX2__
> -} __attribute__ ((packed, aligned(32)));
> -#else
> -} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
> -#endif
> -
> -/* Ethernet header for IPv4 frames */
> -static struct ethhdr tcp4_eth_src;
> -
> -static struct tap_hdr tcp4_payload_tap_hdr[TCP_FRAMES_MEM];
> -/* IPv4 headers */
> -static struct iphdr tcp4_payload_ip[TCP_FRAMES_MEM];
> -/* TCP segments with payload for IPv4 frames */
> -static struct tcp_payload_t tcp4_payload[TCP_FRAMES_MEM];
> -
> -static_assert(MSS4 <= sizeof(tcp4_payload[0].data), "MSS4 is greater than 65516");
> -
> -/* References tracking the owner connection of frames in the tap outqueue */
> -static struct tcp_tap_conn *tcp4_frame_conns[TCP_FRAMES_MEM];
> -static unsigned int tcp4_payload_used;
> -
> -static struct tap_hdr tcp4_flags_tap_hdr[TCP_FRAMES_MEM];
> -/* IPv4 headers for TCP segment without payload */
> -static struct iphdr tcp4_flags_ip[TCP_FRAMES_MEM];
> -/* TCP segments without payload for IPv4 frames */
> -static struct tcp_flags_t tcp4_flags[TCP_FRAMES_MEM];
> -
> -static unsigned int tcp4_flags_used;
> -
> -/* Ethernet header for IPv6 frames */
> -static struct ethhdr tcp6_eth_src;
> -
> -static struct tap_hdr tcp6_payload_tap_hdr[TCP_FRAMES_MEM];
> -/* IPv6 headers */
> -static struct ipv6hdr tcp6_payload_ip[TCP_FRAMES_MEM];
> -/* TCP headers and data for IPv6 frames */
> -static struct tcp_payload_t tcp6_payload[TCP_FRAMES_MEM];
> -
> -static_assert(MSS6 <= sizeof(tcp6_payload[0].data), "MSS6 is greater than 65516");
> -
> -/* References tracking the owner connection of frames in the tap outqueue */
> -static struct tcp_tap_conn *tcp6_frame_conns[TCP_FRAMES_MEM];
> -static unsigned int tcp6_payload_used;
> -
> -static struct tap_hdr tcp6_flags_tap_hdr[TCP_FRAMES_MEM];
> -/* IPv6 headers for TCP segment without payload */
> -static struct ipv6hdr tcp6_flags_ip[TCP_FRAMES_MEM];
> -/* TCP segment without payload for IPv6 frames */
> -static struct tcp_flags_t tcp6_flags[TCP_FRAMES_MEM];
> -
> -static unsigned int tcp6_flags_used;
> -
> -/* recvmsg()/sendmsg() data for tap */
> -static char tcp_buf_discard [MAX_WINDOW];
> -static struct iovec iov_sock [TCP_FRAMES_MEM + 1];
> -
> -/*
> - * enum tcp_iov_parts - I/O vector parts for one TCP frame
> - * @TCP_IOV_TAP tap backend specific header
> - * @TCP_IOV_ETH Ethernet header
> - * @TCP_IOV_IP IP (v4/v6) header
> - * @TCP_IOV_PAYLOAD IP payload (TCP header + data)
> - * @TCP_NUM_IOVS the number of entries in the iovec array
> - */
> -enum tcp_iov_parts {
> - TCP_IOV_TAP = 0,
> - TCP_IOV_ETH = 1,
> - TCP_IOV_IP = 2,
> - TCP_IOV_PAYLOAD = 3,
> - TCP_NUM_IOVS
> -};
> -
> -static struct iovec tcp4_l2_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS];
> -static struct iovec tcp6_l2_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS];
> -static struct iovec tcp4_l2_flags_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS];
> -static struct iovec tcp6_l2_flags_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS];
> +char tcp_buf_discard [MAX_WINDOW];
>
> /* sendmsg() to socket */
> static struct iovec tcp_iov [UIO_MAXIOV];
> @@ -552,14 +416,6 @@ static uint32_t tcp_conn_epoll_events(uint8_t events, uint8_t conn_flags)
> return EPOLLRDHUP;
> }
>
> -static void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn,
> - unsigned long flag);
> -#define conn_flag(c, conn, flag) \
> - do { \
> - flow_trace(conn, "flag at %s:%i", __func__, __LINE__); \
> - conn_flag_do(c, conn, flag); \
> - } while (0)
> -
> /**
> * tcp_epoll_ctl() - Add/modify/delete epoll state from connection events
> * @c: Execution context
> @@ -671,8 +527,8 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
> * @conn: Connection pointer
> * @flag: Flag to set, or ~flag to unset
> */
> -static void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn,
> - unsigned long flag)
> +void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn,
> + unsigned long flag)
> {
> if (flag & (flag - 1)) {
> int flag_index = fls(~flag);
> @@ -722,8 +578,8 @@ static void tcp_hash_remove(const struct ctx *c,
> * @conn: Connection pointer
> * @event: Connection event
> */
> -static void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn,
> - unsigned long event)
> +void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn,
> + unsigned long event)
> {
> int prev, new, num = fls(event);
>
> @@ -771,12 +627,6 @@ static void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn,
> tcp_timer_ctl(c, conn);
> }
>
> -#define conn_event(c, conn, event) \
> - do { \
> - flow_trace(conn, "event at %s:%i", __func__, __LINE__); \
> - conn_event_do(c, conn, event); \
> - } while (0)
> -
> /**
> * tcp_rtt_dst_low() - Check if low RTT was seen for connection endpoint
> * @conn: Connection pointer
> @@ -906,104 +756,6 @@ static void tcp_update_check_tcp6(struct ipv6hdr *ip6h, struct tcphdr *th)
> th->check = csum(th, l4len, sum);
> }
>
> -/**
> - * tcp_update_l2_buf() - Update Ethernet header buffers with addresses
> - * @eth_d: Ethernet destination address, NULL if unchanged
> - * @eth_s: Ethernet source address, NULL if unchanged
> - */
> -void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s)
> -{
> - eth_update_mac(&tcp4_eth_src, eth_d, eth_s);
> - eth_update_mac(&tcp6_eth_src, eth_d, eth_s);
> -}
> -
> -/**
> - * tcp_sock4_iov_init() - Initialise scatter-gather L2 buffers for IPv4 sockets
> - * @c: Execution context
> - */
> -static void tcp_sock4_iov_init(const struct ctx *c)
> -{
> - struct iphdr iph = L2_BUF_IP4_INIT(IPPROTO_TCP);
> - struct iovec *iov;
> - int i;
> -
> - tcp4_eth_src.h_proto = htons_constant(ETH_P_IP);
> -
> - for (i = 0; i < ARRAY_SIZE(tcp4_payload); i++) {
> - tcp4_payload_ip[i] = iph;
> - tcp4_payload[i].th.doff = sizeof(struct tcphdr) / 4;
> - tcp4_payload[i].th.ack = 1;
> - }
> -
> - for (i = 0; i < ARRAY_SIZE(tcp4_flags); i++) {
> - tcp4_flags_ip[i] = iph;
> - tcp4_flags[i].th.doff = sizeof(struct tcphdr) / 4;
> - tcp4_flags[i].th.ack = 1;
> - }
> -
> - for (i = 0; i < TCP_FRAMES_MEM; i++) {
> - iov = tcp4_l2_iov[i];
> -
> - iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp4_payload_tap_hdr[i]);
> - iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp4_eth_src);
> - iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_payload_ip[i]);
> - iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_payload[i];
> - }
> -
> - for (i = 0; i < TCP_FRAMES_MEM; i++) {
> - iov = tcp4_l2_flags_iov[i];
> -
> - iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp4_flags_tap_hdr[i]);
> - iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src;
> - iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp4_eth_src);
> - iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_flags_ip[i]);
> - iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_flags[i];
> - }
> -}
> -
> -/**
> - * tcp_sock6_iov_init() - Initialise scatter-gather L2 buffers for IPv6 sockets
> - * @c: Execution context
> - */
> -static void tcp_sock6_iov_init(const struct ctx *c)
> -{
> - struct ipv6hdr ip6 = L2_BUF_IP6_INIT(IPPROTO_TCP);
> - struct iovec *iov;
> - int i;
> -
> - tcp6_eth_src.h_proto = htons_constant(ETH_P_IPV6);
> -
> - for (i = 0; i < ARRAY_SIZE(tcp6_payload); i++) {
> - tcp6_payload_ip[i] = ip6;
> - tcp6_payload[i].th.doff = sizeof(struct tcphdr) / 4;
> - tcp6_payload[i].th.ack = 1;
> - }
> -
> - for (i = 0; i < ARRAY_SIZE(tcp6_flags); i++) {
> - tcp6_flags_ip[i] = ip6;
> - tcp6_flags[i].th.doff = sizeof(struct tcphdr) / 4;
> - tcp6_flags[i].th .ack = 1;
> - }
> -
> - for (i = 0; i < TCP_FRAMES_MEM; i++) {
> - iov = tcp6_l2_iov[i];
> -
> - iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp6_payload_tap_hdr[i]);
> - iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp6_eth_src);
> - iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_payload_ip[i]);
> - iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_payload[i];
> - }
> -
> - for (i = 0; i < TCP_FRAMES_MEM; i++) {
> - iov = tcp6_l2_flags_iov[i];
> -
> - iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp6_flags_tap_hdr[i]);
> - iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp6_eth_src);
> - iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_flags_ip[i]);
> - iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_flags[i];
> - }
> -}
> -
> /**
> * tcp_opt_get() - Get option, and value if any, from TCP header
> * @opts: Pointer to start of TCP options in header
> @@ -1227,76 +979,6 @@ bool tcp_flow_defer(const struct tcp_tap_conn *conn)
> return true;
> }
>
> -static void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn);
> -#define tcp_rst(c, conn) \
> - do { \
> - flow_dbg((conn), "TCP reset at %s:%i", __func__, __LINE__); \
> - tcp_rst_do(c, conn); \
> - } while (0)
> -
> -/**
> - * tcp_flags_flush() - Send out buffers for segments with no data (flags)
> - * @c: Execution context
> - */
> -static void tcp_flags_flush(const struct ctx *c)
> -{
> - tap_send_frames(c, &tcp6_l2_flags_iov[0][0], TCP_NUM_IOVS,
> - tcp6_flags_used);
> - tcp6_flags_used = 0;
> -
> - tap_send_frames(c, &tcp4_l2_flags_iov[0][0], TCP_NUM_IOVS,
> - tcp4_flags_used);
> - tcp4_flags_used = 0;
> -}
> -
> -/**
> - * tcp_revert_seq() - Revert affected conn->seq_to_tap after failed transmission
> - * @conns: Array of connection pointers corresponding to queued frames
> - * @frames: Two-dimensional array containing queued frames with sub-iovs
> - * @num_frames: Number of entries in the two arrays to be compared
> - */
> -static void tcp_revert_seq(struct tcp_tap_conn **conns, struct iovec (*frames)[TCP_NUM_IOVS],
> - int num_frames)
> -{
> - int i;
> -
> - for (i = 0; i < num_frames; i++) {
> - const struct tcphdr *th = frames[i][TCP_IOV_PAYLOAD].iov_base;
> - struct tcp_tap_conn *conn = conns[i];
> - uint32_t seq = ntohl(th->seq);
> -
> - if (SEQ_LE(conn->seq_to_tap, seq))
> - continue;
> -
> - conn->seq_to_tap = seq;
> - }
> -}
> -
> -/**
> - * tcp_payload_flush() - Send out buffers for segments with data
> - * @c: Execution context
> - */
> -static void tcp_payload_flush(const struct ctx *c)
> -{
> - size_t m;
> -
> - m = tap_send_frames(c, &tcp6_l2_iov[0][0], TCP_NUM_IOVS,
> - tcp6_payload_used);
> - if (m != tcp6_payload_used) {
> - tcp_revert_seq(&tcp6_frame_conns[m], &tcp6_l2_iov[m],
> - tcp6_payload_used - m);
> - }
> - tcp6_payload_used = 0;
> -
> - m = tap_send_frames(c, &tcp4_l2_iov[0][0], TCP_NUM_IOVS,
> - tcp4_payload_used);
> - if (m != tcp4_payload_used) {
> - tcp_revert_seq(&tcp4_frame_conns[m], &tcp4_l2_iov[m],
> - tcp4_payload_used - m);
> - }
> - tcp4_payload_used = 0;
> -}
> -
> /**
> * tcp_defer_handler() - Handler for TCP deferred tasks
> * @c: Execution context
> @@ -1430,10 +1112,10 @@ static size_t tcp_fill_headers6(const struct ctx *c,
> *
> * Return: IP payload length, host order
> */
> -static size_t tcp_l2_buf_fill_headers(const struct ctx *c,
> - const struct tcp_tap_conn *conn,
> - struct iovec *iov, size_t dlen,
> - const uint16_t *check, uint32_t seq)
> +size_t tcp_l2_buf_fill_headers(const struct ctx *c,
> + const struct tcp_tap_conn *conn,
> + struct iovec *iov, size_t dlen,
> + const uint16_t *check, uint32_t seq)
> {
> const struct in_addr *a4 = inany_v4(&conn->faddr);
>
> @@ -1459,8 +1141,8 @@ static size_t tcp_l2_buf_fill_headers(const struct ctx *c,
> *
> * Return: 1 if sequence or window were updated, 0 otherwise
> */
> -static int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
> - int force_seq, struct tcp_info *tinfo)
> +int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
> + int force_seq, struct tcp_info *tinfo)
> {
> uint32_t prev_wnd_to_tap = conn->wnd_to_tap << conn->ws_to_tap;
> uint32_t prev_ack_to_tap = conn->seq_ack_to_tap;
> @@ -1579,9 +1261,9 @@ static void tcp_update_seqack_from_tap(const struct ctx *c,
> * 0 if there is no flag to send
> * 1 otherwise
> */
> -static int tcp_prepare_flags(struct ctx *c, struct tcp_tap_conn *conn,
> - int flags, struct tcphdr *th, char *data,
> - size_t *optlen)
> +int tcp_prepare_flags(struct ctx *c, struct tcp_tap_conn *conn,
> + int flags, struct tcphdr *th, char *data,
> + size_t *optlen)
> {
> struct tcp_info tinfo = { 0 };
> socklen_t sl = sizeof(tinfo);
> @@ -1678,54 +1360,9 @@ static int tcp_prepare_flags(struct ctx *c, struct tcp_tap_conn *conn,
> *
> * Return: negative error code on connection reset, 0 otherwise
> */
> -static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
> +int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
> {
> - struct tcp_flags_t *payload;
> - struct iovec *iov;
> - size_t optlen;
> - size_t l4len;
> - int ret;
> -
> - if (CONN_V4(conn))
> - iov = tcp4_l2_flags_iov[tcp4_flags_used++];
> - else
> - iov = tcp6_l2_flags_iov[tcp6_flags_used++];
> -
> - payload = iov[TCP_IOV_PAYLOAD].iov_base;
> -
> - ret = tcp_prepare_flags(c, conn, flags, &payload->th,
> - payload->opts, &optlen);
> - if (ret <= 0)
> - return ret;
> -
> - l4len = tcp_l2_buf_fill_headers(c, conn, iov, optlen, NULL,
> - conn->seq_to_tap);
> - iov[TCP_IOV_PAYLOAD].iov_len = l4len;
> -
> - if (flags & DUP_ACK) {
> - struct iovec *dup_iov;
> - int i;
> -
> - if (CONN_V4(conn))
> - dup_iov = tcp4_l2_flags_iov[tcp4_flags_used++];
> - else
> - dup_iov = tcp6_l2_flags_iov[tcp6_flags_used++];
> -
> - for (i = 0; i < TCP_NUM_IOVS; i++)
> - memcpy(dup_iov[i].iov_base, iov[i].iov_base,
> - iov[i].iov_len);
> - dup_iov[TCP_IOV_PAYLOAD].iov_len = iov[TCP_IOV_PAYLOAD].iov_len;
> - }
> -
> - if (CONN_V4(conn)) {
> - if (tcp4_flags_used > TCP_FRAMES_MEM - 2)
> - tcp_flags_flush(c);
> - } else {
> - if (tcp6_flags_used > TCP_FRAMES_MEM - 2)
> - tcp_flags_flush(c);
> - }
> -
> - return 0;
> + return tcp_buf_send_flag(c, conn, flags);
> }
>
> /**
> @@ -1733,7 +1370,7 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
> * @c: Execution context
> * @conn: Connection pointer
> */
> -static void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn)
> +void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn)
> {
> if (conn->events == CLOSED)
> return;
> @@ -2160,49 +1797,6 @@ static int tcp_sock_consume(const struct tcp_tap_conn *conn, uint32_t ack_seq)
> return 0;
> }
>
> -/**
> - * tcp_data_to_tap() - Finalise (queue) highest-numbered scatter-gather buffer
> - * @c: Execution context
> - * @conn: Connection pointer
> - * @dlen: TCP payload length
> - * @no_csum: Don't compute IPv4 checksum, use the one from previous buffer
> - * @seq: Sequence number to be sent
> - */
> -static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
> - ssize_t dlen, int no_csum, uint32_t seq)
> -{
> - struct iovec *iov;
> - size_t l4len;
> -
> - conn->seq_to_tap = seq + dlen;
> -
> - if (CONN_V4(conn)) {
> - struct iovec *iov_prev = tcp4_l2_iov[tcp4_payload_used - 1];
> - const uint16_t *check = NULL;
> -
> - if (no_csum) {
> - struct iphdr *iph = iov_prev[TCP_IOV_IP].iov_base;
> - check = &iph->check;
> - }
> -
> - tcp4_frame_conns[tcp4_payload_used] = conn;
> -
> - iov = tcp4_l2_iov[tcp4_payload_used++];
> - l4len = tcp_l2_buf_fill_headers(c, conn, iov, dlen, check, seq);
> - iov[TCP_IOV_PAYLOAD].iov_len = l4len;
> - if (tcp4_payload_used > TCP_FRAMES_MEM - 1)
> - tcp_payload_flush(c);
> - } else if (CONN_V6(conn)) {
> - tcp6_frame_conns[tcp6_payload_used] = conn;
> -
> - iov = tcp6_l2_iov[tcp6_payload_used++];
> - l4len = tcp_l2_buf_fill_headers(c, conn, iov, dlen, NULL, seq);
> - iov[TCP_IOV_PAYLOAD].iov_len = l4len;
> - if (tcp6_payload_used > TCP_FRAMES_MEM - 1)
> - tcp_payload_flush(c);
> - }
> -}
> -
> /**
> * tcp_data_from_sock() - Handle new data from socket, queue to tap, in window
> * @c: Execution context
> @@ -2214,123 +1808,7 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
> */
> static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
> {
> - uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap;
> - int fill_bufs, send_bufs = 0, last_len, iov_rem = 0;
> - int sendlen, len, dlen, v4 = CONN_V4(conn);
> - int s = conn->sock, i, ret = 0;
> - struct msghdr mh_sock = { 0 };
> - uint16_t mss = MSS_GET(conn);
> - uint32_t already_sent, seq;
> - struct iovec *iov;
> -
> - already_sent = conn->seq_to_tap - conn->seq_ack_from_tap;
> -
> - if (SEQ_LT(already_sent, 0)) {
> - /* RFC 761, section 2.1. */
> - flow_trace(conn, "ACK sequence gap: ACK for %u, sent: %u",
> - conn->seq_ack_from_tap, conn->seq_to_tap);
> - conn->seq_to_tap = conn->seq_ack_from_tap;
> - already_sent = 0;
> - }
> -
> - if (!wnd_scaled || already_sent >= wnd_scaled) {
> - conn_flag(c, conn, STALLED);
> - conn_flag(c, conn, ACK_FROM_TAP_DUE);
> - return 0;
> - }
> -
> - /* Set up buffer descriptors we'll fill completely and partially. */
> - fill_bufs = DIV_ROUND_UP(wnd_scaled - already_sent, mss);
> - if (fill_bufs > TCP_FRAMES) {
> - fill_bufs = TCP_FRAMES;
> - iov_rem = 0;
> - } else {
> - iov_rem = (wnd_scaled - already_sent) % mss;
> - }
> -
> - mh_sock.msg_iov = iov_sock;
> - mh_sock.msg_iovlen = fill_bufs + 1;
> -
> - iov_sock[0].iov_base = tcp_buf_discard;
> - iov_sock[0].iov_len = already_sent;
> -
> - if (( v4 && tcp4_payload_used + fill_bufs > TCP_FRAMES_MEM) ||
> - (!v4 && tcp6_payload_used + fill_bufs > TCP_FRAMES_MEM)) {
> - tcp_payload_flush(c);
> -
> - /* Silence Coverity CWE-125 false positive */
> - tcp4_payload_used = tcp6_payload_used = 0;
> - }
> -
> - for (i = 0, iov = iov_sock + 1; i < fill_bufs; i++, iov++) {
> - if (v4)
> - iov->iov_base = &tcp4_payload[tcp4_payload_used + i].data;
> - else
> - iov->iov_base = &tcp6_payload[tcp6_payload_used + i].data;
> - iov->iov_len = mss;
> - }
> - if (iov_rem)
> - iov_sock[fill_bufs].iov_len = iov_rem;
> -
> - /* Receive into buffers, don't dequeue until acknowledged by guest. */
> - do
> - len = recvmsg(s, &mh_sock, MSG_PEEK);
> - while (len < 0 && errno == EINTR);
> -
> - if (len < 0)
> - goto err;
> -
> - if (!len) {
> - if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) == SOCK_FIN_RCVD) {
> - if ((ret = tcp_send_flag(c, conn, FIN | ACK))) {
> - tcp_rst(c, conn);
> - return ret;
> - }
> -
> - conn_event(c, conn, TAP_FIN_SENT);
> - }
> -
> - return 0;
> - }
> -
> - sendlen = len - already_sent;
> - if (sendlen <= 0) {
> - conn_flag(c, conn, STALLED);
> - return 0;
> - }
> -
> - conn_flag(c, conn, ~STALLED);
> -
> - send_bufs = DIV_ROUND_UP(sendlen, mss);
> - last_len = sendlen - (send_bufs - 1) * mss;
> -
> - /* Likely, some new data was acked too. */
> - tcp_update_seqack_wnd(c, conn, 0, NULL);
> -
> - /* Finally, queue to tap */
> - dlen = mss;
> - seq = conn->seq_to_tap;
> - for (i = 0; i < send_bufs; i++) {
> - int no_csum = i && i != send_bufs - 1 && tcp4_payload_used;
> -
> - if (i == send_bufs - 1)
> - dlen = last_len;
> -
> - tcp_data_to_tap(c, conn, dlen, no_csum, seq);
> - seq += dlen;
> - }
> -
> - conn_flag(c, conn, ACK_FROM_TAP_DUE);
> -
> - return 0;
> -
> -err:
> - if (errno != EAGAIN && errno != EWOULDBLOCK) {
> - ret = -errno;
> - tcp_rst(c, conn);
> - }
> -
> - return ret;
> + return tcp_buf_data_from_sock(c, conn);
> }
>
> /**
> diff --git a/tcp_buf.c b/tcp_buf.c
> new file mode 100644
> index 000000000000..0c7d07b8d0bd
> --- /dev/null
> +++ b/tcp_buf.c
> @@ -0,0 +1,513 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later
> +
> +/* PASST - Plug A Simple Socket Transport
> + * for qemu/UNIX domain socket mode
> + *
> + * PASTA - Pack A Subtle Tap Abstraction
> + * for network namespace/tap device mode
> + *
> + * tcp_buf.c - TCP L2-L4 buffer management functions
This still says "L2-L4", but I don't think "L4" really makes sense
here. I can also drop "-L4" on merge.
> + *
> + * Copyright Red Hat
> + * Author: Stefano Brivio <sbrivio@redhat.com>
> + */
> +
> +#include <stddef.h>
> +#include <stdint.h>
> +#include <limits.h>
> +#include <string.h>
> +#include <errno.h>
> +
> +#include <netinet/ip.h>
> +
> +#include <linux/tcp.h>
> +
> +#include "util.h"
> +#include "ip.h"
> +#include "iov.h"
> +#include "passt.h"
> +#include "tap.h"
> +#include "siphash.h"
> +#include "inany.h"
> +#include "tcp_conn.h"
> +#include "tcp_internal.h"
> +#include "tcp_buf.h"
> +
> +#define TCP_FRAMES_MEM 128
> +#define TCP_FRAMES \
> + (c->mode == MODE_PASST ? TCP_FRAMES_MEM : 1)
> +
> +/* Static buffers */
> +/**
> + * struct tcp_payload_t - TCP header and data to send segments with payload
> + * @th: TCP header
> + * @data: TCP data
> + */
> +struct tcp_payload_t {
> + struct tcphdr th;
> + uint8_t data[IP_MAX_MTU - sizeof(struct tcphdr)];
> +#ifdef __AVX2__
> +} __attribute__ ((packed, aligned(32))); /* For AVX2 checksum routines */
> +#else
> +} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
> +#endif
> +
> +/**
> + * struct tcp_flags_t - TCP header and data to send zero-length
> + * segments (flags)
> + * @th: TCP header
> + * @opts TCP options
> + */
> +struct tcp_flags_t {
> + struct tcphdr th;
> + char opts[OPT_MSS_LEN + OPT_WS_LEN + 1];
> +#ifdef __AVX2__
> +} __attribute__ ((packed, aligned(32)));
> +#else
> +} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
> +#endif
> +
> +/* Ethernet header for IPv4 frames */
> +static struct ethhdr tcp4_eth_src;
> +
> +static struct tap_hdr tcp4_payload_tap_hdr[TCP_FRAMES_MEM];
> +/* IPv4 headers */
> +static struct iphdr tcp4_payload_ip[TCP_FRAMES_MEM];
> +/* TCP segments with payload for IPv4 frames */
> +static struct tcp_payload_t tcp4_payload[TCP_FRAMES_MEM];
> +
> +static_assert(MSS4 <= sizeof(tcp4_payload[0].data), "MSS4 is greater than 65516");
> +
> +/* References tracking the owner connection of frames in the tap outqueue */
> +static struct tcp_tap_conn *tcp4_frame_conns[TCP_FRAMES_MEM];
> +static unsigned int tcp4_payload_used;
> +
> +static struct tap_hdr tcp4_flags_tap_hdr[TCP_FRAMES_MEM];
> +/* IPv4 headers for TCP segment without payload */
> +static struct iphdr tcp4_flags_ip[TCP_FRAMES_MEM];
> +/* TCP segments without payload for IPv4 frames */
> +static struct tcp_flags_t tcp4_flags[TCP_FRAMES_MEM];
> +
> +static unsigned int tcp4_flags_used;
> +
> +/* Ethernet header for IPv6 frames */
> +static struct ethhdr tcp6_eth_src;
> +
> +static struct tap_hdr tcp6_payload_tap_hdr[TCP_FRAMES_MEM];
> +/* IPv6 headers */
> +static struct ipv6hdr tcp6_payload_ip[TCP_FRAMES_MEM];
> +/* TCP headers and data for IPv6 frames */
> +static struct tcp_payload_t tcp6_payload[TCP_FRAMES_MEM];
> +
> +static_assert(MSS6 <= sizeof(tcp6_payload[0].data), "MSS6 is greater than 65516");
> +
> +/* References tracking the owner connection of frames in the tap outqueue */
> +static struct tcp_tap_conn *tcp6_frame_conns[TCP_FRAMES_MEM];
> +static unsigned int tcp6_payload_used;
> +
> +static struct tap_hdr tcp6_flags_tap_hdr[TCP_FRAMES_MEM];
> +/* IPv6 headers for TCP segment without payload */
> +static struct ipv6hdr tcp6_flags_ip[TCP_FRAMES_MEM];
> +/* TCP segment without payload for IPv6 frames */
> +static struct tcp_flags_t tcp6_flags[TCP_FRAMES_MEM];
> +
> +static unsigned int tcp6_flags_used;
> +
> +/* recvmsg()/sendmsg() data for tap */
> +static struct iovec iov_sock [TCP_FRAMES_MEM + 1];
> +
> +static struct iovec tcp4_l2_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS];
> +static struct iovec tcp6_l2_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS];
> +static struct iovec tcp4_l2_flags_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS];
> +static struct iovec tcp6_l2_flags_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS];
> +/**
> + * tcp_update_l2_buf() - Update Ethernet header buffers with addresses
> + * @eth_d: Ethernet destination address, NULL if unchanged
> + * @eth_s: Ethernet source address, NULL if unchanged
> + */
> +void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s)
> +{
> + eth_update_mac(&tcp4_eth_src, eth_d, eth_s);
> + eth_update_mac(&tcp6_eth_src, eth_d, eth_s);
> +}
> +
> +/**
> + * tcp_sock4_iov_init() - Initialise scatter-gather L2 buffers for IPv4 sockets
> + * @c: Execution context
> + */
> +void tcp_sock4_iov_init(const struct ctx *c)
> +{
> + struct iphdr iph = L2_BUF_IP4_INIT(IPPROTO_TCP);
> + struct iovec *iov;
> + int i;
> +
> + tcp4_eth_src.h_proto = htons_constant(ETH_P_IP);
> +
> + for (i = 0; i < ARRAY_SIZE(tcp4_payload); i++) {
> + tcp4_payload_ip[i] = iph;
> + tcp4_payload[i].th.doff = sizeof(struct tcphdr) / 4;
> + tcp4_payload[i].th.ack = 1;
> + }
> +
> + for (i = 0; i < ARRAY_SIZE(tcp4_flags); i++) {
> + tcp4_flags_ip[i] = iph;
> + tcp4_flags[i].th.doff = sizeof(struct tcphdr) / 4;
> + tcp4_flags[i].th.ack = 1;
> + }
> +
> + for (i = 0; i < TCP_FRAMES_MEM; i++) {
> + iov = tcp4_l2_iov[i];
> +
> + iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp4_payload_tap_hdr[i]);
> + iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp4_eth_src);
> + iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_payload_ip[i]);
> + iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_payload[i];
> + }
> +
> + for (i = 0; i < TCP_FRAMES_MEM; i++) {
> + iov = tcp4_l2_flags_iov[i];
> +
> + iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp4_flags_tap_hdr[i]);
> + iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src;
> + iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp4_eth_src);
> + iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_flags_ip[i]);
> + iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_flags[i];
> + }
> +}
> +
> +/**
> + * tcp_sock6_iov_init() - Initialise scatter-gather L2 buffers for IPv6 sockets
> + * @c: Execution context
> + */
> +void tcp_sock6_iov_init(const struct ctx *c)
> +{
> + struct ipv6hdr ip6 = L2_BUF_IP6_INIT(IPPROTO_TCP);
> + struct iovec *iov;
> + int i;
> +
> + tcp6_eth_src.h_proto = htons_constant(ETH_P_IPV6);
> +
> + for (i = 0; i < ARRAY_SIZE(tcp6_payload); i++) {
> + tcp6_payload_ip[i] = ip6;
> + tcp6_payload[i].th.doff = sizeof(struct tcphdr) / 4;
> + tcp6_payload[i].th.ack = 1;
> + }
> +
> + for (i = 0; i < ARRAY_SIZE(tcp6_flags); i++) {
> + tcp6_flags_ip[i] = ip6;
> + tcp6_flags[i].th.doff = sizeof(struct tcphdr) / 4;
> + tcp6_flags[i].th .ack = 1;
> + }
> +
> + for (i = 0; i < TCP_FRAMES_MEM; i++) {
> + iov = tcp6_l2_iov[i];
> +
> + iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp6_payload_tap_hdr[i]);
> + iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp6_eth_src);
> + iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_payload_ip[i]);
> + iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_payload[i];
> + }
> +
> + for (i = 0; i < TCP_FRAMES_MEM; i++) {
> + iov = tcp6_l2_flags_iov[i];
> +
> + iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp6_flags_tap_hdr[i]);
> + iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp6_eth_src);
> + iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_flags_ip[i]);
> + iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_flags[i];
> + }
> +}
> +
> +/**
> + * tcp_flags_flush() - Send out buffers for segments with no data (flags)
> + * @c: Execution context
> + */
> +void tcp_flags_flush(const struct ctx *c)
> +{
> + tap_send_frames(c, &tcp6_l2_flags_iov[0][0], TCP_NUM_IOVS,
> + tcp6_flags_used);
> + tcp6_flags_used = 0;
> +
> + tap_send_frames(c, &tcp4_l2_flags_iov[0][0], TCP_NUM_IOVS,
> + tcp4_flags_used);
> + tcp4_flags_used = 0;
> +}
> +
> +/**
> + * tcp_revert_seq() - Roll back conn->seq_to_tap for frames that were not sent
> + * @conns:	Array of connection pointers, one for each queued frame
> + * @frames:	Two-dimensional array of queued frames, with their sub-iovs
> + * @num_frames:	Number of entries to examine in both arrays
> + *
> + * For each frame, the sequence number is read back from its TCP header: the
> + * connection is only ever moved backwards, to the earliest unsent sequence.
> + */
> +static void tcp_revert_seq(struct tcp_tap_conn **conns, struct iovec (*frames)[TCP_NUM_IOVS],
> +			   int num_frames)
> +{
> +	int n;
> +
> +	for (n = 0; n < num_frames; n++, conns++, frames++) {
> +		const struct tcphdr *hdr = (*frames)[TCP_IOV_PAYLOAD].iov_base;
> +		uint32_t frame_seq = ntohl(hdr->seq);
> +
> +		/* Connection is already at or before this frame: keep it */
> +		if (!SEQ_LE((*conns)->seq_to_tap, frame_seq))
> +			(*conns)->seq_to_tap = frame_seq;
> +	}
> +}
> +
> +/**
> + * tcp_payload_flush() - Send out queued frames that carry data
> + * @c:		Execution context
> + *
> + * If fewer frames than queued are sent, sequence numbers of the connections
> + * owning the unsent frames are reverted, so that data can be sent again later.
> + * Queue counters are reset in any case.
> + */
> +void tcp_payload_flush(const struct ctx *c)
> +{
> +	size_t sent;
> +
> +	sent = tap_send_frames(c, &tcp6_l2_iov[0][0], TCP_NUM_IOVS,
> +			       tcp6_payload_used);
> +	if (sent != tcp6_payload_used)
> +		tcp_revert_seq(tcp6_frame_conns + sent, tcp6_l2_iov + sent,
> +			       tcp6_payload_used - sent);
> +	tcp6_payload_used = 0;
> +
> +	sent = tap_send_frames(c, &tcp4_l2_iov[0][0], TCP_NUM_IOVS,
> +			       tcp4_payload_used);
> +	if (sent != tcp4_payload_used)
> +		tcp_revert_seq(tcp4_frame_conns + sent, tcp4_l2_iov + sent,
> +			       tcp4_payload_used - sent);
> +	tcp4_payload_used = 0;
> +}
> +
> +/**
> + * tcp_buf_send_flag() - Send segment with flags to tap (no payload)
> + * @c:		Execution context
> + * @conn:	Connection pointer
> + * @flags:	TCP flags: if not set, send segment only if ACK is due
> + *
> + * A slot in the flags buffers is reserved up front, so that headers can be
> + * built in place. If tcp_prepare_flags() reports an error, or that there's
> + * nothing to send, the slot is released again: otherwise, a frame that was
> + * never filled in would go out with the next flush.
> + *
> + * With DUP_ACK, a second slot is filled with a copy of the same frame.
> + *
> + * Return: negative error code on connection reset, 0 otherwise
> + */
> +int tcp_buf_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
> +{
> +	struct tcp_flags_t *payload;
> +	struct iovec *iov;
> +	size_t optlen;
> +	size_t l4len;
> +	int ret;
> +
> +	if (CONN_V4(conn))
> +		iov = tcp4_l2_flags_iov[tcp4_flags_used++];
> +	else
> +		iov = tcp6_l2_flags_iov[tcp6_flags_used++];
> +
> +	payload = iov[TCP_IOV_PAYLOAD].iov_base;
> +
> +	ret = tcp_prepare_flags(c, conn, flags, &payload->th,
> +				payload->opts, &optlen);
> +	if (ret <= 0) {
> +		/* Nothing was queued: give the reserved slot back */
> +		if (CONN_V4(conn))
> +			tcp4_flags_used--;
> +		else
> +			tcp6_flags_used--;
> +
> +		return ret;
> +	}
> +
> +	l4len = tcp_l2_buf_fill_headers(c, conn, iov, optlen, NULL,
> +					conn->seq_to_tap);
> +	iov[TCP_IOV_PAYLOAD].iov_len = l4len;
> +
> +	if (flags & DUP_ACK) {
> +		struct iovec *dup_iov;
> +		int i;
> +
> +		if (CONN_V4(conn))
> +			dup_iov = tcp4_l2_flags_iov[tcp4_flags_used++];
> +		else
> +			dup_iov = tcp6_l2_flags_iov[tcp6_flags_used++];
> +
> +		for (i = 0; i < TCP_NUM_IOVS; i++) {
> +			/* Parts shared between frames (the Ethernet header)
> +			 * are already in place, and memcpy() on overlapping
> +			 * objects is undefined behaviour
> +			 */
> +			if (dup_iov[i].iov_base == iov[i].iov_base)
> +				continue;
> +
> +			memcpy(dup_iov[i].iov_base, iov[i].iov_base,
> +			       iov[i].iov_len);
> +		}
> +		dup_iov[TCP_IOV_PAYLOAD].iov_len = iov[TCP_IOV_PAYLOAD].iov_len;
> +	}
> +
> +	/* Flush while there's still room for a frame plus its duplicate */
> +	if (CONN_V4(conn)) {
> +		if (tcp4_flags_used > TCP_FRAMES_MEM - 2)
> +			tcp_flags_flush(c);
> +	} else {
> +		if (tcp6_flags_used > TCP_FRAMES_MEM - 2)
> +			tcp_flags_flush(c);
> +	}
> +
> +	return 0;
> +}
> +
> +/**
> + * tcp_data_to_tap() - Finalise (queue) highest-numbered scatter-gather buffer
> + * @c:		Execution context
> + * @conn:	Connection pointer
> + * @dlen:	TCP payload length
> + * @no_csum:	Don't compute IPv4 checksum, use the one from previous buffer
> + * @seq:	Sequence number to be sent
> + *
> + * Sets conn->seq_to_tap to the end of this segment and records the connection
> + * owning the frame, so that tcp_payload_flush() can revert the sequence if the
> + * frame can't be sent. Buffers are flushed once all of them are in use.
> + */
> +static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
> +			    ssize_t dlen, int no_csum, uint32_t seq)
> +{
> +	struct iovec *iov;
> +	size_t l4len;
> +
> +	conn->seq_to_tap = seq + dlen;
> +
> +	if (CONN_V4(conn)) {
> +		const uint16_t *check = NULL;
> +
> +		/* Look at the previous frame only if there is one: with an
> +		 * empty queue, index -1 would be out of bounds
> +		 */
> +		if (no_csum && tcp4_payload_used) {
> +			struct iovec *iov_prev;
> +			struct iphdr *iph;
> +
> +			iov_prev = tcp4_l2_iov[tcp4_payload_used - 1];
> +			iph = iov_prev[TCP_IOV_IP].iov_base;
> +			check = &iph->check;
> +		}
> +
> +		tcp4_frame_conns[tcp4_payload_used] = conn;
> +
> +		iov = tcp4_l2_iov[tcp4_payload_used++];
> +		l4len = tcp_l2_buf_fill_headers(c, conn, iov, dlen, check, seq);
> +		iov[TCP_IOV_PAYLOAD].iov_len = l4len;
> +		if (tcp4_payload_used > TCP_FRAMES_MEM - 1)
> +			tcp_payload_flush(c);
> +	} else {
> +		/* CONN_V6() is defined as !CONN_V4(): no further check needed */
> +		tcp6_frame_conns[tcp6_payload_used] = conn;
> +
> +		iov = tcp6_l2_iov[tcp6_payload_used++];
> +		l4len = tcp_l2_buf_fill_headers(c, conn, iov, dlen, NULL, seq);
> +		iov[TCP_IOV_PAYLOAD].iov_len = l4len;
> +		if (tcp6_payload_used > TCP_FRAMES_MEM - 1)
> +			tcp_payload_flush(c);
> +	}
> +}
> +
> +/**
> + * tcp_buf_data_from_sock() - Handle new data from socket, queue to tap, in window
> + * @c: Execution context
> + * @conn: Connection pointer
> + *
> + * Data is read with MSG_PEEK, so it stays queued on the socket: the first
> + * receive buffer is a discard area sized to skip what was already sent to the
> + * tap, and only the remainder is split into frames of at most one MSS each.
> + * If the window is closed or already filled, the connection is flagged as
> + * STALLED and nothing is read.
> + *
> + * Return: negative on connection reset, 0 otherwise
> + *
> + * #syscalls recvmsg
> + */
> +int tcp_buf_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
> +{
> + uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap;
> + int fill_bufs, send_bufs = 0, last_len, iov_rem = 0;
> + int sendlen, len, dlen, v4 = CONN_V4(conn);
> + int s = conn->sock, i, ret = 0;
> + struct msghdr mh_sock = { 0 };
> + uint16_t mss = MSS_GET(conn);
> + uint32_t already_sent, seq;
> + struct iovec *iov;
> +
> + already_sent = conn->seq_to_tap - conn->seq_ack_from_tap; /* sent to tap, not acknowledged yet */
> +
> + if (SEQ_LT(already_sent, 0)) {
> + /* RFC 761, section 2.1. */
> + flow_trace(conn, "ACK sequence gap: ACK for %u, sent: %u",
> + conn->seq_ack_from_tap, conn->seq_to_tap);
> + conn->seq_to_tap = conn->seq_ack_from_tap;
> + already_sent = 0;
> + }
> +
> + if (!wnd_scaled || already_sent >= wnd_scaled) { /* no room left in window */
> + conn_flag(c, conn, STALLED);
> + conn_flag(c, conn, ACK_FROM_TAP_DUE);
> + return 0;
> + }
> +
> + /* Set up buffer descriptors we'll fill completely and partially. */
> + fill_bufs = DIV_ROUND_UP(wnd_scaled - already_sent, mss);
> + if (fill_bufs > TCP_FRAMES) {
> + fill_bufs = TCP_FRAMES;
> + iov_rem = 0;
> + } else {
> + iov_rem = (wnd_scaled - already_sent) % mss; /* partial last buffer, up to window edge */
> + }
> +
> + mh_sock.msg_iov = iov_sock;
> + mh_sock.msg_iovlen = fill_bufs + 1; /* one extra entry for the discard buffer */
> +
> + iov_sock[0].iov_base = tcp_buf_discard;
> + iov_sock[0].iov_len = already_sent; /* skip over data already sent to tap */
> +
> + if (( v4 && tcp4_payload_used + fill_bufs > TCP_FRAMES_MEM) ||
> + (!v4 && tcp6_payload_used + fill_bufs > TCP_FRAMES_MEM)) {
> + tcp_payload_flush(c);
> +
> + /* Silence Coverity CWE-125 false positive */
> + tcp4_payload_used = tcp6_payload_used = 0;
> + }
> +
> + for (i = 0, iov = iov_sock + 1; i < fill_bufs; i++, iov++) {
> + if (v4)
> + iov->iov_base = &tcp4_payload[tcp4_payload_used + i].data;
> + else
> + iov->iov_base = &tcp6_payload[tcp6_payload_used + i].data;
> + iov->iov_len = mss;
> + }
> + if (iov_rem)
> + iov_sock[fill_bufs].iov_len = iov_rem; /* shorten the last buffer set up above */
> +
> + /* Receive into buffers, don't dequeue until acknowledged by guest. */
> + do
> + len = recvmsg(s, &mh_sock, MSG_PEEK);
> + while (len < 0 && errno == EINTR);
> +
> + if (len < 0)
> + goto err;
> +
> + if (!len) {
> + if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) == SOCK_FIN_RCVD) { /* FIN received, not forwarded yet */
> + if ((ret = tcp_buf_send_flag(c, conn, FIN | ACK))) {
> + tcp_rst(c, conn);
> + return ret;
> + }
> +
> + conn_event(c, conn, TAP_FIN_SENT);
> + }
> +
> + return 0;
> + }
> +
> + sendlen = len - already_sent; /* peeked length includes the discarded part */
> + if (sendlen <= 0) {
> + conn_flag(c, conn, STALLED);
> + return 0;
> + }
> +
> + conn_flag(c, conn, ~STALLED);
> +
> + send_bufs = DIV_ROUND_UP(sendlen, mss);
> + last_len = sendlen - (send_bufs - 1) * mss;
> +
> + /* Likely, some new data was acked too. */
> + tcp_update_seqack_wnd(c, conn, 0, NULL);
> +
> + /* Finally, queue to tap */
> + dlen = mss;
> + seq = conn->seq_to_tap;
> + for (i = 0; i < send_bufs; i++) {
> + int no_csum = i && i != send_bufs - 1 && tcp4_payload_used; /* not for first or last frame, needs a previous frame queued */
> +
> + if (i == send_bufs - 1)
> + dlen = last_len;
> +
> + tcp_data_to_tap(c, conn, dlen, no_csum, seq);
> + seq += dlen;
> + }
> +
> + conn_flag(c, conn, ACK_FROM_TAP_DUE);
> +
> + return 0;
> +
> +err:
> + if (errno != EAGAIN && errno != EWOULDBLOCK) { /* anything else resets the connection */
> + ret = -errno;
> + tcp_rst(c, conn);
> + }
> +
> + return ret;
> +}
> diff --git a/tcp_buf.h b/tcp_buf.h
> new file mode 100644
> index 000000000000..14be7b945285
> --- /dev/null
> +++ b/tcp_buf.h
> @@ -0,0 +1,16 @@
> +/* SPDX-License-Identifier: GPL-2.0-or-later
> + * Copyright (c) 2021 Red Hat GmbH
> + * Author: Stefano Brivio <sbrivio@redhat.com>
> + */
> +
> +#ifndef TCP_BUF_H
> +#define TCP_BUF_H
> +
> +void tcp_sock4_iov_init(const struct ctx *c);
> +void tcp_sock6_iov_init(const struct ctx *c); /* set up scatter-gather L2 buffers for IPv6 */
> +void tcp_flags_flush(const struct ctx *c); /* send queued frames without data (flags) */
> +void tcp_payload_flush(const struct ctx *c); /* send queued frames with data */
> +int tcp_buf_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn); /* negative on connection reset, 0 otherwise */
> +int tcp_buf_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags); /* negative on connection reset, 0 otherwise */
> +
> +#endif /* TCP_BUF_H */
> diff --git a/tcp_internal.h b/tcp_internal.h
> new file mode 100644
> index 000000000000..51aaa16918cf
> --- /dev/null
> +++ b/tcp_internal.h
> @@ -0,0 +1,96 @@
> +/* SPDX-License-Identifier: GPL-2.0-or-later
> + * Copyright (c) 2021 Red Hat GmbH
> + * Author: Stefano Brivio <sbrivio@redhat.com>
> + */
> +
> +#ifndef TCP_INTERNAL_H
> +#define TCP_INTERNAL_H
> +
> +#define MAX_WS 8 /* shift added to the 16-bit window in MAX_WINDOW; presumably the window scaling limit */
> +#define MAX_WINDOW (1 << (16 + (MAX_WS)))
> +
> +#define MSS4 ROUND_DOWN(IP_MAX_MTU - \
> + sizeof(struct tcphdr) - \
> + sizeof(struct iphdr), \
> + sizeof(uint32_t)) /* IP_MAX_MTU less TCP and IPv4 headers, rounded down to 4 bytes */
> +#define MSS6 ROUND_DOWN(IP_MAX_MTU - \
> + sizeof(struct tcphdr) - \
> + sizeof(struct ipv6hdr), \
> + sizeof(uint32_t)) /* IP_MAX_MTU less TCP and IPv6 headers, rounded down to 4 bytes */
> +
> +#define SEQ_LE(a, b) ((b) - (a) < MAX_WINDOW) /* a <= b: b is less than MAX_WINDOW ahead of a; assumes unsigned wrap-around */
> +#define SEQ_LT(a, b) ((b) - (a) - 1 < MAX_WINDOW) /* a < b, same as above but excluding equality */
> +#define SEQ_GE(a, b) ((a) - (b) < MAX_WINDOW) /* a >= b */
> +#define SEQ_GT(a, b) ((a) - (b) - 1 < MAX_WINDOW) /* a > b */
> +
> +#define FIN (1 << 0)
> +#define SYN (1 << 1)
> +#define RST (1 << 2)
> +#define ACK (1 << 4)
> +
> +/* Flags for internal usage */
> +#define DUP_ACK (1 << 5) /* tcp_buf_send_flag() queues the frame twice when this is set */
> +#define OPT_EOL 0
> +#define OPT_NOP 1
> +#define OPT_MSS 2
> +#define OPT_MSS_LEN 4
> +#define OPT_WS 3
> +#define OPT_WS_LEN 3
> +#define OPT_SACKP 4
> +#define OPT_SACK 5
> +#define OPT_TS 8
> +#define CONN_V4(conn) (!!inany_v4(&(conn)->faddr)) /* non-zero if inany_v4() is non-zero for faddr */
> +#define CONN_V6(conn) (!CONN_V4(conn)) /* exact complement of CONN_V4() */
> +
> +/**
> + * enum tcp_iov_parts - I/O vector parts for one TCP frame
> + * @TCP_IOV_TAP tap backend specific header
> + * @TCP_IOV_ETH Ethernet header
> + * @TCP_IOV_IP IP (v4/v6) header
> + * @TCP_IOV_PAYLOAD IP payload (TCP header + data)
> + * @TCP_NUM_IOVS the number of entries in the iovec array
> + */
> +enum tcp_iov_parts {
> + TCP_IOV_TAP = 0,
> + TCP_IOV_ETH = 1,
> + TCP_IOV_IP = 2,
> + TCP_IOV_PAYLOAD = 3,
> + TCP_NUM_IOVS
> +};
> +
> +extern char tcp_buf_discard [MAX_WINDOW]; /* receive target for data to be skipped, see tcp_buf_data_from_sock() */
> +
> +void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn,
> + unsigned long flag); /* call via conn_flag() to have the call site traced */
> +#define conn_flag(c, conn, flag) \
> + do { \
> + flow_trace(conn, "flag at %s:%i", __func__, __LINE__); \
> + conn_flag_do(c, conn, flag); \
> + } while (0)
> +
> +
> +void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn,
> + unsigned long event); /* call via conn_event() to have the call site traced */
> +#define conn_event(c, conn, event) \
> + do { \
> + flow_trace(conn, "event at %s:%i", __func__, __LINE__); \
> + conn_event_do(c, conn, event); \
> + } while (0)
> +
> +void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn); /* call via tcp_rst() to have the call site logged */
> +#define tcp_rst(c, conn) \
> + do { \
> + flow_dbg((conn), "TCP reset at %s:%i", __func__, __LINE__); \
> + tcp_rst_do(c, conn); \
> + } while (0)
> +
> +size_t tcp_l2_buf_fill_headers(const struct ctx *c,
> + const struct tcp_tap_conn *conn,
> + struct iovec *iov, size_t dlen,
> + const uint16_t *check, uint32_t seq); /* callers use the return value as length of the TCP_IOV_PAYLOAD part */
> +int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
> + int force_seq, struct tcp_info *tinfo);
> +int tcp_prepare_flags(struct ctx *c, struct tcp_tap_conn *conn, int flags,
> + struct tcphdr *th, char *data, size_t *optlen); /* tcp_buf_send_flag() queues a frame only if this returns > 0 */
> +
> +#endif /* TCP_INTERNAL_H */
--
Stefano
next prev parent reply other threads:[~2024-06-12 15:54 UTC|newest]
Thread overview: 25+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-06-12 15:47 [PATCH v6 0/8] Add vhost-user support to passt (part 2) Laurent Vivier
2024-06-12 15:47 ` [PATCH v6 1/8] tcp: extract buffer management from tcp_send_flag() Laurent Vivier
2024-06-12 21:22 ` Stefano Brivio
2024-06-13 6:07 ` Stefano Brivio
2024-06-13 8:24 ` Laurent Vivier
2024-06-13 10:14 ` Stefano Brivio
2024-06-13 10:22 ` Laurent Vivier
2024-06-13 10:49 ` Stefano Brivio
2024-06-13 10:58 ` Laurent Vivier
2024-06-13 7:31 ` Laurent Vivier
2024-06-13 9:35 ` Stefano Brivio
2024-06-13 14:36 ` David Gibson
2024-06-12 15:47 ` [PATCH v6 2/8] tcp: move buffers management functions to their own file Laurent Vivier
2024-06-12 15:54 ` Stefano Brivio [this message]
2024-06-12 15:47 ` [PATCH v6 3/8] tap: refactor packets handling functions Laurent Vivier
2024-06-12 15:52 ` Stefano Brivio
2024-06-12 16:00 ` Laurent Vivier
2024-06-12 15:47 ` [PATCH v6 4/8] udp: refactor UDP header update functions Laurent Vivier
2024-06-12 15:47 ` [PATCH v6 5/8] udp: rename udp_sock_handler() to udp_buf_sock_handler() Laurent Vivier
2024-06-12 15:47 ` [PATCH v6 6/8] vhost-user: compare mode MODE_PASTA and not MODE_PASST Laurent Vivier
2024-06-12 15:47 ` [PATCH v6 7/8] iov: remove iov_copy() Laurent Vivier
2024-06-12 15:47 ` [PATCH v6 8/8] tap: use in->buf_size rather than sizeof(pkt_buf) Laurent Vivier
2024-06-12 17:16 ` [PATCH v6 0/8] Add vhost-user support to passt (part 2) Stefano Brivio
2024-06-12 17:37 ` Stefano Brivio
2024-06-12 21:23 ` Stefano Brivio
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20240612175405.1711bc90@elisabeth \
--to=sbrivio@redhat.com \
--cc=lvivier@redhat.com \
--cc=passt-dev@passt.top \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
Code repositories for project(s) associated with this public inbox
https://passt.top/passt
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for IMAP folder(s).