On Fri, Feb 02, 2024 at 03:11:49PM +0100, Laurent Vivier wrote:
> Signed-off-by: Laurent Vivier <lvivier@redhat.com>
> ---
>  Makefile |   6 +-
>  tcp.c    |  66 +++++---
>  tcp_vu.c | 447 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  tcp_vu.h |  10 ++
>  4 files changed, 502 insertions(+), 27 deletions(-)
>  create mode 100644 tcp_vu.c
>  create mode 100644 tcp_vu.h
> 
> diff --git a/Makefile b/Makefile
> index 2016b071ddf2..f7a403d19b61 100644
> --- a/Makefile
> +++ b/Makefile
> @@ -47,7 +47,7 @@ FLAGS += -DDUAL_STACK_SOCKETS=$(DUAL_STACK_SOCKETS)
>  PASST_SRCS = arch.c arp.c checksum.c conf.c dhcp.c dhcpv6.c flow.c icmp.c \
>  	igmp.c isolation.c lineread.c log.c mld.c ndp.c netlink.c packet.c \
>  	passt.c pasta.c pcap.c pif.c port_fwd.c tap.c tcp.c tcp_splice.c \
> -	tcp_buf.c udp.c util.c iov.c ip.c virtio.c vhost_user.c
> +	tcp_buf.c tcp_vu.c udp.c util.c iov.c ip.c virtio.c vhost_user.c
>  QRAP_SRCS = qrap.c
>  SRCS = $(PASST_SRCS) $(QRAP_SRCS)
>  
> @@ -56,8 +56,8 @@ MANPAGES = passt.1 pasta.1 qrap.1
>  PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h flow.h \
>  	flow_table.h icmp.h inany.h isolation.h lineread.h log.h ndp.h \
>  	netlink.h packet.h passt.h pasta.h pcap.h pif.h port_fwd.h siphash.h \
> -	tap.h tcp.h tcp_conn.h tcp_splice.h tcp_buf.h tcp_internal.h udp.h \
> -	util.h iov.h ip.h virtio.h vhost_user.h
> +	tap.h tcp.h tcp_conn.h tcp_splice.h tcp_buf.h tcp_vu.h tcp_internal.h \
> +	udp.h util.h iov.h ip.h virtio.h vhost_user.h
>  HEADERS = $(PASST_HEADERS) seccomp.h
>  
>  C := \#include <linux/tcp.h>\nstruct tcp_info x = { .tcpi_snd_wnd = 0 };
> diff --git a/tcp.c b/tcp.c
> index b6aca9f37f19..e829e12fe7c2 100644
> --- a/tcp.c
> +++ b/tcp.c
> @@ -302,6 +302,7 @@
>  #include "flow_table.h"
>  #include "tcp_internal.h"
>  #include "tcp_buf.h"
> +#include "tcp_vu.h"
>  
>  /* Sides of a flow as we use them in "tap" connections */
>  #define	SOCKSIDE	0
> @@ -1034,7 +1035,7 @@ size_t ipv4_fill_headers(const struct ctx *c,
>  	tcp_set_tcp_header(th, conn, seq);
>  
>  	th->check = 0;
> -	if (c->mode != MODE_VU || *c->pcap)
> +	if (c->mode != MODE_VU)
>  		th->check = tcp_update_check_tcp4(iph);
>  
>  	return ip_len;
> @@ -1072,7 +1073,7 @@ size_t ipv6_fill_headers(const struct ctx *c,
>  	tcp_set_tcp_header(th, conn, seq);
>  
>  	th->check = 0;
> -	if (c->mode != MODE_VU || *c->pcap)
> +	if (c->mode != MODE_VU)
>  		th->check = tcp_update_check_tcp6(ip6h);
>  
>  	ip6h->hop_limit = 255;
> @@ -1302,6 +1303,12 @@ int do_tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags,
>  	return 1;
>  }
>  
> +int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
> +{
> +	if (c->mode == MODE_VU)
> +		return tcp_vu_send_flag(c, conn, flags);
> +	return tcp_buf_send_flag(c, conn, flags);

Your previous renames to "tcp_buf" make some more sense to me now.
It's not so much that the "tcp_buf" functions are related to buffer
management but they belong to the (linear, passt-managed) buffer
implementation of TCP.  I see the rationale, but I still don't really
like the name - I don't think that the connection from "tcp_buf" to,
"TCP code specific to several but not all L2 interface
implementations" is at all obvious.  Not that a good way of conveying
that quickly occurs to me.  For the time being, I'm inclined to just
stick with "tcp", or maybe "tcp_default" for the existing (tuntap &
qemu socket) implementations and tcp_vu for the new ones.  That can
maybe cleaned up with a more systematic division of L2 interface types
(on my list...).

> +}
>  
>  /**
>   * tcp_rst_do() - Reset a tap connection: send RST segment to tap, close socket
> @@ -1313,7 +1320,7 @@ void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn)
>  	if (conn->events == CLOSED)
>  		return;
>  
> -	if (!tcp_buf_send_flag(c, conn, RST))
> +	if (!tcp_send_flag(c, conn, RST))
>  		conn_event(c, conn, CLOSED);
>  }
>  
> @@ -1430,7 +1437,8 @@ int tcp_conn_new_sock(const struct ctx *c, sa_family_t af)
>   *
>   * Return: clamped MSS value
>   */
> -static uint16_t tcp_conn_tap_mss(const struct tcp_tap_conn *conn,
> +static uint16_t tcp_conn_tap_mss(const struct ctx *c,
> +				 const struct tcp_tap_conn *conn,
>  				 const char *opts, size_t optlen)
>  {
>  	unsigned int mss;
> @@ -1441,7 +1449,10 @@ static uint16_t tcp_conn_tap_mss(const struct tcp_tap_conn *conn,
>  	else
>  		mss = ret;
>  
> -	mss = MIN(tcp_buf_conn_tap_mss(conn), mss);
> +	if (c->mode == MODE_VU)
> +		mss = MIN(tcp_vu_conn_tap_mss(conn), mss);
> +	else
> +		mss = MIN(tcp_buf_conn_tap_mss(conn), mss);

This seems oddly complex.  What are the actual circumstances in which
the VU mss would differ from other cases?

>  	return MIN(mss, USHRT_MAX);
>  }
> @@ -1568,7 +1579,7 @@ static void tcp_conn_from_tap(struct ctx *c,
>  
>  	conn->wnd_to_tap = WINDOW_DEFAULT;
>  
> -	mss = tcp_conn_tap_mss(conn, opts, optlen);
> +	mss = tcp_conn_tap_mss(c, conn, opts, optlen);
>  	if (setsockopt(s, SOL_TCP, TCP_MAXSEG, &mss, sizeof(mss)))
>  		flow_trace(conn, "failed to set TCP_MAXSEG on socket %i", s);
>  	MSS_SET(conn, mss);
> @@ -1625,7 +1636,7 @@ static void tcp_conn_from_tap(struct ctx *c,
>  	} else {
>  		tcp_get_sndbuf(conn);
>  
> -		if (tcp_buf_send_flag(c, conn, SYN | ACK))
> +		if (tcp_send_flag(c, conn, SYN | ACK))
>  			return;
>  
>  		conn_event(c, conn, TAP_SYN_ACK_SENT);
> @@ -1673,6 +1684,13 @@ static int tcp_sock_consume(const struct tcp_tap_conn *conn, uint32_t ack_seq)
>  	return 0;
>  }
>  
> +static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
> +{
> +	if (c->mode == MODE_VU)
> +		return tcp_vu_data_from_sock(c, conn);
> +
> +	return tcp_buf_data_from_sock(c, conn);
> +}
>  
>  /**
>   * tcp_data_from_tap() - tap/guest data for established connection
> @@ -1806,7 +1824,7 @@ static int tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn,
>  			   max_ack_seq, conn->seq_to_tap);
>  		conn->seq_ack_from_tap = max_ack_seq;
>  		conn->seq_to_tap = max_ack_seq;
> -		tcp_buf_data_from_sock(c, conn);
> +		tcp_data_from_sock(c, conn);

In particular having changed all these calls from tcp_ to tcp_buf_ and
now changing them back seems like churn that it would be nice to
avoid.

>  	}
>  
>  	if (!iov_i)
> @@ -1822,14 +1840,14 @@ eintr:
>  			 *   Then swiftly looked away and left.
>  			 */
>  			conn->seq_from_tap = seq_from_tap;
> -			tcp_buf_send_flag(c, conn, ACK);
> +			tcp_send_flag(c, conn, ACK);
>  		}
>  
>  		if (errno == EINTR)
>  			goto eintr;
>  
>  		if (errno == EAGAIN || errno == EWOULDBLOCK) {
> -			tcp_buf_send_flag(c, conn, ACK_IF_NEEDED);
> +			tcp_send_flag(c, conn, ACK_IF_NEEDED);
>  			return p->count - idx;
>  
>  		}
> @@ -1839,7 +1857,7 @@ eintr:
>  	if (n < (int)(seq_from_tap - conn->seq_from_tap)) {
>  		partial_send = 1;
>  		conn->seq_from_tap += n;
> -		tcp_buf_send_flag(c, conn, ACK_IF_NEEDED);
> +		tcp_send_flag(c, conn, ACK_IF_NEEDED);
>  	} else {
>  		conn->seq_from_tap += n;
>  	}
> @@ -1852,7 +1870,7 @@ out:
>  		 */
>  		if (conn->seq_dup_ack_approx != (conn->seq_from_tap & 0xff)) {
>  			conn->seq_dup_ack_approx = conn->seq_from_tap & 0xff;
> -			tcp_buf_send_flag(c, conn, DUP_ACK);
> +			tcp_send_flag(c, conn, DUP_ACK);
>  		}
>  		return p->count - idx;
>  	}
> @@ -1866,7 +1884,7 @@ out:
>  
>  		conn_event(c, conn, TAP_FIN_RCVD);
>  	} else {
> -		tcp_buf_send_flag(c, conn, ACK_IF_NEEDED);
> +		tcp_send_flag(c, conn, ACK_IF_NEEDED);
>  	}
>  
>  	return p->count - idx;
> @@ -1891,7 +1909,7 @@ static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_tap_conn *conn,
>  	if (!(conn->wnd_from_tap >>= conn->ws_from_tap))
>  		conn->wnd_from_tap = 1;
>  
> -	MSS_SET(conn, tcp_conn_tap_mss(conn, opts, optlen));
> +	MSS_SET(conn, tcp_conn_tap_mss(c, conn, opts, optlen));
>  
>  	conn->seq_init_from_tap = ntohl(th->seq) + 1;
>  	conn->seq_from_tap = conn->seq_init_from_tap;
> @@ -1902,8 +1920,8 @@ static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_tap_conn *conn,
>  	/* The client might have sent data already, which we didn't
>  	 * dequeue waiting for SYN,ACK from tap -- check now.
>  	 */
> -	tcp_buf_data_from_sock(c, conn);
> -	tcp_buf_send_flag(c, conn, ACK);
> +	tcp_data_from_sock(c, conn);
> +	tcp_send_flag(c, conn, ACK);
>  }
>  
>  /**
> @@ -1983,7 +2001,7 @@ int tcp_tap_handler(struct ctx *c, uint8_t pif, int af,
>  			conn->seq_from_tap++;
>  
>  			shutdown(conn->sock, SHUT_WR);
> -			tcp_buf_send_flag(c, conn, ACK);
> +			tcp_send_flag(c, conn, ACK);
>  			conn_event(c, conn, SOCK_FIN_SENT);
>  
>  			return 1;
> @@ -1994,7 +2012,7 @@ int tcp_tap_handler(struct ctx *c, uint8_t pif, int af,
>  
>  		tcp_tap_window_update(conn, ntohs(th->window));
>  
> -		tcp_buf_data_from_sock(c, conn);
> +		tcp_data_from_sock(c, conn);
>  
>  		if (p->count - idx == 1)
>  			return 1;
> @@ -2024,7 +2042,7 @@ int tcp_tap_handler(struct ctx *c, uint8_t pif, int af,
>  	if ((conn->events & TAP_FIN_RCVD) && !(conn->events & SOCK_FIN_SENT)) {
>  		shutdown(conn->sock, SHUT_WR);
>  		conn_event(c, conn, SOCK_FIN_SENT);
> -		tcp_buf_send_flag(c, conn, ACK);
> +		tcp_send_flag(c, conn, ACK);
>  		ack_due = 0;
>  	}
>  
> @@ -2058,7 +2076,7 @@ static void tcp_connect_finish(struct ctx *c, struct tcp_tap_conn *conn)
>  		return;
>  	}
>  
> -	if (tcp_buf_send_flag(c, conn, SYN | ACK))
> +	if (tcp_send_flag(c, conn, SYN | ACK))
>  		return;
>  
>  	conn_event(c, conn, TAP_SYN_ACK_SENT);
> @@ -2126,7 +2144,7 @@ static void tcp_tap_conn_from_sock(struct ctx *c,
>  
>  	conn->wnd_from_tap = WINDOW_DEFAULT;
>  
> -	tcp_buf_send_flag(c, conn, SYN);
> +	tcp_send_flag(c, conn, SYN);
>  	conn_flag(c, conn, ACK_FROM_TAP_DUE);
>  
>  	tcp_get_sndbuf(conn);
> @@ -2190,7 +2208,7 @@ void tcp_timer_handler(struct ctx *c, union epoll_ref ref)
>  		return;
>  
>  	if (conn->flags & ACK_TO_TAP_DUE) {
> -		tcp_buf_send_flag(c, conn, ACK_IF_NEEDED);
> +		tcp_send_flag(c, conn, ACK_IF_NEEDED);
>  		tcp_timer_ctl(c, conn);
>  	} else if (conn->flags & ACK_FROM_TAP_DUE) {
>  		if (!(conn->events & ESTABLISHED)) {
> @@ -2206,7 +2224,7 @@ void tcp_timer_handler(struct ctx *c, union epoll_ref ref)
>  			flow_dbg(conn, "ACK timeout, retry");
>  			conn->retrans++;
>  			conn->seq_to_tap = conn->seq_ack_from_tap;
> -			tcp_buf_data_from_sock(c, conn);
> +			tcp_data_from_sock(c, conn);
>  			tcp_timer_ctl(c, conn);
>  		}
>  	} else {
> @@ -2261,7 +2279,7 @@ void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events)
>  			conn_event(c, conn, SOCK_FIN_RCVD);
>  
>  		if (events & EPOLLIN)
> -			tcp_buf_data_from_sock(c, conn);
> +			tcp_data_from_sock(c, conn);
>  
>  		if (events & EPOLLOUT)
>  			tcp_update_seqack_wnd(c, conn, 0, NULL);
> diff --git a/tcp_vu.c b/tcp_vu.c
> new file mode 100644
> index 000000000000..ed59b21cabdc
> --- /dev/null
> +++ b/tcp_vu.c
> @@ -0,0 +1,447 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later

Copyright notice.


> +#include <errno.h>
> +#include <stddef.h>
> +#include <stdint.h>
> +
> +#include <netinet/ip.h>
> +
> +#include <sys/socket.h>
> +
> +#include <linux/tcp.h>
> +#include <linux/virtio_net.h>
> +
> +#include "util.h"
> +#include "ip.h"
> +#include "passt.h"
> +#include "siphash.h"
> +#include "inany.h"
> +#include "vhost_user.h"
> +#include "tcp.h"
> +#include "pcap.h"
> +#include "flow.h"
> +#include "tcp_conn.h"
> +#include "flow_table.h"
> +#include "tcp_vu.h"
> +#include "tcp_internal.h"
> +#include "checksum.h"
> +
> +#define CONN_V4(conn)		(!!inany_v4(&(conn)->faddr))
> +#define CONN_V6(conn)		(!CONN_V4(conn))

I don't love having these duplicated in two .c files.  However, it
might become irrelevant as I move towards having v4/v6 become implicit
in the common flow addresses.

> +/* vhost-user */
> +static const struct virtio_net_hdr vu_header = {
> +	.flags = VIRTIO_NET_HDR_F_DATA_VALID,
> +	.gso_type = VIRTIO_NET_HDR_GSO_NONE,
> +};
> +
> +static unsigned char buffer[65536];
> +static struct iovec	iov_vu			[VIRTQUEUE_MAX_SIZE];
> +static unsigned int	indexes			[VIRTQUEUE_MAX_SIZE];
> +
> +uint16_t tcp_vu_conn_tap_mss(const struct tcp_tap_conn *conn)
> +{
> +	(void)conn;
> +	return USHRT_MAX;
> +}
> +
> +int tcp_vu_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
> +{
> +	VuDev *vdev = (VuDev *)&c->vdev;
> +	VuVirtqElement *elem;
> +	VuVirtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
> +	struct virtio_net_hdr_mrg_rxbuf *vh;
> +	size_t tlen, vnet_hdrlen, ip_len, optlen = 0;
> +	struct ethhdr *eh;
> +	int ret;
> +	int nb_ack;
> +
> +	elem = vu_queue_pop(vdev, vq, sizeof(VuVirtqElement), buffer);
> +	if (!elem)
> +		return 0;
> +
> +	if (elem->in_num < 1) {
> +		err("virtio-net receive queue contains no in buffers");
> +		vu_queue_rewind(vdev, vq, 1);
> +		return 0;
> +	}
> +
> +	/* Options: MSS, NOP and window scale (8 bytes) */
> +	if (flags & SYN)
> +		optlen = OPT_MSS_LEN + 1 + OPT_WS_LEN;

Given the number of subtle TCP bugs we've had to squash, it would be
really nice if we could avoid duplicating TCP logic between paths.
Could we make some abstraction that takes an iov, but can be also
called from the non-vu case with a 1-entry iov representing a single
buffer?

> +	vh = elem->in_sg[0].iov_base;
> +
> +	vh->hdr = vu_header;
> +	if (vu_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF)) {
> +		vnet_hdrlen = sizeof(struct virtio_net_hdr_mrg_rxbuf);
> +		vh->num_buffers = htole16(1);
> +	} else {
> +		vnet_hdrlen = sizeof(struct virtio_net_hdr);
> +	}
> +	eh = (struct ethhdr *)((char *)elem->in_sg[0].iov_base + vnet_hdrlen);

Ah... hmm.. I already had hope to clean up handling different L2 and
below headers for the different "tap" types.  We basically have ugly
hacks to deal with the difference between tuntap (plain ethernet) and
qemu socket (ethernet + length header).  Now we're adding vhost-user
(ethernet + vhost header), which is a similar issue.  Abstracting this
could also make it pretty easy to support further "tap" interfaces: a
different hypervisor socket transfer with slightly different header,
tuntap in "tun" mode (raw IP without ethernet headers), SLIP or PPP,
...

> +	memcpy(eh->h_dest, c->mac_guest, sizeof(eh->h_dest));
> +	memcpy(eh->h_source, c->mac, sizeof(eh->h_source));
> +
> +	if (CONN_V4(conn)) {
> +		struct iphdr *iph = (struct iphdr *)(eh + 1);
> +		struct tcphdr *th = (struct tcphdr *)(iph + 1);

Hmm.. did I miss logic to check that there's room for the vhost +
ethernet + IP  + TCP headers in the first iov element?

> +		char *data = (char *)(th + 1);
> +
> +		eh->h_proto = htons(ETH_P_IP);
> +
> +		*th = (struct tcphdr){
> +			.doff = sizeof(struct tcphdr) / 4,
> +			.ack = 1
> +		};
> +
> +		*iph = (struct iphdr)L2_BUF_IP4_INIT(IPPROTO_TCP);
> +
> +		ret = do_tcp_send_flag(c, conn, flags, th, data, optlen);
> +		if (ret <= 0) {
> +			vu_queue_rewind(vdev, vq, 1);
> +			return ret;
> +		}
> +
> +		ip_len = ipv4_fill_headers(c, conn, iph, optlen, NULL,
> +					   conn->seq_to_tap);
> +
> +		tlen =  ip_len + sizeof(struct ethhdr);
> +
> +		if (*c->pcap) {
> +			uint32_t sum = proto_ipv4_header_checksum(iph, IPPROTO_TCP);
> +
> +			th->check = csum(th, optlen + sizeof(struct tcphdr), sum);
> +		}
> +	} else {
> +		struct ipv6hdr *ip6h = (struct ipv6hdr *)(eh + 1);
> +		struct tcphdr *th = (struct tcphdr *)(ip6h + 1);
> +		char *data = (char *)(th + 1);
> +
> +		eh->h_proto = htons(ETH_P_IPV6);
> +
> +		*th = (struct tcphdr){
> +			.doff = sizeof(struct tcphdr) / 4,
> +			.ack = 1
> +		};
> +
> +		*ip6h = (struct ipv6hdr)L2_BUF_IP6_INIT(IPPROTO_TCP);
> +
> +		ret = do_tcp_send_flag(c, conn, flags, th, data, optlen);
> +		if (ret <= 0) {
> +			vu_queue_rewind(vdev, vq, 1);
> +			return ret;
> +		}
> +
> +		ip_len = ipv6_fill_headers(c, conn, ip6h, optlen,
> +					   conn->seq_to_tap);
> +
> +		tlen =  ip_len + sizeof(struct ethhdr);
> +
> +		if (*c->pcap) {
> +			uint32_t sum = proto_ipv6_header_checksum(ip6h, IPPROTO_TCP);
> +
> +			th->check = csum(th, optlen + sizeof(struct tcphdr), sum);
> +		}
> +	}
> +
> +	pcap((void *)eh, tlen);
> +
> +	tlen += vnet_hdrlen;
> +	vu_queue_fill(vdev, vq, elem, tlen, 0);
> +	nb_ack = 1;
> +
> +	if (flags & DUP_ACK) {
> +		elem = vu_queue_pop(vdev, vq, sizeof(VuVirtqElement), buffer);
> +		if (elem) {
> +			if (elem->in_num < 1 || elem->in_sg[0].iov_len < tlen) {
> +				vu_queue_rewind(vdev, vq, 1);
> +			} else {
> +				memcpy(elem->in_sg[0].iov_base, vh, tlen);
> +				nb_ack++;
> +			}
> +		}
> +	}
> +
> +	vu_queue_flush(vdev, vq, nb_ack);
> +	vu_queue_notify(vdev, vq);
> +
> +	return 0;
> +}
> +
> +int tcp_vu_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
> +{
> +	uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap;
> +	uint32_t already_sent;
> +	VuDev *vdev = (VuDev *)&c->vdev;
> +	VuVirtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
> +	int s = conn->sock, v4 = CONN_V4(conn);
> +	int i, ret = 0, iov_count, iov_used;
> +	struct msghdr mh_sock = { 0 };
> +	size_t l2_hdrlen, vnet_hdrlen, fillsize;
> +	ssize_t len;
> +	uint16_t *check;
> +	uint16_t mss = MSS_GET(conn);
> +	int num_buffers;
> +	int segment_size;
> +	struct iovec *first;
> +	bool has_mrg_rxbuf;
> +
> +	if (!vu_queue_enabled(vq) || !vu_queue_started(vq)) {
> +		err("Got packet, but no available descriptors on RX virtq.");
> +		return 0;
> +	}
> +
> +	already_sent = conn->seq_to_tap - conn->seq_ack_from_tap;
> +
> +	if (SEQ_LT(already_sent, 0)) {
> +		/* RFC 761, section 2.1. */
> +		flow_trace(conn, "ACK sequence gap: ACK for %u, sent: %u",
> +			   conn->seq_ack_from_tap, conn->seq_to_tap);
> +		conn->seq_to_tap = conn->seq_ack_from_tap;
> +		already_sent = 0;
> +	}
> +
> +	if (!wnd_scaled || already_sent >= wnd_scaled) {
> +		conn_flag(c, conn, STALLED);
> +		conn_flag(c, conn, ACK_FROM_TAP_DUE);
> +		return 0;
> +	}
> +
> +	/* Set up buffer descriptors we'll fill completely and partially. */
> +
> +	fillsize = wnd_scaled;
> +
> +	iov_vu[0].iov_base = tcp_buf_discard;
> +	iov_vu[0].iov_len = already_sent;
> +	fillsize -= already_sent;
> +
> +	has_mrg_rxbuf = vu_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF);
> +	if (has_mrg_rxbuf) {
> +		vnet_hdrlen = sizeof(struct virtio_net_hdr_mrg_rxbuf);
> +	} else {
> +		vnet_hdrlen = sizeof(struct virtio_net_hdr);
> +	}

passt style (unlike qemu) does not put braces on 1-line blocks.

> +	l2_hdrlen = vnet_hdrlen + sizeof(struct ethhdr) + sizeof(struct tcphdr);

That seems like a misleading variable name.  The ethernet headers are
certainly L2.  Including the lower level headers in L2 is reasonable,
but the IP and TCP headers are L3 and L4 headers respectively.

> +	if (v4) {
> +		l2_hdrlen += sizeof(struct iphdr);
> +	} else {
> +		l2_hdrlen += sizeof(struct ipv6hdr);
> +	}
> +
> +	iov_count = 0;
> +	segment_size = 0;
> +	while (fillsize > 0 && iov_count < VIRTQUEUE_MAX_SIZE - 1) {
> +		VuVirtqElement *elem;
> +
> +		elem = vu_queue_pop(vdev, vq, sizeof(VuVirtqElement), buffer);
> +		if (!elem)
> +			break;
> +
> +		if (elem->in_num < 1) {
> +			err("virtio-net receive queue contains no in buffers");
> +			goto err;
> +		}
> +
> +		ASSERT(elem->in_num == 1);
> +		ASSERT(elem->in_sg[0].iov_len >= l2_hdrlen);
> +
> +		indexes[iov_count] = elem->index;
> +
> +		if (segment_size == 0) {
> +			iov_vu[iov_count + 1].iov_base =
> +					(char *)elem->in_sg[0].iov_base + l2_hdrlen;
> +			iov_vu[iov_count + 1].iov_len =
> +					elem->in_sg[0].iov_len - l2_hdrlen;
> +		} else {
> +			iov_vu[iov_count + 1].iov_base = elem->in_sg[0].iov_base;
> +			iov_vu[iov_count + 1].iov_len = elem->in_sg[0].iov_len;
> +		}
> +
> +		if (iov_vu[iov_count + 1].iov_len > fillsize)
> +			 iov_vu[iov_count + 1].iov_len = fillsize;
> +
> +		segment_size += iov_vu[iov_count + 1].iov_len;
> +		if (!has_mrg_rxbuf) {
> +			segment_size = 0;
> +		} else if (segment_size >= mss) {
> +			iov_vu[iov_count + 1].iov_len -= segment_size - mss;
> +			segment_size = 0;
> +		}
> +		fillsize -= iov_vu[iov_count + 1].iov_len;
> +
> +		iov_count++;
> +	}
> +	if (iov_count == 0)
> +		return 0;
> +
> +	mh_sock.msg_iov = iov_vu;
> +	mh_sock.msg_iovlen = iov_count + 1;
> +
> +	do
> +		len = recvmsg(s, &mh_sock, MSG_PEEK);
> +	while (len < 0 && errno == EINTR);
> +
> +	if (len < 0)
> +		goto err;
> +
> +	if (!len) {
> +		vu_queue_rewind(vdev, vq, iov_count);
> +		if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) == SOCK_FIN_RCVD) {
> +			if ((ret = tcp_vu_send_flag(c, conn, FIN | ACK))) {
> +				tcp_rst(c, conn);
> +				return ret;
> +			}
> +
> +			conn_event(c, conn, TAP_FIN_SENT);
> +		}
> +
> +		return 0;
> +	}
> +
> +	len -= already_sent;
> +	if (len <= 0) {
> +		conn_flag(c, conn, STALLED);
> +		vu_queue_rewind(vdev, vq, iov_count);
> +		return 0;
> +	}
> +
> +	conn_flag(c, conn, ~STALLED);
> +
> +	/* Likely, some new data was acked too. */
> +	tcp_update_seqack_wnd(c, conn, 0, NULL);
> +
> +	/* initialize headers */
> +	iov_used = 0;
> +	num_buffers = 0;
> +	check = NULL;
> +	segment_size = 0;
> +	for (i = 0; i < iov_count && len; i++) {
> +
> +		if (segment_size == 0)
> +			first = &iov_vu[i + 1];
> +
> +		if (iov_vu[i + 1].iov_len > (size_t)len)
> +			iov_vu[i + 1].iov_len = len;
> +
> +		len -= iov_vu[i + 1].iov_len;
> +		iov_used++;
> +
> +		segment_size += iov_vu[i + 1].iov_len;
> +		num_buffers++;
> +
> +		if (segment_size >= mss || len == 0 ||
> +		    i + 1 == iov_count || !has_mrg_rxbuf) {
> +
> +			struct ethhdr *eh;
> +			struct virtio_net_hdr_mrg_rxbuf *vh;
> +			char *base = (char *)first->iov_base - l2_hdrlen;
> +			size_t size = first->iov_len + l2_hdrlen;
> +
> +			vh = (struct virtio_net_hdr_mrg_rxbuf *)base;
> +
> +			vh->hdr = vu_header;
> +			if (has_mrg_rxbuf)
> +				vh->num_buffers = htole16(num_buffers);
> +
> +			eh = (struct ethhdr *)((char *)base + vnet_hdrlen);
> +
> +			memcpy(eh->h_dest, c->mac_guest, sizeof(eh->h_dest));
> +			memcpy(eh->h_source, c->mac, sizeof(eh->h_source));
> +
> +			/* initialize header */
> +			if (v4) {
> +				struct iphdr *iph = (struct iphdr *)(eh + 1);
> +				struct tcphdr *th = (struct tcphdr *)(iph + 1);
> +
> +				eh->h_proto = htons(ETH_P_IP);
> +
> +				*th = (struct tcphdr){
> +					.doff = sizeof(struct tcphdr) / 4,
> +					.ack = 1
> +				};
> +
> +				*iph = (struct iphdr)L2_BUF_IP4_INIT(IPPROTO_TCP);
> +
> +				ipv4_fill_headers(c, conn, iph, segment_size,
> +						len ? check : NULL, conn->seq_to_tap);
> +
> +				if (*c->pcap) {
> +					uint32_t sum = proto_ipv4_header_checksum(iph, IPPROTO_TCP);
> +
> +					first->iov_base = th;
> +					first->iov_len = size - l2_hdrlen + sizeof(*th);
> +
> +					th->check = csum_iov(first, num_buffers, sum);
> +				}
> +
> +				check = &iph->check;
> +			} else {
> +				struct ipv6hdr *ip6h = (struct ipv6hdr *)(eh + 1);
> +				struct tcphdr *th = (struct tcphdr *)(ip6h + 1);
> +
> +				eh->h_proto = htons(ETH_P_IPV6);
> +
> +				*th = (struct tcphdr){
> +					.doff = sizeof(struct tcphdr) / 4,
> +					.ack = 1
> +				};
> +
> +				*ip6h = (struct ipv6hdr)L2_BUF_IP6_INIT(IPPROTO_TCP);
> +
> +				ipv6_fill_headers(c, conn, ip6h, segment_size,
> +						conn->seq_to_tap);
> +				if (*c->pcap) {
> +					uint32_t sum = proto_ipv6_header_checksum(ip6h, IPPROTO_TCP);
> +
> +					first->iov_base = th;
> +					first->iov_len = size - l2_hdrlen + sizeof(*th);
> +
> +					th->check = csum_iov(first, num_buffers, sum);
> +				}
> +			}
> +
> +			/* set iov for pcap logging */
> +			first->iov_base = eh;
> +			first->iov_len = size - vnet_hdrlen;
> +
> +			pcap_iov(first, num_buffers);
> +
> +			/* set iov_len for vu_queue_fill_by_index(); */
> +
> +			first->iov_base = base;
> +			first->iov_len = size;
> +
> +			conn->seq_to_tap += segment_size;
> +
> +			segment_size = 0;
> +			num_buffers = 0;
> +		}
> +	}
> +
> +	/* release unused buffers */
> +	vu_queue_rewind(vdev, vq, iov_count - iov_used);
> +
> +	/* send packets */
> +	for (i = 0; i < iov_used; i++) {
> +		vu_queue_fill_by_index(vdev, vq, indexes[i],
> +				       iov_vu[i + 1].iov_len, i);
> +	}
> +
> +	vu_queue_flush(vdev, vq, iov_used);
> +	vu_queue_notify(vdev, vq);
> +
> +	conn_flag(c, conn, ACK_FROM_TAP_DUE);
> +
> +	return 0;
> +err:
> +	vu_queue_rewind(vdev, vq, iov_count);
> +
> +	if (errno != EAGAIN && errno != EWOULDBLOCK) {
> +		ret = -errno;
> +		tcp_rst(c, conn);
> +	}
> +
> +	return ret;
> +}
> diff --git a/tcp_vu.h b/tcp_vu.h
> new file mode 100644
> index 000000000000..8045a6e3edb8
> --- /dev/null
> +++ b/tcp_vu.h
> @@ -0,0 +1,10 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later
> +
> +#ifndef TCP_VU_H
> +#define TCP_VU_H
> +
> +uint16_t tcp_vu_conn_tap_mss(const struct tcp_tap_conn *conn);
> +int tcp_vu_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags);
> +int tcp_vu_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn);
> +
> +#endif  /*TCP_VU_H */

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson