Re: [PATCH 22/24] tcp: vhost-user RX nocopy

public inbox for passt-dev@passt.top
 help / color / mirror / code / Atom feed

From: David Gibson <david@gibson.dropbear.id.au>
To: Laurent Vivier <lvivier@redhat.com>
Cc: passt-dev@passt.top
Subject: Re: [PATCH 22/24] tcp: vhost-user RX nocopy
Date: Fri, 9 Feb 2024 15:57:51 +1100	[thread overview]
Message-ID: <ZcWwz80xHORGbY5F@zatzit> (raw)
In-Reply-To: <20240202141151.3762941-23-lvivier@redhat.com>

[-- Attachment #1: Type: text/plain, Size: 25634 bytes --]

On Fri, Feb 02, 2024 at 03:11:49PM +0100, Laurent Vivier wrote:
> Signed-off-by: Laurent Vivier <lvivier@redhat.com>
> ---
>  Makefile |   6 +-
>  tcp.c    |  66 +++++---
>  tcp_vu.c | 447 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  tcp_vu.h |  10 ++
>  4 files changed, 502 insertions(+), 27 deletions(-)
>  create mode 100644 tcp_vu.c
>  create mode 100644 tcp_vu.h
> 
> diff --git a/Makefile b/Makefile
> index 2016b071ddf2..f7a403d19b61 100644
> --- a/Makefile
> +++ b/Makefile
> @@ -47,7 +47,7 @@ FLAGS += -DDUAL_STACK_SOCKETS=$(DUAL_STACK_SOCKETS)
>  PASST_SRCS = arch.c arp.c checksum.c conf.c dhcp.c dhcpv6.c flow.c icmp.c \
>  	igmp.c isolation.c lineread.c log.c mld.c ndp.c netlink.c packet.c \
>  	passt.c pasta.c pcap.c pif.c port_fwd.c tap.c tcp.c tcp_splice.c \
> -	tcp_buf.c udp.c util.c iov.c ip.c virtio.c vhost_user.c
> +	tcp_buf.c tcp_vu.c udp.c util.c iov.c ip.c virtio.c vhost_user.c
>  QRAP_SRCS = qrap.c
>  SRCS = $(PASST_SRCS) $(QRAP_SRCS)
>  
> @@ -56,8 +56,8 @@ MANPAGES = passt.1 pasta.1 qrap.1
>  PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h flow.h \
>  	flow_table.h icmp.h inany.h isolation.h lineread.h log.h ndp.h \
>  	netlink.h packet.h passt.h pasta.h pcap.h pif.h port_fwd.h siphash.h \
> -	tap.h tcp.h tcp_conn.h tcp_splice.h tcp_buf.h tcp_internal.h udp.h \
> -	util.h iov.h ip.h virtio.h vhost_user.h
> +	tap.h tcp.h tcp_conn.h tcp_splice.h tcp_buf.h tcp_vu.h tcp_internal.h \
> +	udp.h util.h iov.h ip.h virtio.h vhost_user.h
>  HEADERS = $(PASST_HEADERS) seccomp.h
>  
>  C := \#include <linux/tcp.h>\nstruct tcp_info x = { .tcpi_snd_wnd = 0 };
> diff --git a/tcp.c b/tcp.c
> index b6aca9f37f19..e829e12fe7c2 100644
> --- a/tcp.c
> +++ b/tcp.c
> @@ -302,6 +302,7 @@
>  #include "flow_table.h"
>  #include "tcp_internal.h"
>  #include "tcp_buf.h"
> +#include "tcp_vu.h"
>  
>  /* Sides of a flow as we use them in "tap" connections */
>  #define	SOCKSIDE	0
> @@ -1034,7 +1035,7 @@ size_t ipv4_fill_headers(const struct ctx *c,
>  	tcp_set_tcp_header(th, conn, seq);
>  
>  	th->check = 0;
> -	if (c->mode != MODE_VU || *c->pcap)
> +	if (c->mode != MODE_VU)
>  		th->check = tcp_update_check_tcp4(iph);
>  
>  	return ip_len;
> @@ -1072,7 +1073,7 @@ size_t ipv6_fill_headers(const struct ctx *c,
>  	tcp_set_tcp_header(th, conn, seq);
>  
>  	th->check = 0;
> -	if (c->mode != MODE_VU || *c->pcap)
> +	if (c->mode != MODE_VU)
>  		th->check = tcp_update_check_tcp6(ip6h);
>  
>  	ip6h->hop_limit = 255;
> @@ -1302,6 +1303,12 @@ int do_tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags,
>  	return 1;
>  }
>  
> +int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
> +{
> +	if (c->mode == MODE_VU)
> +		return tcp_vu_send_flag(c, conn, flags);
> +	return tcp_buf_send_flag(c, conn, flags);

Your previous renames to "tcp_buf" make some more sense to me now.
It's not so much that the "tcp_buf" functions are related to buffer
management but they belong to the (linear, passt-managed) buffer
implementation of TCP.  I see the rationale, but I still don't really
like the name - I don't think that the connection from "tcp_buf" to,
"TCP code specific to several but not all L2 interface
implementations" is at all obvious.  Not that a good way of conveying
that quickly occurs to me.  For the time being, I'm inclined to just
stick with "tcp", or maybe "tcp_default" for the existing (tuntap &
qemu socket) implementations and tcp_vu for the new ones.  That can
maybe cleaned up with a more systematic division of L2 interface types
(on my list...).

> +}
>  
>  /**
>   * tcp_rst_do() - Reset a tap connection: send RST segment to tap, close socket
> @@ -1313,7 +1320,7 @@ void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn)
>  	if (conn->events == CLOSED)
>  		return;
>  
> -	if (!tcp_buf_send_flag(c, conn, RST))
> +	if (!tcp_send_flag(c, conn, RST))
>  		conn_event(c, conn, CLOSED);
>  }
>  
> @@ -1430,7 +1437,8 @@ int tcp_conn_new_sock(const struct ctx *c, sa_family_t af)
>   *
>   * Return: clamped MSS value
>   */
> -static uint16_t tcp_conn_tap_mss(const struct tcp_tap_conn *conn,
> +static uint16_t tcp_conn_tap_mss(const struct ctx *c,
> +				 const struct tcp_tap_conn *conn,
>  				 const char *opts, size_t optlen)
>  {
>  	unsigned int mss;
> @@ -1441,7 +1449,10 @@ static uint16_t tcp_conn_tap_mss(const struct tcp_tap_conn *conn,
>  	else
>  		mss = ret;
>  
> -	mss = MIN(tcp_buf_conn_tap_mss(conn), mss);
> +	if (c->mode == MODE_VU)
> +		mss = MIN(tcp_vu_conn_tap_mss(conn), mss);
> +	else
> +		mss = MIN(tcp_buf_conn_tap_mss(conn), mss);

This seems oddly complex.  What are the actual circumstances in which
the VU mss would differ from other cases?

>  	return MIN(mss, USHRT_MAX);
>  }
> @@ -1568,7 +1579,7 @@ static void tcp_conn_from_tap(struct ctx *c,
>  
>  	conn->wnd_to_tap = WINDOW_DEFAULT;
>  
> -	mss = tcp_conn_tap_mss(conn, opts, optlen);
> +	mss = tcp_conn_tap_mss(c, conn, opts, optlen);
>  	if (setsockopt(s, SOL_TCP, TCP_MAXSEG, &mss, sizeof(mss)))
>  		flow_trace(conn, "failed to set TCP_MAXSEG on socket %i", s);
>  	MSS_SET(conn, mss);
> @@ -1625,7 +1636,7 @@ static void tcp_conn_from_tap(struct ctx *c,
>  	} else {
>  		tcp_get_sndbuf(conn);
>  
> -		if (tcp_buf_send_flag(c, conn, SYN | ACK))
> +		if (tcp_send_flag(c, conn, SYN | ACK))
>  			return;
>  
>  		conn_event(c, conn, TAP_SYN_ACK_SENT);
> @@ -1673,6 +1684,13 @@ static int tcp_sock_consume(const struct tcp_tap_conn *conn, uint32_t ack_seq)
>  	return 0;
>  }
>  
> +static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
> +{
> +	if (c->mode == MODE_VU)
> +		return tcp_vu_data_from_sock(c, conn);
> +
> +	return tcp_buf_data_from_sock(c, conn);
> +}
>  
>  /**
>   * tcp_data_from_tap() - tap/guest data for established connection
> @@ -1806,7 +1824,7 @@ static int tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn,
>  			   max_ack_seq, conn->seq_to_tap);
>  		conn->seq_ack_from_tap = max_ack_seq;
>  		conn->seq_to_tap = max_ack_seq;
> -		tcp_buf_data_from_sock(c, conn);
> +		tcp_data_from_sock(c, conn);

In particular having changed all these calls from tcp_ to tcp_buf_ and
now changing them back seems like churn that it would be nice to
avoid.

>  	}
>  
>  	if (!iov_i)
> @@ -1822,14 +1840,14 @@ eintr:
>  			 *   Then swiftly looked away and left.
>  			 */
>  			conn->seq_from_tap = seq_from_tap;
> -			tcp_buf_send_flag(c, conn, ACK);
> +			tcp_send_flag(c, conn, ACK);
>  		}
>  
>  		if (errno == EINTR)
>  			goto eintr;
>  
>  		if (errno == EAGAIN || errno == EWOULDBLOCK) {
> -			tcp_buf_send_flag(c, conn, ACK_IF_NEEDED);
> +			tcp_send_flag(c, conn, ACK_IF_NEEDED);
>  			return p->count - idx;
>  
>  		}
> @@ -1839,7 +1857,7 @@ eintr:
>  	if (n < (int)(seq_from_tap - conn->seq_from_tap)) {
>  		partial_send = 1;
>  		conn->seq_from_tap += n;
> -		tcp_buf_send_flag(c, conn, ACK_IF_NEEDED);
> +		tcp_send_flag(c, conn, ACK_IF_NEEDED);
>  	} else {
>  		conn->seq_from_tap += n;
>  	}
> @@ -1852,7 +1870,7 @@ out:
>  		 */
>  		if (conn->seq_dup_ack_approx != (conn->seq_from_tap & 0xff)) {
>  			conn->seq_dup_ack_approx = conn->seq_from_tap & 0xff;
> -			tcp_buf_send_flag(c, conn, DUP_ACK);
> +			tcp_send_flag(c, conn, DUP_ACK);
>  		}
>  		return p->count - idx;
>  	}
> @@ -1866,7 +1884,7 @@ out:
>  
>  		conn_event(c, conn, TAP_FIN_RCVD);
>  	} else {
> -		tcp_buf_send_flag(c, conn, ACK_IF_NEEDED);
> +		tcp_send_flag(c, conn, ACK_IF_NEEDED);
>  	}
>  
>  	return p->count - idx;
> @@ -1891,7 +1909,7 @@ static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_tap_conn *conn,
>  	if (!(conn->wnd_from_tap >>= conn->ws_from_tap))
>  		conn->wnd_from_tap = 1;
>  
> -	MSS_SET(conn, tcp_conn_tap_mss(conn, opts, optlen));
> +	MSS_SET(conn, tcp_conn_tap_mss(c, conn, opts, optlen));
>  
>  	conn->seq_init_from_tap = ntohl(th->seq) + 1;
>  	conn->seq_from_tap = conn->seq_init_from_tap;
> @@ -1902,8 +1920,8 @@ static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_tap_conn *conn,
>  	/* The client might have sent data already, which we didn't
>  	 * dequeue waiting for SYN,ACK from tap -- check now.
>  	 */
> -	tcp_buf_data_from_sock(c, conn);
> -	tcp_buf_send_flag(c, conn, ACK);
> +	tcp_data_from_sock(c, conn);
> +	tcp_send_flag(c, conn, ACK);
>  }
>  
>  /**
> @@ -1983,7 +2001,7 @@ int tcp_tap_handler(struct ctx *c, uint8_t pif, int af,
>  			conn->seq_from_tap++;
>  
>  			shutdown(conn->sock, SHUT_WR);
> -			tcp_buf_send_flag(c, conn, ACK);
> +			tcp_send_flag(c, conn, ACK);
>  			conn_event(c, conn, SOCK_FIN_SENT);
>  
>  			return 1;
> @@ -1994,7 +2012,7 @@ int tcp_tap_handler(struct ctx *c, uint8_t pif, int af,
>  
>  		tcp_tap_window_update(conn, ntohs(th->window));
>  
> -		tcp_buf_data_from_sock(c, conn);
> +		tcp_data_from_sock(c, conn);
>  
>  		if (p->count - idx == 1)
>  			return 1;
> @@ -2024,7 +2042,7 @@ int tcp_tap_handler(struct ctx *c, uint8_t pif, int af,
>  	if ((conn->events & TAP_FIN_RCVD) && !(conn->events & SOCK_FIN_SENT)) {
>  		shutdown(conn->sock, SHUT_WR);
>  		conn_event(c, conn, SOCK_FIN_SENT);
> -		tcp_buf_send_flag(c, conn, ACK);
> +		tcp_send_flag(c, conn, ACK);
>  		ack_due = 0;
>  	}
>  
> @@ -2058,7 +2076,7 @@ static void tcp_connect_finish(struct ctx *c, struct tcp_tap_conn *conn)
>  		return;
>  	}
>  
> -	if (tcp_buf_send_flag(c, conn, SYN | ACK))
> +	if (tcp_send_flag(c, conn, SYN | ACK))
>  		return;
>  
>  	conn_event(c, conn, TAP_SYN_ACK_SENT);
> @@ -2126,7 +2144,7 @@ static void tcp_tap_conn_from_sock(struct ctx *c,
>  
>  	conn->wnd_from_tap = WINDOW_DEFAULT;
>  
> -	tcp_buf_send_flag(c, conn, SYN);
> +	tcp_send_flag(c, conn, SYN);
>  	conn_flag(c, conn, ACK_FROM_TAP_DUE);
>  
>  	tcp_get_sndbuf(conn);
> @@ -2190,7 +2208,7 @@ void tcp_timer_handler(struct ctx *c, union epoll_ref ref)
>  		return;
>  
>  	if (conn->flags & ACK_TO_TAP_DUE) {
> -		tcp_buf_send_flag(c, conn, ACK_IF_NEEDED);
> +		tcp_send_flag(c, conn, ACK_IF_NEEDED);
>  		tcp_timer_ctl(c, conn);
>  	} else if (conn->flags & ACK_FROM_TAP_DUE) {
>  		if (!(conn->events & ESTABLISHED)) {
> @@ -2206,7 +2224,7 @@ void tcp_timer_handler(struct ctx *c, union epoll_ref ref)
>  			flow_dbg(conn, "ACK timeout, retry");
>  			conn->retrans++;
>  			conn->seq_to_tap = conn->seq_ack_from_tap;
> -			tcp_buf_data_from_sock(c, conn);
> +			tcp_data_from_sock(c, conn);
>  			tcp_timer_ctl(c, conn);
>  		}
>  	} else {
> @@ -2261,7 +2279,7 @@ void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events)
>  			conn_event(c, conn, SOCK_FIN_RCVD);
>  
>  		if (events & EPOLLIN)
> -			tcp_buf_data_from_sock(c, conn);
> +			tcp_data_from_sock(c, conn);
>  
>  		if (events & EPOLLOUT)
>  			tcp_update_seqack_wnd(c, conn, 0, NULL);
> diff --git a/tcp_vu.c b/tcp_vu.c
> new file mode 100644
> index 000000000000..ed59b21cabdc
> --- /dev/null
> +++ b/tcp_vu.c
> @@ -0,0 +1,447 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later

Copyright notice.


> +#include <errno.h>
> +#include <stddef.h>
> +#include <stdint.h>
> +
> +#include <netinet/ip.h>
> +
> +#include <sys/socket.h>
> +
> +#include <linux/tcp.h>
> +#include <linux/virtio_net.h>
> +
> +#include "util.h"
> +#include "ip.h"
> +#include "passt.h"
> +#include "siphash.h"
> +#include "inany.h"
> +#include "vhost_user.h"
> +#include "tcp.h"
> +#include "pcap.h"
> +#include "flow.h"
> +#include "tcp_conn.h"
> +#include "flow_table.h"
> +#include "tcp_vu.h"
> +#include "tcp_internal.h"
> +#include "checksum.h"
> +
> +#define CONN_V4(conn)		(!!inany_v4(&(conn)->faddr))
> +#define CONN_V6(conn)		(!CONN_V4(conn))

I don't love having these duplicated in two .c files.  However, it
might become irrelevant as I move towards having v4/v6 become implicit
in the common flow addresses.

> +/* vhost-user */
> +static const struct virtio_net_hdr vu_header = {
> +	.flags = VIRTIO_NET_HDR_F_DATA_VALID,
> +	.gso_type = VIRTIO_NET_HDR_GSO_NONE,
> +};
> +
> +static unsigned char buffer[65536];
> +static struct iovec	iov_vu			[VIRTQUEUE_MAX_SIZE];
> +static unsigned int	indexes			[VIRTQUEUE_MAX_SIZE];
> +
> +uint16_t tcp_vu_conn_tap_mss(const struct tcp_tap_conn *conn)
> +{
> +	(void)conn;
> +	return USHRT_MAX;
> +}
> +
> +int tcp_vu_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
> +{
> +	VuDev *vdev = (VuDev *)&c->vdev;
> +	VuVirtqElement *elem;
> +	VuVirtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
> +	struct virtio_net_hdr_mrg_rxbuf *vh;
> +	size_t tlen, vnet_hdrlen, ip_len, optlen = 0;
> +	struct ethhdr *eh;
> +	int ret;
> +	int nb_ack;
> +
> +	elem = vu_queue_pop(vdev, vq, sizeof(VuVirtqElement), buffer);
> +	if (!elem)
> +		return 0;
> +
> +	if (elem->in_num < 1) {
> +		err("virtio-net receive queue contains no in buffers");
> +		vu_queue_rewind(vdev, vq, 1);
> +		return 0;
> +	}
> +
> +	/* Options: MSS, NOP and window scale (8 bytes) */
> +	if (flags & SYN)
> +		optlen = OPT_MSS_LEN + 1 + OPT_WS_LEN;

Given the number of subtle TCP bugs we've had to squash, it would be
really nice if we could avoid duplicating TCP logic between paths.
Could we make some abstraction that takes an iov, but can be also
called from the non-vu case with a 1-entry iov representing a single
buffer?

> +	vh = elem->in_sg[0].iov_base;
> +
> +	vh->hdr = vu_header;
> +	if (vu_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF)) {
> +		vnet_hdrlen = sizeof(struct virtio_net_hdr_mrg_rxbuf);
> +		vh->num_buffers = htole16(1);
> +	} else {
> +		vnet_hdrlen = sizeof(struct virtio_net_hdr);
> +	}
> +	eh = (struct ethhdr *)((char *)elem->in_sg[0].iov_base + vnet_hdrlen);

Ah... hmm.. I already had hope to clean up handling different L2 and
below headers for the different "tap" types.  We basically have ugly
hacks to deal with the difference between tuntap (plain ethernet) and
qemu socket (ethernet + length header).  Now we're adding vhost-user
(ethernet + vhost header), which is a similar issue.  Abstracting this
could also make it pretty easy to support further "tap" interfaces: a
different hypervisor socket transfer with slightly different header,
tuntap in "tun" mode (raw IP without ethernet headers), SLIP or PPP,
...

> +	memcpy(eh->h_dest, c->mac_guest, sizeof(eh->h_dest));
> +	memcpy(eh->h_source, c->mac, sizeof(eh->h_source));
> +
> +	if (CONN_V4(conn)) {
> +		struct iphdr *iph = (struct iphdr *)(eh + 1);
> +		struct tcphdr *th = (struct tcphdr *)(iph + 1);

Hmm.. did I miss logic to check that there's room for the vhost +
ethernet + IP  + TCP headers in the first iov element?

> +		char *data = (char *)(th + 1);
> +
> +		eh->h_proto = htons(ETH_P_IP);
> +
> +		*th = (struct tcphdr){
> +			.doff = sizeof(struct tcphdr) / 4,
> +			.ack = 1
> +		};
> +
> +		*iph = (struct iphdr)L2_BUF_IP4_INIT(IPPROTO_TCP);
> +
> +		ret = do_tcp_send_flag(c, conn, flags, th, data, optlen);
> +		if (ret <= 0) {
> +			vu_queue_rewind(vdev, vq, 1);
> +			return ret;
> +		}
> +
> +		ip_len = ipv4_fill_headers(c, conn, iph, optlen, NULL,
> +					   conn->seq_to_tap);
> +
> +		tlen =  ip_len + sizeof(struct ethhdr);
> +
> +		if (*c->pcap) {
> +			uint32_t sum = proto_ipv4_header_checksum(iph, IPPROTO_TCP);
> +
> +			th->check = csum(th, optlen + sizeof(struct tcphdr), sum);
> +		}
> +	} else {
> +		struct ipv6hdr *ip6h = (struct ipv6hdr *)(eh + 1);
> +		struct tcphdr *th = (struct tcphdr *)(ip6h + 1);
> +		char *data = (char *)(th + 1);
> +
> +		eh->h_proto = htons(ETH_P_IPV6);
> +
> +		*th = (struct tcphdr){
> +			.doff = sizeof(struct tcphdr) / 4,
> +			.ack = 1
> +		};
> +
> +		*ip6h = (struct ipv6hdr)L2_BUF_IP6_INIT(IPPROTO_TCP);
> +
> +		ret = do_tcp_send_flag(c, conn, flags, th, data, optlen);
> +		if (ret <= 0) {
> +			vu_queue_rewind(vdev, vq, 1);
> +			return ret;
> +		}
> +
> +		ip_len = ipv6_fill_headers(c, conn, ip6h, optlen,
> +					   conn->seq_to_tap);
> +
> +		tlen =  ip_len + sizeof(struct ethhdr);
> +
> +		if (*c->pcap) {
> +			uint32_t sum = proto_ipv6_header_checksum(ip6h, IPPROTO_TCP);
> +
> +			th->check = csum(th, optlen + sizeof(struct tcphdr), sum);
> +		}
> +	}
> +
> +	pcap((void *)eh, tlen);
> +
> +	tlen += vnet_hdrlen;
> +	vu_queue_fill(vdev, vq, elem, tlen, 0);
> +	nb_ack = 1;
> +
> +	if (flags & DUP_ACK) {
> +		elem = vu_queue_pop(vdev, vq, sizeof(VuVirtqElement), buffer);
> +		if (elem) {
> +			if (elem->in_num < 1 || elem->in_sg[0].iov_len < tlen) {
> +				vu_queue_rewind(vdev, vq, 1);
> +			} else {
> +				memcpy(elem->in_sg[0].iov_base, vh, tlen);
> +				nb_ack++;
> +			}
> +		}
> +	}
> +
> +	vu_queue_flush(vdev, vq, nb_ack);
> +	vu_queue_notify(vdev, vq);
> +
> +	return 0;
> +}
> +
> +int tcp_vu_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
> +{
> +	uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap;
> +	uint32_t already_sent;
> +	VuDev *vdev = (VuDev *)&c->vdev;
> +	VuVirtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
> +	int s = conn->sock, v4 = CONN_V4(conn);
> +	int i, ret = 0, iov_count, iov_used;
> +	struct msghdr mh_sock = { 0 };
> +	size_t l2_hdrlen, vnet_hdrlen, fillsize;
> +	ssize_t len;
> +	uint16_t *check;
> +	uint16_t mss = MSS_GET(conn);
> +	int num_buffers;
> +	int segment_size;
> +	struct iovec *first;
> +	bool has_mrg_rxbuf;
> +
> +	if (!vu_queue_enabled(vq) || !vu_queue_started(vq)) {
> +		err("Got packet, but no available descriptors on RX virtq.");
> +		return 0;
> +	}
> +
> +	already_sent = conn->seq_to_tap - conn->seq_ack_from_tap;
> +
> +	if (SEQ_LT(already_sent, 0)) {
> +		/* RFC 761, section 2.1. */
> +		flow_trace(conn, "ACK sequence gap: ACK for %u, sent: %u",
> +			   conn->seq_ack_from_tap, conn->seq_to_tap);
> +		conn->seq_to_tap = conn->seq_ack_from_tap;
> +		already_sent = 0;
> +	}
> +
> +	if (!wnd_scaled || already_sent >= wnd_scaled) {
> +		conn_flag(c, conn, STALLED);
> +		conn_flag(c, conn, ACK_FROM_TAP_DUE);
> +		return 0;
> +	}
> +
> +	/* Set up buffer descriptors we'll fill completely and partially. */
> +
> +	fillsize = wnd_scaled;
> +
> +	iov_vu[0].iov_base = tcp_buf_discard;
> +	iov_vu[0].iov_len = already_sent;
> +	fillsize -= already_sent;
> +
> +	has_mrg_rxbuf = vu_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF);
> +	if (has_mrg_rxbuf) {
> +		vnet_hdrlen = sizeof(struct virtio_net_hdr_mrg_rxbuf);
> +	} else {
> +		vnet_hdrlen = sizeof(struct virtio_net_hdr);
> +	}

passt style (unlike qemu) does not put braces on 1-line blocks.

> +	l2_hdrlen = vnet_hdrlen + sizeof(struct ethhdr) + sizeof(struct tcphdr);

That seems like a misleading variable name.  The ethernet headers are
certainly L2.  Including the lower level headers in L2 is reasonable,
but the IP and TCP headers are L3 and L4 headers respectively.

> +	if (v4) {
> +		l2_hdrlen += sizeof(struct iphdr);
> +	} else {
> +		l2_hdrlen += sizeof(struct ipv6hdr);
> +	}
> +
> +	iov_count = 0;
> +	segment_size = 0;
> +	while (fillsize > 0 && iov_count < VIRTQUEUE_MAX_SIZE - 1) {
> +		VuVirtqElement *elem;
> +
> +		elem = vu_queue_pop(vdev, vq, sizeof(VuVirtqElement), buffer);
> +		if (!elem)
> +			break;
> +
> +		if (elem->in_num < 1) {
> +			err("virtio-net receive queue contains no in buffers");
> +			goto err;
> +		}
> +
> +		ASSERT(elem->in_num == 1);
> +		ASSERT(elem->in_sg[0].iov_len >= l2_hdrlen);
> +
> +		indexes[iov_count] = elem->index;
> +
> +		if (segment_size == 0) {
> +			iov_vu[iov_count + 1].iov_base =
> +					(char *)elem->in_sg[0].iov_base + l2_hdrlen;
> +			iov_vu[iov_count + 1].iov_len =
> +					elem->in_sg[0].iov_len - l2_hdrlen;
> +		} else {
> +			iov_vu[iov_count + 1].iov_base = elem->in_sg[0].iov_base;
> +			iov_vu[iov_count + 1].iov_len = elem->in_sg[0].iov_len;
> +		}
> +
> +		if (iov_vu[iov_count + 1].iov_len > fillsize)
> +			 iov_vu[iov_count + 1].iov_len = fillsize;
> +
> +		segment_size += iov_vu[iov_count + 1].iov_len;
> +		if (!has_mrg_rxbuf) {
> +			segment_size = 0;
> +		} else if (segment_size >= mss) {
> +			iov_vu[iov_count + 1].iov_len -= segment_size - mss;
> +			segment_size = 0;
> +		}
> +		fillsize -= iov_vu[iov_count + 1].iov_len;
> +
> +		iov_count++;
> +	}
> +	if (iov_count == 0)
> +		return 0;
> +
> +	mh_sock.msg_iov = iov_vu;
> +	mh_sock.msg_iovlen = iov_count + 1;
> +
> +	do
> +		len = recvmsg(s, &mh_sock, MSG_PEEK);
> +	while (len < 0 && errno == EINTR);
> +
> +	if (len < 0)
> +		goto err;
> +
> +	if (!len) {
> +		vu_queue_rewind(vdev, vq, iov_count);
> +		if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) == SOCK_FIN_RCVD) {
> +			if ((ret = tcp_vu_send_flag(c, conn, FIN | ACK))) {
> +				tcp_rst(c, conn);
> +				return ret;
> +			}
> +
> +			conn_event(c, conn, TAP_FIN_SENT);
> +		}
> +
> +		return 0;
> +	}
> +
> +	len -= already_sent;
> +	if (len <= 0) {
> +		conn_flag(c, conn, STALLED);
> +		vu_queue_rewind(vdev, vq, iov_count);
> +		return 0;
> +	}
> +
> +	conn_flag(c, conn, ~STALLED);
> +
> +	/* Likely, some new data was acked too. */
> +	tcp_update_seqack_wnd(c, conn, 0, NULL);
> +
> +	/* initialize headers */
> +	iov_used = 0;
> +	num_buffers = 0;
> +	check = NULL;
> +	segment_size = 0;
> +	for (i = 0; i < iov_count && len; i++) {
> +
> +		if (segment_size == 0)
> +			first = &iov_vu[i + 1];
> +
> +		if (iov_vu[i + 1].iov_len > (size_t)len)
> +			iov_vu[i + 1].iov_len = len;
> +
> +		len -= iov_vu[i + 1].iov_len;
> +		iov_used++;
> +
> +		segment_size += iov_vu[i + 1].iov_len;
> +		num_buffers++;
> +
> +		if (segment_size >= mss || len == 0 ||
> +		    i + 1 == iov_count || !has_mrg_rxbuf) {
> +
> +			struct ethhdr *eh;
> +			struct virtio_net_hdr_mrg_rxbuf *vh;
> +			char *base = (char *)first->iov_base - l2_hdrlen;
> +			size_t size = first->iov_len + l2_hdrlen;
> +
> +			vh = (struct virtio_net_hdr_mrg_rxbuf *)base;
> +
> +			vh->hdr = vu_header;
> +			if (has_mrg_rxbuf)
> +				vh->num_buffers = htole16(num_buffers);
> +
> +			eh = (struct ethhdr *)((char *)base + vnet_hdrlen);
> +
> +			memcpy(eh->h_dest, c->mac_guest, sizeof(eh->h_dest));
> +			memcpy(eh->h_source, c->mac, sizeof(eh->h_source));
> +
> +			/* initialize header */
> +			if (v4) {
> +				struct iphdr *iph = (struct iphdr *)(eh + 1);
> +				struct tcphdr *th = (struct tcphdr *)(iph + 1);
> +
> +				eh->h_proto = htons(ETH_P_IP);
> +
> +				*th = (struct tcphdr){
> +					.doff = sizeof(struct tcphdr) / 4,
> +					.ack = 1
> +				};
> +
> +				*iph = (struct iphdr)L2_BUF_IP4_INIT(IPPROTO_TCP);
> +
> +				ipv4_fill_headers(c, conn, iph, segment_size,
> +						len ? check : NULL, conn->seq_to_tap);
> +
> +				if (*c->pcap) {
> +					uint32_t sum = proto_ipv4_header_checksum(iph, IPPROTO_TCP);
> +
> +					first->iov_base = th;
> +					first->iov_len = size - l2_hdrlen + sizeof(*th);
> +
> +					th->check = csum_iov(first, num_buffers, sum);
> +				}
> +
> +				check = &iph->check;
> +			} else {
> +				struct ipv6hdr *ip6h = (struct ipv6hdr *)(eh + 1);
> +				struct tcphdr *th = (struct tcphdr *)(ip6h + 1);
> +
> +				eh->h_proto = htons(ETH_P_IPV6);
> +
> +				*th = (struct tcphdr){
> +					.doff = sizeof(struct tcphdr) / 4,
> +					.ack = 1
> +				};
> +
> +				*ip6h = (struct ipv6hdr)L2_BUF_IP6_INIT(IPPROTO_TCP);
> +
> +				ipv6_fill_headers(c, conn, ip6h, segment_size,
> +						conn->seq_to_tap);
> +				if (*c->pcap) {
> +					uint32_t sum = proto_ipv6_header_checksum(ip6h, IPPROTO_TCP);
> +
> +					first->iov_base = th;
> +					first->iov_len = size - l2_hdrlen + sizeof(*th);
> +
> +					th->check = csum_iov(first, num_buffers, sum);
> +				}
> +			}
> +
> +			/* set iov for pcap logging */
> +			first->iov_base = eh;
> +			first->iov_len = size - vnet_hdrlen;
> +
> +			pcap_iov(first, num_buffers);
> +
> +			/* set iov_len for vu_queue_fill_by_index(); */
> +
> +			first->iov_base = base;
> +			first->iov_len = size;
> +
> +			conn->seq_to_tap += segment_size;
> +
> +			segment_size = 0;
> +			num_buffers = 0;
> +		}
> +	}
> +
> +	/* release unused buffers */
> +	vu_queue_rewind(vdev, vq, iov_count - iov_used);
> +
> +	/* send packets */
> +	for (i = 0; i < iov_used; i++) {
> +		vu_queue_fill_by_index(vdev, vq, indexes[i],
> +				       iov_vu[i + 1].iov_len, i);
> +	}
> +
> +	vu_queue_flush(vdev, vq, iov_used);
> +	vu_queue_notify(vdev, vq);
> +
> +	conn_flag(c, conn, ACK_FROM_TAP_DUE);
> +
> +	return 0;
> +err:
> +	vu_queue_rewind(vdev, vq, iov_count);
> +
> +	if (errno != EAGAIN && errno != EWOULDBLOCK) {
> +		ret = -errno;
> +		tcp_rst(c, conn);
> +	}
> +
> +	return ret;
> +}
> diff --git a/tcp_vu.h b/tcp_vu.h
> new file mode 100644
> index 000000000000..8045a6e3edb8
> --- /dev/null
> +++ b/tcp_vu.h
> @@ -0,0 +1,10 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later
> +
> +#ifndef TCP_VU_H
> +#define TCP_VU_H
> +
> +uint16_t tcp_vu_conn_tap_mss(const struct tcp_tap_conn *conn);
> +int tcp_vu_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags);
> +int tcp_vu_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn);
> +
> +#endif  /*TCP_VU_H */

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

next prev parent reply	other threads:[~2024-02-09  5:01 UTC|newest]

Thread overview: 83+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-02-02 14:11 [PATCH 00/24] Add vhost-user support to passt Laurent Vivier
2024-02-02 14:11 ` [PATCH 01/24] iov: add some functions to manage iovec Laurent Vivier
2024-02-05  5:57   ` David Gibson
2024-02-06 14:28     ` Laurent Vivier
2024-02-07  1:01       ` David Gibson
2024-02-07 10:00         ` Laurent Vivier
2024-02-06 16:10   ` Stefano Brivio
2024-02-07 14:02     ` Laurent Vivier
2024-02-07 14:57       ` Stefano Brivio
2024-02-02 14:11 ` [PATCH 02/24] pcap: add pcap_iov() Laurent Vivier
2024-02-05  6:25   ` David Gibson
2024-02-06 16:10   ` Stefano Brivio
2024-02-02 14:11 ` [PATCH 03/24] checksum: align buffers Laurent Vivier
2024-02-05  6:02   ` David Gibson
2024-02-07  9:01     ` Stefano Brivio
2024-02-02 14:11 ` [PATCH 04/24] checksum: add csum_iov() Laurent Vivier
2024-02-05  6:07   ` David Gibson
2024-02-07  9:02   ` Stefano Brivio
2024-02-02 14:11 ` [PATCH 05/24] util: move IP stuff from util.[ch] to ip.[ch] Laurent Vivier
2024-02-05  6:13   ` David Gibson
2024-02-07  9:03     ` Stefano Brivio
2024-02-08  0:04       ` David Gibson
2024-02-02 14:11 ` [PATCH 06/24] ip: move duplicate IPv4 checksum function to ip.h Laurent Vivier
2024-02-05  6:16   ` David Gibson
2024-02-07 10:40   ` Stefano Brivio
2024-02-07 23:43     ` David Gibson
2024-02-02 14:11 ` [PATCH 07/24] ip: introduce functions to compute the header part checksum for TCP/UDP Laurent Vivier
2024-02-05  6:20   ` David Gibson
2024-02-07 10:41   ` Stefano Brivio
2024-02-02 14:11 ` [PATCH 08/24] tcp: extract buffer management from tcp_send_flag() Laurent Vivier
2024-02-06  0:24   ` David Gibson
2024-02-08 16:57   ` Stefano Brivio
2024-02-02 14:11 ` [PATCH 09/24] tcp: extract buffer management from tcp_conn_tap_mss() Laurent Vivier
2024-02-06  0:47   ` David Gibson
2024-02-08 16:59   ` Stefano Brivio
2024-02-02 14:11 ` [PATCH 10/24] tcp: rename functions that manage buffers Laurent Vivier
2024-02-06  1:48   ` David Gibson
2024-02-08 17:10     ` Stefano Brivio
2024-02-02 14:11 ` [PATCH 11/24] tcp: move buffers management functions to their own file Laurent Vivier
2024-02-02 14:11 ` [PATCH 12/24] tap: make tap_update_mac() generic Laurent Vivier
2024-02-06  1:49   ` David Gibson
2024-02-08 17:10     ` Stefano Brivio
2024-02-09  5:02       ` David Gibson
2024-02-02 14:11 ` [PATCH 13/24] tap: export pool_flush()/tapX_handler()/packet_add() Laurent Vivier
2024-02-02 14:29   ` Laurent Vivier
2024-02-06  1:52   ` David Gibson
2024-02-11 23:15   ` Stefano Brivio
2024-02-12  2:22     ` David Gibson
2024-02-02 14:11 ` [PATCH 14/24] udp: move udpX_l2_buf_t and udpX_l2_mh_sock out of udp_update_hdrX() Laurent Vivier
2024-02-06  1:59   ` David Gibson
2024-02-11 23:16   ` Stefano Brivio
2024-02-02 14:11 ` [PATCH 15/24] udp: rename udp_sock_handler() to udp_buf_sock_handler() Laurent Vivier
2024-02-06  2:14   ` David Gibson
2024-02-11 23:17     ` Stefano Brivio
2024-02-02 14:11 ` [PATCH 16/24] packet: replace struct desc by struct iovec Laurent Vivier
2024-02-06  2:25   ` David Gibson
2024-02-11 23:18     ` Stefano Brivio
2024-02-02 14:11 ` [PATCH 17/24] vhost-user: compare mode MODE_PASTA and not MODE_PASST Laurent Vivier
2024-02-06  2:29   ` David Gibson
2024-02-02 14:11 ` [PATCH 18/24] vhost-user: introduce virtio API Laurent Vivier
2024-02-06  3:51   ` David Gibson
2024-02-11 23:18     ` Stefano Brivio
2024-02-12  2:26       ` David Gibson
2024-02-02 14:11 ` [PATCH 19/24] vhost-user: introduce vhost-user API Laurent Vivier
2024-02-07  2:13   ` David Gibson
2024-02-02 14:11 ` [PATCH 20/24] vhost-user: add vhost-user Laurent Vivier
2024-02-07  2:40   ` David Gibson
2024-02-11 23:19     ` Stefano Brivio
2024-02-12  2:47       ` David Gibson
2024-02-13 15:22         ` Stefano Brivio
2024-02-14  2:05           ` David Gibson
2024-02-11 23:19   ` Stefano Brivio
2024-02-12  2:49     ` David Gibson
2024-02-12 10:02       ` Laurent Vivier
2024-02-12 16:56         ` Stefano Brivio
2024-02-02 14:11 ` [PATCH 21/24] vhost-user: use guest buffer directly in vu_handle_tx() Laurent Vivier
2024-02-09  4:26   ` David Gibson
2024-02-02 14:11 ` [PATCH 22/24] tcp: vhost-user RX nocopy Laurent Vivier
2024-02-09  4:57   ` David Gibson [this message]
2024-02-02 14:11 ` [PATCH 23/24] udp: " Laurent Vivier
2024-02-09  5:00   ` David Gibson
2024-02-02 14:11 ` [PATCH 24/24] vhost-user: remove tap_send_frames_vu() Laurent Vivier
2024-02-09  5:01   ` David Gibson

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=ZcWwz80xHORGbY5F@zatzit \
    --to=david@gibson.dropbear.id.au \
    --cc=lvivier@redhat.com \
    --cc=passt-dev@passt.top \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

Code repositories for project(s) associated with this public inbox

	https://passt.top/passt

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for IMAP folder(s).