public inbox for passt-dev@passt.top
 help / color / mirror / code / Atom feed
From: Laurent Vivier <lvivier@redhat.com>
To: passt-dev@passt.top
Cc: Laurent Vivier <lvivier@redhat.com>
Subject: [PATCH 22/24] tcp: vhost-user RX nocopy
Date: Fri,  2 Feb 2024 15:11:49 +0100	[thread overview]
Message-ID: <20240202141151.3762941-23-lvivier@redhat.com> (raw)
In-Reply-To: <20240202141151.3762941-1-lvivier@redhat.com>

Signed-off-by: Laurent Vivier <lvivier@redhat.com>
---
 Makefile |   6 +-
 tcp.c    |  66 +++++---
 tcp_vu.c | 447 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 tcp_vu.h |  10 ++
 4 files changed, 502 insertions(+), 27 deletions(-)
 create mode 100644 tcp_vu.c
 create mode 100644 tcp_vu.h

diff --git a/Makefile b/Makefile
index 2016b071ddf2..f7a403d19b61 100644
--- a/Makefile
+++ b/Makefile
@@ -47,7 +47,7 @@ FLAGS += -DDUAL_STACK_SOCKETS=$(DUAL_STACK_SOCKETS)
 PASST_SRCS = arch.c arp.c checksum.c conf.c dhcp.c dhcpv6.c flow.c icmp.c \
 	igmp.c isolation.c lineread.c log.c mld.c ndp.c netlink.c packet.c \
 	passt.c pasta.c pcap.c pif.c port_fwd.c tap.c tcp.c tcp_splice.c \
-	tcp_buf.c udp.c util.c iov.c ip.c virtio.c vhost_user.c
+	tcp_buf.c tcp_vu.c udp.c util.c iov.c ip.c virtio.c vhost_user.c
 QRAP_SRCS = qrap.c
 SRCS = $(PASST_SRCS) $(QRAP_SRCS)
 
@@ -56,8 +56,8 @@ MANPAGES = passt.1 pasta.1 qrap.1
 PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h flow.h \
 	flow_table.h icmp.h inany.h isolation.h lineread.h log.h ndp.h \
 	netlink.h packet.h passt.h pasta.h pcap.h pif.h port_fwd.h siphash.h \
-	tap.h tcp.h tcp_conn.h tcp_splice.h tcp_buf.h tcp_internal.h udp.h \
-	util.h iov.h ip.h virtio.h vhost_user.h
+	tap.h tcp.h tcp_conn.h tcp_splice.h tcp_buf.h tcp_vu.h tcp_internal.h \
+	udp.h util.h iov.h ip.h virtio.h vhost_user.h
 HEADERS = $(PASST_HEADERS) seccomp.h
 
 C := \#include <linux/tcp.h>\nstruct tcp_info x = { .tcpi_snd_wnd = 0 };
diff --git a/tcp.c b/tcp.c
index b6aca9f37f19..e829e12fe7c2 100644
--- a/tcp.c
+++ b/tcp.c
@@ -302,6 +302,7 @@
 #include "flow_table.h"
 #include "tcp_internal.h"
 #include "tcp_buf.h"
+#include "tcp_vu.h"
 
 /* Sides of a flow as we use them in "tap" connections */
 #define	SOCKSIDE	0
@@ -1034,7 +1035,7 @@ size_t ipv4_fill_headers(const struct ctx *c,
 	tcp_set_tcp_header(th, conn, seq);
 
 	th->check = 0;
-	if (c->mode != MODE_VU || *c->pcap)
+	if (c->mode != MODE_VU)
 		th->check = tcp_update_check_tcp4(iph);
 
 	return ip_len;
@@ -1072,7 +1073,7 @@ size_t ipv6_fill_headers(const struct ctx *c,
 	tcp_set_tcp_header(th, conn, seq);
 
 	th->check = 0;
-	if (c->mode != MODE_VU || *c->pcap)
+	if (c->mode != MODE_VU)
 		th->check = tcp_update_check_tcp6(ip6h);
 
 	ip6h->hop_limit = 255;
@@ -1302,6 +1303,12 @@ int do_tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags,
 	return 1;
 }
 
+int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
+{
+	if (c->mode == MODE_VU)
+		return tcp_vu_send_flag(c, conn, flags);
+	return tcp_buf_send_flag(c, conn, flags);
+}
 
 /**
  * tcp_rst_do() - Reset a tap connection: send RST segment to tap, close socket
@@ -1313,7 +1320,7 @@ void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn)
 	if (conn->events == CLOSED)
 		return;
 
-	if (!tcp_buf_send_flag(c, conn, RST))
+	if (!tcp_send_flag(c, conn, RST))
 		conn_event(c, conn, CLOSED);
 }
 
@@ -1430,7 +1437,8 @@ int tcp_conn_new_sock(const struct ctx *c, sa_family_t af)
  *
  * Return: clamped MSS value
  */
-static uint16_t tcp_conn_tap_mss(const struct tcp_tap_conn *conn,
+static uint16_t tcp_conn_tap_mss(const struct ctx *c,
+				 const struct tcp_tap_conn *conn,
 				 const char *opts, size_t optlen)
 {
 	unsigned int mss;
@@ -1441,7 +1449,10 @@ static uint16_t tcp_conn_tap_mss(const struct tcp_tap_conn *conn,
 	else
 		mss = ret;
 
-	mss = MIN(tcp_buf_conn_tap_mss(conn), mss);
+	if (c->mode == MODE_VU)
+		mss = MIN(tcp_vu_conn_tap_mss(conn), mss);
+	else
+		mss = MIN(tcp_buf_conn_tap_mss(conn), mss);
 
 	return MIN(mss, USHRT_MAX);
 }
@@ -1568,7 +1579,7 @@ static void tcp_conn_from_tap(struct ctx *c,
 
 	conn->wnd_to_tap = WINDOW_DEFAULT;
 
-	mss = tcp_conn_tap_mss(conn, opts, optlen);
+	mss = tcp_conn_tap_mss(c, conn, opts, optlen);
 	if (setsockopt(s, SOL_TCP, TCP_MAXSEG, &mss, sizeof(mss)))
 		flow_trace(conn, "failed to set TCP_MAXSEG on socket %i", s);
 	MSS_SET(conn, mss);
@@ -1625,7 +1636,7 @@ static void tcp_conn_from_tap(struct ctx *c,
 	} else {
 		tcp_get_sndbuf(conn);
 
-		if (tcp_buf_send_flag(c, conn, SYN | ACK))
+		if (tcp_send_flag(c, conn, SYN | ACK))
 			return;
 
 		conn_event(c, conn, TAP_SYN_ACK_SENT);
@@ -1673,6 +1684,13 @@ static int tcp_sock_consume(const struct tcp_tap_conn *conn, uint32_t ack_seq)
 	return 0;
 }
 
+static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
+{
+	if (c->mode == MODE_VU)
+		return tcp_vu_data_from_sock(c, conn);
+
+	return tcp_buf_data_from_sock(c, conn);
+}
 
 /**
  * tcp_data_from_tap() - tap/guest data for established connection
@@ -1806,7 +1824,7 @@ static int tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn,
 			   max_ack_seq, conn->seq_to_tap);
 		conn->seq_ack_from_tap = max_ack_seq;
 		conn->seq_to_tap = max_ack_seq;
-		tcp_buf_data_from_sock(c, conn);
+		tcp_data_from_sock(c, conn);
 	}
 
 	if (!iov_i)
@@ -1822,14 +1840,14 @@ eintr:
 			 *   Then swiftly looked away and left.
 			 */
 			conn->seq_from_tap = seq_from_tap;
-			tcp_buf_send_flag(c, conn, ACK);
+			tcp_send_flag(c, conn, ACK);
 		}
 
 		if (errno == EINTR)
 			goto eintr;
 
 		if (errno == EAGAIN || errno == EWOULDBLOCK) {
-			tcp_buf_send_flag(c, conn, ACK_IF_NEEDED);
+			tcp_send_flag(c, conn, ACK_IF_NEEDED);
 			return p->count - idx;
 
 		}
@@ -1839,7 +1857,7 @@ eintr:
 	if (n < (int)(seq_from_tap - conn->seq_from_tap)) {
 		partial_send = 1;
 		conn->seq_from_tap += n;
-		tcp_buf_send_flag(c, conn, ACK_IF_NEEDED);
+		tcp_send_flag(c, conn, ACK_IF_NEEDED);
 	} else {
 		conn->seq_from_tap += n;
 	}
@@ -1852,7 +1870,7 @@ out:
 		 */
 		if (conn->seq_dup_ack_approx != (conn->seq_from_tap & 0xff)) {
 			conn->seq_dup_ack_approx = conn->seq_from_tap & 0xff;
-			tcp_buf_send_flag(c, conn, DUP_ACK);
+			tcp_send_flag(c, conn, DUP_ACK);
 		}
 		return p->count - idx;
 	}
@@ -1866,7 +1884,7 @@ out:
 
 		conn_event(c, conn, TAP_FIN_RCVD);
 	} else {
-		tcp_buf_send_flag(c, conn, ACK_IF_NEEDED);
+		tcp_send_flag(c, conn, ACK_IF_NEEDED);
 	}
 
 	return p->count - idx;
@@ -1891,7 +1909,7 @@ static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_tap_conn *conn,
 	if (!(conn->wnd_from_tap >>= conn->ws_from_tap))
 		conn->wnd_from_tap = 1;
 
-	MSS_SET(conn, tcp_conn_tap_mss(conn, opts, optlen));
+	MSS_SET(conn, tcp_conn_tap_mss(c, conn, opts, optlen));
 
 	conn->seq_init_from_tap = ntohl(th->seq) + 1;
 	conn->seq_from_tap = conn->seq_init_from_tap;
@@ -1902,8 +1920,8 @@ static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_tap_conn *conn,
 	/* The client might have sent data already, which we didn't
 	 * dequeue waiting for SYN,ACK from tap -- check now.
 	 */
-	tcp_buf_data_from_sock(c, conn);
-	tcp_buf_send_flag(c, conn, ACK);
+	tcp_data_from_sock(c, conn);
+	tcp_send_flag(c, conn, ACK);
 }
 
 /**
@@ -1983,7 +2001,7 @@ int tcp_tap_handler(struct ctx *c, uint8_t pif, int af,
 			conn->seq_from_tap++;
 
 			shutdown(conn->sock, SHUT_WR);
-			tcp_buf_send_flag(c, conn, ACK);
+			tcp_send_flag(c, conn, ACK);
 			conn_event(c, conn, SOCK_FIN_SENT);
 
 			return 1;
@@ -1994,7 +2012,7 @@ int tcp_tap_handler(struct ctx *c, uint8_t pif, int af,
 
 		tcp_tap_window_update(conn, ntohs(th->window));
 
-		tcp_buf_data_from_sock(c, conn);
+		tcp_data_from_sock(c, conn);
 
 		if (p->count - idx == 1)
 			return 1;
@@ -2024,7 +2042,7 @@ int tcp_tap_handler(struct ctx *c, uint8_t pif, int af,
 	if ((conn->events & TAP_FIN_RCVD) && !(conn->events & SOCK_FIN_SENT)) {
 		shutdown(conn->sock, SHUT_WR);
 		conn_event(c, conn, SOCK_FIN_SENT);
-		tcp_buf_send_flag(c, conn, ACK);
+		tcp_send_flag(c, conn, ACK);
 		ack_due = 0;
 	}
 
@@ -2058,7 +2076,7 @@ static void tcp_connect_finish(struct ctx *c, struct tcp_tap_conn *conn)
 		return;
 	}
 
-	if (tcp_buf_send_flag(c, conn, SYN | ACK))
+	if (tcp_send_flag(c, conn, SYN | ACK))
 		return;
 
 	conn_event(c, conn, TAP_SYN_ACK_SENT);
@@ -2126,7 +2144,7 @@ static void tcp_tap_conn_from_sock(struct ctx *c,
 
 	conn->wnd_from_tap = WINDOW_DEFAULT;
 
-	tcp_buf_send_flag(c, conn, SYN);
+	tcp_send_flag(c, conn, SYN);
 	conn_flag(c, conn, ACK_FROM_TAP_DUE);
 
 	tcp_get_sndbuf(conn);
@@ -2190,7 +2208,7 @@ void tcp_timer_handler(struct ctx *c, union epoll_ref ref)
 		return;
 
 	if (conn->flags & ACK_TO_TAP_DUE) {
-		tcp_buf_send_flag(c, conn, ACK_IF_NEEDED);
+		tcp_send_flag(c, conn, ACK_IF_NEEDED);
 		tcp_timer_ctl(c, conn);
 	} else if (conn->flags & ACK_FROM_TAP_DUE) {
 		if (!(conn->events & ESTABLISHED)) {
@@ -2206,7 +2224,7 @@ void tcp_timer_handler(struct ctx *c, union epoll_ref ref)
 			flow_dbg(conn, "ACK timeout, retry");
 			conn->retrans++;
 			conn->seq_to_tap = conn->seq_ack_from_tap;
-			tcp_buf_data_from_sock(c, conn);
+			tcp_data_from_sock(c, conn);
 			tcp_timer_ctl(c, conn);
 		}
 	} else {
@@ -2261,7 +2279,7 @@ void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events)
 			conn_event(c, conn, SOCK_FIN_RCVD);
 
 		if (events & EPOLLIN)
-			tcp_buf_data_from_sock(c, conn);
+			tcp_data_from_sock(c, conn);
 
 		if (events & EPOLLOUT)
 			tcp_update_seqack_wnd(c, conn, 0, NULL);
diff --git a/tcp_vu.c b/tcp_vu.c
new file mode 100644
index 000000000000..ed59b21cabdc
--- /dev/null
+++ b/tcp_vu.c
@@ -0,0 +1,447 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <errno.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include <netinet/ip.h>
+
+#include <sys/socket.h>
+
+#include <linux/tcp.h>
+#include <linux/virtio_net.h>
+
+#include "util.h"
+#include "ip.h"
+#include "passt.h"
+#include "siphash.h"
+#include "inany.h"
+#include "vhost_user.h"
+#include "tcp.h"
+#include "pcap.h"
+#include "flow.h"
+#include "tcp_conn.h"
+#include "flow_table.h"
+#include "tcp_vu.h"
+#include "tcp_internal.h"
+#include "checksum.h"
+
+#define CONN_V4(conn)		(!!inany_v4(&(conn)->faddr))
+#define CONN_V6(conn)		(!CONN_V4(conn))
+
+/* vhost-user */
+static const struct virtio_net_hdr vu_header = {
+	.flags = VIRTIO_NET_HDR_F_DATA_VALID,
+	.gso_type = VIRTIO_NET_HDR_GSO_NONE,
+};
+
+static unsigned char buffer[65536];
+static struct iovec	iov_vu			[VIRTQUEUE_MAX_SIZE];
+static unsigned int	indexes			[VIRTQUEUE_MAX_SIZE];
+
+uint16_t tcp_vu_conn_tap_mss(const struct tcp_tap_conn *conn)
+{
+	(void)conn;
+	return USHRT_MAX;
+}
+
+int tcp_vu_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
+{
+	VuDev *vdev = (VuDev *)&c->vdev;
+	VuVirtqElement *elem;
+	VuVirtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
+	struct virtio_net_hdr_mrg_rxbuf *vh;
+	size_t tlen, vnet_hdrlen, ip_len, optlen = 0;
+	struct ethhdr *eh;
+	int ret;
+	int nb_ack;
+
+	elem = vu_queue_pop(vdev, vq, sizeof(VuVirtqElement), buffer);
+	if (!elem)
+		return 0;
+
+	if (elem->in_num < 1) {
+		err("virtio-net receive queue contains no in buffers");
+		vu_queue_rewind(vdev, vq, 1);
+		return 0;
+	}
+
+	/* Options: MSS, NOP and window scale (8 bytes) */
+	if (flags & SYN)
+		optlen = OPT_MSS_LEN + 1 + OPT_WS_LEN;
+
+	vh = elem->in_sg[0].iov_base;
+
+	vh->hdr = vu_header;
+	if (vu_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF)) {
+		vnet_hdrlen = sizeof(struct virtio_net_hdr_mrg_rxbuf);
+		vh->num_buffers = htole16(1);
+	} else {
+		vnet_hdrlen = sizeof(struct virtio_net_hdr);
+	}
+	eh = (struct ethhdr *)((char *)elem->in_sg[0].iov_base + vnet_hdrlen);
+
+	memcpy(eh->h_dest, c->mac_guest, sizeof(eh->h_dest));
+	memcpy(eh->h_source, c->mac, sizeof(eh->h_source));
+
+	if (CONN_V4(conn)) {
+		struct iphdr *iph = (struct iphdr *)(eh + 1);
+		struct tcphdr *th = (struct tcphdr *)(iph + 1);
+		char *data = (char *)(th + 1);
+
+		eh->h_proto = htons(ETH_P_IP);
+
+		*th = (struct tcphdr){
+			.doff = sizeof(struct tcphdr) / 4,
+			.ack = 1
+		};
+
+		*iph = (struct iphdr)L2_BUF_IP4_INIT(IPPROTO_TCP);
+
+		ret = do_tcp_send_flag(c, conn, flags, th, data, optlen);
+		if (ret <= 0) {
+			vu_queue_rewind(vdev, vq, 1);
+			return ret;
+		}
+
+		ip_len = ipv4_fill_headers(c, conn, iph, optlen, NULL,
+					   conn->seq_to_tap);
+
+		tlen =  ip_len + sizeof(struct ethhdr);
+
+		if (*c->pcap) {
+			uint32_t sum = proto_ipv4_header_checksum(iph, IPPROTO_TCP);
+
+			th->check = csum(th, optlen + sizeof(struct tcphdr), sum);
+		}
+	} else {
+		struct ipv6hdr *ip6h = (struct ipv6hdr *)(eh + 1);
+		struct tcphdr *th = (struct tcphdr *)(ip6h + 1);
+		char *data = (char *)(th + 1);
+
+		eh->h_proto = htons(ETH_P_IPV6);
+
+		*th = (struct tcphdr){
+			.doff = sizeof(struct tcphdr) / 4,
+			.ack = 1
+		};
+
+		*ip6h = (struct ipv6hdr)L2_BUF_IP6_INIT(IPPROTO_TCP);
+
+		ret = do_tcp_send_flag(c, conn, flags, th, data, optlen);
+		if (ret <= 0) {
+			vu_queue_rewind(vdev, vq, 1);
+			return ret;
+		}
+
+		ip_len = ipv6_fill_headers(c, conn, ip6h, optlen,
+					   conn->seq_to_tap);
+
+		tlen =  ip_len + sizeof(struct ethhdr);
+
+		if (*c->pcap) {
+			uint32_t sum = proto_ipv6_header_checksum(ip6h, IPPROTO_TCP);
+
+			th->check = csum(th, optlen + sizeof(struct tcphdr), sum);
+		}
+	}
+
+	pcap((void *)eh, tlen);
+
+	tlen += vnet_hdrlen;
+	vu_queue_fill(vdev, vq, elem, tlen, 0);
+	nb_ack = 1;
+
+	if (flags & DUP_ACK) {
+		elem = vu_queue_pop(vdev, vq, sizeof(VuVirtqElement), buffer);
+		if (elem) {
+			if (elem->in_num < 1 || elem->in_sg[0].iov_len < tlen) {
+				vu_queue_rewind(vdev, vq, 1);
+			} else {
+				memcpy(elem->in_sg[0].iov_base, vh, tlen);
+				nb_ack++;
+			}
+		}
+	}
+
+	vu_queue_flush(vdev, vq, nb_ack);
+	vu_queue_notify(vdev, vq);
+
+	return 0;
+}
+
+int tcp_vu_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
+{
+	uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap;
+	uint32_t already_sent;
+	VuDev *vdev = (VuDev *)&c->vdev;
+	VuVirtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
+	int s = conn->sock, v4 = CONN_V4(conn);
+	int i, ret = 0, iov_count, iov_used;
+	struct msghdr mh_sock = { 0 };
+	size_t l2_hdrlen, vnet_hdrlen, fillsize;
+	ssize_t len;
+	uint16_t *check;
+	uint16_t mss = MSS_GET(conn);
+	int num_buffers;
+	int segment_size;
+	struct iovec *first;
+	bool has_mrg_rxbuf;
+
+	if (!vu_queue_enabled(vq) || !vu_queue_started(vq)) {
+		err("Got packet, but no available descriptors on RX virtq.");
+		return 0;
+	}
+
+	already_sent = conn->seq_to_tap - conn->seq_ack_from_tap;
+
+	if (SEQ_LT(already_sent, 0)) {
+		/* RFC 761, section 2.1. */
+		flow_trace(conn, "ACK sequence gap: ACK for %u, sent: %u",
+			   conn->seq_ack_from_tap, conn->seq_to_tap);
+		conn->seq_to_tap = conn->seq_ack_from_tap;
+		already_sent = 0;
+	}
+
+	if (!wnd_scaled || already_sent >= wnd_scaled) {
+		conn_flag(c, conn, STALLED);
+		conn_flag(c, conn, ACK_FROM_TAP_DUE);
+		return 0;
+	}
+
+	/* Set up buffer descriptors we'll fill completely and partially. */
+
+	fillsize = wnd_scaled;
+
+	iov_vu[0].iov_base = tcp_buf_discard;
+	iov_vu[0].iov_len = already_sent;
+	fillsize -= already_sent;
+
+	has_mrg_rxbuf = vu_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF);
+	if (has_mrg_rxbuf) {
+		vnet_hdrlen = sizeof(struct virtio_net_hdr_mrg_rxbuf);
+	} else {
+		vnet_hdrlen = sizeof(struct virtio_net_hdr);
+	}
+	l2_hdrlen = vnet_hdrlen + sizeof(struct ethhdr) + sizeof(struct tcphdr);
+	if (v4) {
+		l2_hdrlen += sizeof(struct iphdr);
+	} else {
+		l2_hdrlen += sizeof(struct ipv6hdr);
+	}
+
+	iov_count = 0;
+	segment_size = 0;
+	while (fillsize > 0 && iov_count < VIRTQUEUE_MAX_SIZE - 1) {
+		VuVirtqElement *elem;
+
+		elem = vu_queue_pop(vdev, vq, sizeof(VuVirtqElement), buffer);
+		if (!elem)
+			break;
+
+		if (elem->in_num < 1) {
+			err("virtio-net receive queue contains no in buffers");
+			goto err;
+		}
+
+		ASSERT(elem->in_num == 1);
+		ASSERT(elem->in_sg[0].iov_len >= l2_hdrlen);
+
+		indexes[iov_count] = elem->index;
+
+		if (segment_size == 0) {
+			iov_vu[iov_count + 1].iov_base =
+					(char *)elem->in_sg[0].iov_base + l2_hdrlen;
+			iov_vu[iov_count + 1].iov_len =
+					elem->in_sg[0].iov_len - l2_hdrlen;
+		} else {
+			iov_vu[iov_count + 1].iov_base = elem->in_sg[0].iov_base;
+			iov_vu[iov_count + 1].iov_len = elem->in_sg[0].iov_len;
+		}
+
+		if (iov_vu[iov_count + 1].iov_len > fillsize)
+			 iov_vu[iov_count + 1].iov_len = fillsize;
+
+		segment_size += iov_vu[iov_count + 1].iov_len;
+		if (!has_mrg_rxbuf) {
+			segment_size = 0;
+		} else if (segment_size >= mss) {
+			iov_vu[iov_count + 1].iov_len -= segment_size - mss;
+			segment_size = 0;
+		}
+		fillsize -= iov_vu[iov_count + 1].iov_len;
+
+		iov_count++;
+	}
+	if (iov_count == 0)
+		return 0;
+
+	mh_sock.msg_iov = iov_vu;
+	mh_sock.msg_iovlen = iov_count + 1;
+
+	do
+		len = recvmsg(s, &mh_sock, MSG_PEEK);
+	while (len < 0 && errno == EINTR);
+
+	if (len < 0)
+		goto err;
+
+	if (!len) {
+		vu_queue_rewind(vdev, vq, iov_count);
+		if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) == SOCK_FIN_RCVD) {
+			if ((ret = tcp_vu_send_flag(c, conn, FIN | ACK))) {
+				tcp_rst(c, conn);
+				return ret;
+			}
+
+			conn_event(c, conn, TAP_FIN_SENT);
+		}
+
+		return 0;
+	}
+
+	len -= already_sent;
+	if (len <= 0) {
+		conn_flag(c, conn, STALLED);
+		vu_queue_rewind(vdev, vq, iov_count);
+		return 0;
+	}
+
+	conn_flag(c, conn, ~STALLED);
+
+	/* Likely, some new data was acked too. */
+	tcp_update_seqack_wnd(c, conn, 0, NULL);
+
+	/* initialize headers */
+	iov_used = 0;
+	num_buffers = 0;
+	check = NULL;
+	segment_size = 0;
+	for (i = 0; i < iov_count && len; i++) {
+
+		if (segment_size == 0)
+			first = &iov_vu[i + 1];
+
+		if (iov_vu[i + 1].iov_len > (size_t)len)
+			iov_vu[i + 1].iov_len = len;
+
+		len -= iov_vu[i + 1].iov_len;
+		iov_used++;
+
+		segment_size += iov_vu[i + 1].iov_len;
+		num_buffers++;
+
+		if (segment_size >= mss || len == 0 ||
+		    i + 1 == iov_count || !has_mrg_rxbuf) {
+
+			struct ethhdr *eh;
+			struct virtio_net_hdr_mrg_rxbuf *vh;
+			char *base = (char *)first->iov_base - l2_hdrlen;
+			size_t size = first->iov_len + l2_hdrlen;
+
+			vh = (struct virtio_net_hdr_mrg_rxbuf *)base;
+
+			vh->hdr = vu_header;
+			if (has_mrg_rxbuf)
+				vh->num_buffers = htole16(num_buffers);
+
+			eh = (struct ethhdr *)((char *)base + vnet_hdrlen);
+
+			memcpy(eh->h_dest, c->mac_guest, sizeof(eh->h_dest));
+			memcpy(eh->h_source, c->mac, sizeof(eh->h_source));
+
+			/* initialize header */
+			if (v4) {
+				struct iphdr *iph = (struct iphdr *)(eh + 1);
+				struct tcphdr *th = (struct tcphdr *)(iph + 1);
+
+				eh->h_proto = htons(ETH_P_IP);
+
+				*th = (struct tcphdr){
+					.doff = sizeof(struct tcphdr) / 4,
+					.ack = 1
+				};
+
+				*iph = (struct iphdr)L2_BUF_IP4_INIT(IPPROTO_TCP);
+
+				ipv4_fill_headers(c, conn, iph, segment_size,
+						len ? check : NULL, conn->seq_to_tap);
+
+				if (*c->pcap) {
+					uint32_t sum = proto_ipv4_header_checksum(iph, IPPROTO_TCP);
+
+					first->iov_base = th;
+					first->iov_len = size - l2_hdrlen + sizeof(*th);
+
+					th->check = csum_iov(first, num_buffers, sum);
+				}
+
+				check = &iph->check;
+			} else {
+				struct ipv6hdr *ip6h = (struct ipv6hdr *)(eh + 1);
+				struct tcphdr *th = (struct tcphdr *)(ip6h + 1);
+
+				eh->h_proto = htons(ETH_P_IPV6);
+
+				*th = (struct tcphdr){
+					.doff = sizeof(struct tcphdr) / 4,
+					.ack = 1
+				};
+
+				*ip6h = (struct ipv6hdr)L2_BUF_IP6_INIT(IPPROTO_TCP);
+
+				ipv6_fill_headers(c, conn, ip6h, segment_size,
+						conn->seq_to_tap);
+				if (*c->pcap) {
+					uint32_t sum = proto_ipv6_header_checksum(ip6h, IPPROTO_TCP);
+
+					first->iov_base = th;
+					first->iov_len = size - l2_hdrlen + sizeof(*th);
+
+					th->check = csum_iov(first, num_buffers, sum);
+				}
+			}
+
+			/* set iov for pcap logging */
+			first->iov_base = eh;
+			first->iov_len = size - vnet_hdrlen;
+
+			pcap_iov(first, num_buffers);
+
+			/* set iov_len for vu_queue_fill_by_index(); */
+
+			first->iov_base = base;
+			first->iov_len = size;
+
+			conn->seq_to_tap += segment_size;
+
+			segment_size = 0;
+			num_buffers = 0;
+		}
+	}
+
+	/* release unused buffers */
+	vu_queue_rewind(vdev, vq, iov_count - iov_used);
+
+	/* send packets */
+	for (i = 0; i < iov_used; i++) {
+		vu_queue_fill_by_index(vdev, vq, indexes[i],
+				       iov_vu[i + 1].iov_len, i);
+	}
+
+	vu_queue_flush(vdev, vq, iov_used);
+	vu_queue_notify(vdev, vq);
+
+	conn_flag(c, conn, ACK_FROM_TAP_DUE);
+
+	return 0;
+err:
+	vu_queue_rewind(vdev, vq, iov_count);
+
+	if (errno != EAGAIN && errno != EWOULDBLOCK) {
+		ret = -errno;
+		tcp_rst(c, conn);
+	}
+
+	return ret;
+}
diff --git a/tcp_vu.h b/tcp_vu.h
new file mode 100644
index 000000000000..8045a6e3edb8
--- /dev/null
+++ b/tcp_vu.h
@@ -0,0 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#ifndef TCP_VU_H
+#define TCP_VU_H
+
+uint16_t tcp_vu_conn_tap_mss(const struct tcp_tap_conn *conn);
+int tcp_vu_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags);
+int tcp_vu_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn);
+
+#endif  /*TCP_VU_H */
-- 
2.42.0


  parent reply	other threads:[~2024-02-02 14:11 UTC|newest]

Thread overview: 83+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-02-02 14:11 [PATCH 00/24] Add vhost-user support to passt Laurent Vivier
2024-02-02 14:11 ` [PATCH 01/24] iov: add some functions to manage iovec Laurent Vivier
2024-02-05  5:57   ` David Gibson
2024-02-06 14:28     ` Laurent Vivier
2024-02-07  1:01       ` David Gibson
2024-02-07 10:00         ` Laurent Vivier
2024-02-06 16:10   ` Stefano Brivio
2024-02-07 14:02     ` Laurent Vivier
2024-02-07 14:57       ` Stefano Brivio
2024-02-02 14:11 ` [PATCH 02/24] pcap: add pcap_iov() Laurent Vivier
2024-02-05  6:25   ` David Gibson
2024-02-06 16:10   ` Stefano Brivio
2024-02-02 14:11 ` [PATCH 03/24] checksum: align buffers Laurent Vivier
2024-02-05  6:02   ` David Gibson
2024-02-07  9:01     ` Stefano Brivio
2024-02-02 14:11 ` [PATCH 04/24] checksum: add csum_iov() Laurent Vivier
2024-02-05  6:07   ` David Gibson
2024-02-07  9:02   ` Stefano Brivio
2024-02-02 14:11 ` [PATCH 05/24] util: move IP stuff from util.[ch] to ip.[ch] Laurent Vivier
2024-02-05  6:13   ` David Gibson
2024-02-07  9:03     ` Stefano Brivio
2024-02-08  0:04       ` David Gibson
2024-02-02 14:11 ` [PATCH 06/24] ip: move duplicate IPv4 checksum function to ip.h Laurent Vivier
2024-02-05  6:16   ` David Gibson
2024-02-07 10:40   ` Stefano Brivio
2024-02-07 23:43     ` David Gibson
2024-02-02 14:11 ` [PATCH 07/24] ip: introduce functions to compute the header part checksum for TCP/UDP Laurent Vivier
2024-02-05  6:20   ` David Gibson
2024-02-07 10:41   ` Stefano Brivio
2024-02-02 14:11 ` [PATCH 08/24] tcp: extract buffer management from tcp_send_flag() Laurent Vivier
2024-02-06  0:24   ` David Gibson
2024-02-08 16:57   ` Stefano Brivio
2024-02-02 14:11 ` [PATCH 09/24] tcp: extract buffer management from tcp_conn_tap_mss() Laurent Vivier
2024-02-06  0:47   ` David Gibson
2024-02-08 16:59   ` Stefano Brivio
2024-02-02 14:11 ` [PATCH 10/24] tcp: rename functions that manage buffers Laurent Vivier
2024-02-06  1:48   ` David Gibson
2024-02-08 17:10     ` Stefano Brivio
2024-02-02 14:11 ` [PATCH 11/24] tcp: move buffers management functions to their own file Laurent Vivier
2024-02-02 14:11 ` [PATCH 12/24] tap: make tap_update_mac() generic Laurent Vivier
2024-02-06  1:49   ` David Gibson
2024-02-08 17:10     ` Stefano Brivio
2024-02-09  5:02       ` David Gibson
2024-02-02 14:11 ` [PATCH 13/24] tap: export pool_flush()/tapX_handler()/packet_add() Laurent Vivier
2024-02-02 14:29   ` Laurent Vivier
2024-02-06  1:52   ` David Gibson
2024-02-11 23:15   ` Stefano Brivio
2024-02-12  2:22     ` David Gibson
2024-02-02 14:11 ` [PATCH 14/24] udp: move udpX_l2_buf_t and udpX_l2_mh_sock out of udp_update_hdrX() Laurent Vivier
2024-02-06  1:59   ` David Gibson
2024-02-11 23:16   ` Stefano Brivio
2024-02-02 14:11 ` [PATCH 15/24] udp: rename udp_sock_handler() to udp_buf_sock_handler() Laurent Vivier
2024-02-06  2:14   ` David Gibson
2024-02-11 23:17     ` Stefano Brivio
2024-02-02 14:11 ` [PATCH 16/24] packet: replace struct desc by struct iovec Laurent Vivier
2024-02-06  2:25   ` David Gibson
2024-02-11 23:18     ` Stefano Brivio
2024-02-02 14:11 ` [PATCH 17/24] vhost-user: compare mode MODE_PASTA and not MODE_PASST Laurent Vivier
2024-02-06  2:29   ` David Gibson
2024-02-02 14:11 ` [PATCH 18/24] vhost-user: introduce virtio API Laurent Vivier
2024-02-06  3:51   ` David Gibson
2024-02-11 23:18     ` Stefano Brivio
2024-02-12  2:26       ` David Gibson
2024-02-02 14:11 ` [PATCH 19/24] vhost-user: introduce vhost-user API Laurent Vivier
2024-02-07  2:13   ` David Gibson
2024-02-02 14:11 ` [PATCH 20/24] vhost-user: add vhost-user Laurent Vivier
2024-02-07  2:40   ` David Gibson
2024-02-11 23:19     ` Stefano Brivio
2024-02-12  2:47       ` David Gibson
2024-02-13 15:22         ` Stefano Brivio
2024-02-14  2:05           ` David Gibson
2024-02-11 23:19   ` Stefano Brivio
2024-02-12  2:49     ` David Gibson
2024-02-12 10:02       ` Laurent Vivier
2024-02-12 16:56         ` Stefano Brivio
2024-02-02 14:11 ` [PATCH 21/24] vhost-user: use guest buffer directly in vu_handle_tx() Laurent Vivier
2024-02-09  4:26   ` David Gibson
2024-02-02 14:11 ` Laurent Vivier [this message]
2024-02-09  4:57   ` [PATCH 22/24] tcp: vhost-user RX nocopy David Gibson
2024-02-02 14:11 ` [PATCH 23/24] udp: " Laurent Vivier
2024-02-09  5:00   ` David Gibson
2024-02-02 14:11 ` [PATCH 24/24] vhost-user: remove tap_send_frames_vu() Laurent Vivier
2024-02-09  5:01   ` David Gibson

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20240202141151.3762941-23-lvivier@redhat.com \
    --to=lvivier@redhat.com \
    --cc=passt-dev@passt.top \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this public inbox

	https://passt.top/passt

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for IMAP folder(s).