From mboxrd@z Thu Jan 1 00:00:00 1970 Received: from us-smtp-delivery-124.mimecast.com (us-smtp-delivery-124.mimecast.com [170.10.133.124]) by passt.top (Postfix) with ESMTP id D59665A027B for ; Fri, 2 Feb 2024 15:11:58 +0100 (CET) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=redhat.com; s=mimecast20190719; t=1706883117; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version:content-type:content-type: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=Q0POsAv3N0NUFWmxDh98NrYbA113NkxYiSswSgO9JJs=; b=iHuZFWOiB4gCN5FMi0B00CsYRhCP1EGOR5J1tn1bp9XkhgOS/khUUTT3NGDdSAfeoQgGPn yjbXGzRklSPxY4km2mjeH5fNNzB3HkZRaAjufUeXneqgK7QrY9gBP4HNQKOKr/CyChblXl gukWteXyHwedVniMQqmhxRaNVk90ZWc= Received: from mimecast-mx02.redhat.com (mimecast-mx02.redhat.com [66.187.233.88]) by relay.mimecast.com with ESMTP with STARTTLS (version=TLSv1.3, cipher=TLS_AES_256_GCM_SHA384) id us-mta-528-T8oy0hJmNm6IxhZlJ9cKmA-1; Fri, 02 Feb 2024 09:11:56 -0500 X-MC-Unique: T8oy0hJmNm6IxhZlJ9cKmA-1 Received: from smtp.corp.redhat.com (int-mx01.intmail.prod.int.rdu2.redhat.com [10.11.54.1]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature RSA-PSS (2048 bits) server-digest SHA256) (No client certificate requested) by mimecast-mx02.redhat.com (Postfix) with ESMTPS id D1D4A85A58B for ; Fri, 2 Feb 2024 14:11:55 +0000 (UTC) Received: from virtlab218.virt.lab.eng.bos.redhat.com (virtlab218.virt.lab.eng.bos.redhat.com [10.19.152.190]) by smtp.corp.redhat.com (Postfix) with ESMTP id B56733C2E; Fri, 2 Feb 2024 14:11:55 +0000 (UTC) From: Laurent Vivier To: passt-dev@passt.top Subject: [PATCH 22/24] tcp: vhost-user RX nocopy Date: Fri, 2 Feb 2024 15:11:49 +0100 Message-ID: <20240202141151.3762941-23-lvivier@redhat.com> In-Reply-To: <20240202141151.3762941-1-lvivier@redhat.com> References: <20240202141151.3762941-1-lvivier@redhat.com> MIME-Version: 1.0 X-Scanned-By: MIMEDefang 3.4.1 on 10.11.54.1 X-Mimecast-Spam-Score: 0 X-Mimecast-Originator: redhat.com Content-Transfer-Encoding: 8bit Content-Type: text/plain; charset="US-ASCII"; x-default=true Message-ID-Hash: SP34BVNC6WT5R5A3I2UAW3JNZQ6USSHA X-Message-ID-Hash: SP34BVNC6WT5R5A3I2UAW3JNZQ6USSHA X-MailFrom: lvivier@redhat.com X-Mailman-Rule-Misses: dmarc-mitigation; no-senders; approved; emergency; loop; banned-address; member-moderation; nonmember-moderation; administrivia; implicit-dest; max-recipients; max-size; news-moderation; no-subject; digests; suspicious-header CC: Laurent Vivier X-Mailman-Version: 3.3.8 Precedence: list List-Id: Development discussion and patches for passt Archived-At: Archived-At: List-Archive: List-Archive: List-Help: List-Owner: List-Post: List-Subscribe: List-Unsubscribe: Signed-off-by: Laurent Vivier --- Makefile | 6 +- tcp.c | 66 +++++--- tcp_vu.c | 447 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ tcp_vu.h | 10 ++ 4 files changed, 502 insertions(+), 27 deletions(-) create mode 100644 tcp_vu.c create mode 100644 tcp_vu.h diff --git a/Makefile b/Makefile index 2016b071ddf2..f7a403d19b61 100644 --- a/Makefile +++ b/Makefile @@ -47,7 +47,7 @@ FLAGS += -DDUAL_STACK_SOCKETS=$(DUAL_STACK_SOCKETS) PASST_SRCS = arch.c arp.c checksum.c conf.c dhcp.c dhcpv6.c flow.c icmp.c \ igmp.c isolation.c lineread.c log.c mld.c ndp.c netlink.c packet.c \ passt.c pasta.c pcap.c pif.c port_fwd.c tap.c tcp.c tcp_splice.c \ - tcp_buf.c udp.c util.c iov.c ip.c virtio.c vhost_user.c + tcp_buf.c tcp_vu.c udp.c util.c iov.c ip.c virtio.c vhost_user.c QRAP_SRCS = qrap.c SRCS = $(PASST_SRCS) $(QRAP_SRCS) @@ -56,8 +56,8 @@ MANPAGES = passt.1 pasta.1 qrap.1 PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h flow.h \ flow_table.h icmp.h inany.h isolation.h lineread.h log.h ndp.h \ netlink.h packet.h passt.h pasta.h pcap.h pif.h port_fwd.h siphash.h \ - tap.h tcp.h tcp_conn.h tcp_splice.h tcp_buf.h tcp_internal.h udp.h \ - util.h iov.h ip.h virtio.h vhost_user.h + tap.h tcp.h tcp_conn.h tcp_splice.h tcp_buf.h tcp_vu.h tcp_internal.h \ + udp.h util.h iov.h ip.h virtio.h vhost_user.h HEADERS = $(PASST_HEADERS) seccomp.h C := \#include \nstruct tcp_info x = { .tcpi_snd_wnd = 0 }; diff --git a/tcp.c b/tcp.c index b6aca9f37f19..e829e12fe7c2 100644 --- a/tcp.c +++ b/tcp.c @@ -302,6 +302,7 @@ #include "flow_table.h" #include "tcp_internal.h" #include "tcp_buf.h" +#include "tcp_vu.h" /* Sides of a flow as we use them in "tap" connections */ #define SOCKSIDE 0 @@ -1034,7 +1035,7 @@ size_t ipv4_fill_headers(const struct ctx *c, tcp_set_tcp_header(th, conn, seq); th->check = 0; - if (c->mode != MODE_VU || *c->pcap) + if (c->mode != MODE_VU) th->check = tcp_update_check_tcp4(iph); return ip_len; @@ -1072,7 +1073,7 @@ size_t ipv6_fill_headers(const struct ctx *c, tcp_set_tcp_header(th, conn, seq); th->check = 0; - if (c->mode != MODE_VU || *c->pcap) + if (c->mode != MODE_VU) th->check = tcp_update_check_tcp6(ip6h); ip6h->hop_limit = 255; @@ -1302,6 +1303,12 @@ int do_tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags, return 1; } +int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) +{ + if (c->mode == MODE_VU) + return tcp_vu_send_flag(c, conn, flags); + return tcp_buf_send_flag(c, conn, flags); +} /** * tcp_rst_do() - Reset a tap connection: send RST segment to tap, close socket @@ -1313,7 +1320,7 @@ void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn) if (conn->events == CLOSED) return; - if (!tcp_buf_send_flag(c, conn, RST)) + if (!tcp_send_flag(c, conn, RST)) conn_event(c, conn, CLOSED); } @@ -1430,7 +1437,8 @@ int tcp_conn_new_sock(const struct ctx *c, sa_family_t af) * * Return: clamped MSS value */ -static uint16_t tcp_conn_tap_mss(const struct tcp_tap_conn *conn, +static uint16_t tcp_conn_tap_mss(const struct ctx *c, + const struct tcp_tap_conn *conn, const char *opts, size_t optlen) { unsigned int mss; @@ -1441,7 +1449,10 @@ static uint16_t tcp_conn_tap_mss(const struct tcp_tap_conn *conn, else mss = ret; - mss = MIN(tcp_buf_conn_tap_mss(conn), mss); + if (c->mode == MODE_VU) + mss = MIN(tcp_vu_conn_tap_mss(conn), mss); + else + mss = MIN(tcp_buf_conn_tap_mss(conn), mss); return MIN(mss, USHRT_MAX); } @@ -1568,7 +1579,7 @@ static void tcp_conn_from_tap(struct ctx *c, conn->wnd_to_tap = WINDOW_DEFAULT; - mss = tcp_conn_tap_mss(conn, opts, optlen); + mss = tcp_conn_tap_mss(c, conn, opts, optlen); if (setsockopt(s, SOL_TCP, TCP_MAXSEG, &mss, sizeof(mss))) flow_trace(conn, "failed to set TCP_MAXSEG on socket %i", s); MSS_SET(conn, mss); @@ -1625,7 +1636,7 @@ static void tcp_conn_from_tap(struct ctx *c, } else { tcp_get_sndbuf(conn); - if (tcp_buf_send_flag(c, conn, SYN | ACK)) + if (tcp_send_flag(c, conn, SYN | ACK)) return; conn_event(c, conn, TAP_SYN_ACK_SENT); @@ -1673,6 +1684,13 @@ static int tcp_sock_consume(const struct tcp_tap_conn *conn, uint32_t ack_seq) return 0; } +static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) +{ + if (c->mode == MODE_VU) + return tcp_vu_data_from_sock(c, conn); + + return tcp_buf_data_from_sock(c, conn); +} /** * tcp_data_from_tap() - tap/guest data for established connection @@ -1806,7 +1824,7 @@ static int tcp_data_from_tap(struct ctx *c, struct tcp_tap_conn *conn, max_ack_seq, conn->seq_to_tap); conn->seq_ack_from_tap = max_ack_seq; conn->seq_to_tap = max_ack_seq; - tcp_buf_data_from_sock(c, conn); + tcp_data_from_sock(c, conn); } if (!iov_i) @@ -1822,14 +1840,14 @@ eintr: * Then swiftly looked away and left. */ conn->seq_from_tap = seq_from_tap; - tcp_buf_send_flag(c, conn, ACK); + tcp_send_flag(c, conn, ACK); } if (errno == EINTR) goto eintr; if (errno == EAGAIN || errno == EWOULDBLOCK) { - tcp_buf_send_flag(c, conn, ACK_IF_NEEDED); + tcp_send_flag(c, conn, ACK_IF_NEEDED); return p->count - idx; } @@ -1839,7 +1857,7 @@ eintr: if (n < (int)(seq_from_tap - conn->seq_from_tap)) { partial_send = 1; conn->seq_from_tap += n; - tcp_buf_send_flag(c, conn, ACK_IF_NEEDED); + tcp_send_flag(c, conn, ACK_IF_NEEDED); } else { conn->seq_from_tap += n; } @@ -1852,7 +1870,7 @@ out: */ if (conn->seq_dup_ack_approx != (conn->seq_from_tap & 0xff)) { conn->seq_dup_ack_approx = conn->seq_from_tap & 0xff; - tcp_buf_send_flag(c, conn, DUP_ACK); + tcp_send_flag(c, conn, DUP_ACK); } return p->count - idx; } @@ -1866,7 +1884,7 @@ out: conn_event(c, conn, TAP_FIN_RCVD); } else { - tcp_buf_send_flag(c, conn, ACK_IF_NEEDED); + tcp_send_flag(c, conn, ACK_IF_NEEDED); } return p->count - idx; @@ -1891,7 +1909,7 @@ static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_tap_conn *conn, if (!(conn->wnd_from_tap >>= conn->ws_from_tap)) conn->wnd_from_tap = 1; - MSS_SET(conn, tcp_conn_tap_mss(conn, opts, optlen)); + MSS_SET(conn, tcp_conn_tap_mss(c, conn, opts, optlen)); conn->seq_init_from_tap = ntohl(th->seq) + 1; conn->seq_from_tap = conn->seq_init_from_tap; @@ -1902,8 +1920,8 @@ static void tcp_conn_from_sock_finish(struct ctx *c, struct tcp_tap_conn *conn, /* The client might have sent data already, which we didn't * dequeue waiting for SYN,ACK from tap -- check now. */ - tcp_buf_data_from_sock(c, conn); - tcp_buf_send_flag(c, conn, ACK); + tcp_data_from_sock(c, conn); + tcp_send_flag(c, conn, ACK); } /** @@ -1983,7 +2001,7 @@ int tcp_tap_handler(struct ctx *c, uint8_t pif, int af, conn->seq_from_tap++; shutdown(conn->sock, SHUT_WR); - tcp_buf_send_flag(c, conn, ACK); + tcp_send_flag(c, conn, ACK); conn_event(c, conn, SOCK_FIN_SENT); return 1; @@ -1994,7 +2012,7 @@ int tcp_tap_handler(struct ctx *c, uint8_t pif, int af, tcp_tap_window_update(conn, ntohs(th->window)); - tcp_buf_data_from_sock(c, conn); + tcp_data_from_sock(c, conn); if (p->count - idx == 1) return 1; @@ -2024,7 +2042,7 @@ int tcp_tap_handler(struct ctx *c, uint8_t pif, int af, if ((conn->events & TAP_FIN_RCVD) && !(conn->events & SOCK_FIN_SENT)) { shutdown(conn->sock, SHUT_WR); conn_event(c, conn, SOCK_FIN_SENT); - tcp_buf_send_flag(c, conn, ACK); + tcp_send_flag(c, conn, ACK); ack_due = 0; } @@ -2058,7 +2076,7 @@ static void tcp_connect_finish(struct ctx *c, struct tcp_tap_conn *conn) return; } - if (tcp_buf_send_flag(c, conn, SYN | ACK)) + if (tcp_send_flag(c, conn, SYN | ACK)) return; conn_event(c, conn, TAP_SYN_ACK_SENT); @@ -2126,7 +2144,7 @@ static void tcp_tap_conn_from_sock(struct ctx *c, conn->wnd_from_tap = WINDOW_DEFAULT; - tcp_buf_send_flag(c, conn, SYN); + tcp_send_flag(c, conn, SYN); conn_flag(c, conn, ACK_FROM_TAP_DUE); tcp_get_sndbuf(conn); @@ -2190,7 +2208,7 @@ void tcp_timer_handler(struct ctx *c, union epoll_ref ref) return; if (conn->flags & ACK_TO_TAP_DUE) { - tcp_buf_send_flag(c, conn, ACK_IF_NEEDED); + tcp_send_flag(c, conn, ACK_IF_NEEDED); tcp_timer_ctl(c, conn); } else if (conn->flags & ACK_FROM_TAP_DUE) { if (!(conn->events & ESTABLISHED)) { @@ -2206,7 +2224,7 @@ void tcp_timer_handler(struct ctx *c, union epoll_ref ref) flow_dbg(conn, "ACK timeout, retry"); conn->retrans++; conn->seq_to_tap = conn->seq_ack_from_tap; - tcp_buf_data_from_sock(c, conn); + tcp_data_from_sock(c, conn); tcp_timer_ctl(c, conn); } } else { @@ -2261,7 +2279,7 @@ void tcp_sock_handler(struct ctx *c, union epoll_ref ref, uint32_t events) conn_event(c, conn, SOCK_FIN_RCVD); if (events & EPOLLIN) - tcp_buf_data_from_sock(c, conn); + tcp_data_from_sock(c, conn); if (events & EPOLLOUT) tcp_update_seqack_wnd(c, conn, 0, NULL); diff --git a/tcp_vu.c b/tcp_vu.c new file mode 100644 index 000000000000..ed59b21cabdc --- /dev/null +++ b/tcp_vu.c @@ -0,0 +1,447 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include +#include +#include + +#include + +#include + +#include +#include + +#include "util.h" +#include "ip.h" +#include "passt.h" +#include "siphash.h" +#include "inany.h" +#include "vhost_user.h" +#include "tcp.h" +#include "pcap.h" +#include "flow.h" +#include "tcp_conn.h" +#include "flow_table.h" +#include "tcp_vu.h" +#include "tcp_internal.h" +#include "checksum.h" + +#define CONN_V4(conn) (!!inany_v4(&(conn)->faddr)) +#define CONN_V6(conn) (!CONN_V4(conn)) + +/* vhost-user */ +static const struct virtio_net_hdr vu_header = { + .flags = VIRTIO_NET_HDR_F_DATA_VALID, + .gso_type = VIRTIO_NET_HDR_GSO_NONE, +}; + +static unsigned char buffer[65536]; +static struct iovec iov_vu [VIRTQUEUE_MAX_SIZE]; +static unsigned int indexes [VIRTQUEUE_MAX_SIZE]; + +uint16_t tcp_vu_conn_tap_mss(const struct tcp_tap_conn *conn) +{ + (void)conn; + return USHRT_MAX; +} + +int tcp_vu_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) +{ + VuDev *vdev = (VuDev *)&c->vdev; + VuVirtqElement *elem; + VuVirtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE]; + struct virtio_net_hdr_mrg_rxbuf *vh; + size_t tlen, vnet_hdrlen, ip_len, optlen = 0; + struct ethhdr *eh; + int ret; + int nb_ack; + + elem = vu_queue_pop(vdev, vq, sizeof(VuVirtqElement), buffer); + if (!elem) + return 0; + + if (elem->in_num < 1) { + err("virtio-net receive queue contains no in buffers"); + vu_queue_rewind(vdev, vq, 1); + return 0; + } + + /* Options: MSS, NOP and window scale (8 bytes) */ + if (flags & SYN) + optlen = OPT_MSS_LEN + 1 + OPT_WS_LEN; + + vh = elem->in_sg[0].iov_base; + + vh->hdr = vu_header; + if (vu_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF)) { + vnet_hdrlen = sizeof(struct virtio_net_hdr_mrg_rxbuf); + vh->num_buffers = htole16(1); + } else { + vnet_hdrlen = sizeof(struct virtio_net_hdr); + } + eh = (struct ethhdr *)((char *)elem->in_sg[0].iov_base + vnet_hdrlen); + + memcpy(eh->h_dest, c->mac_guest, sizeof(eh->h_dest)); + memcpy(eh->h_source, c->mac, sizeof(eh->h_source)); + + if (CONN_V4(conn)) { + struct iphdr *iph = (struct iphdr *)(eh + 1); + struct tcphdr *th = (struct tcphdr *)(iph + 1); + char *data = (char *)(th + 1); + + eh->h_proto = htons(ETH_P_IP); + + *th = (struct tcphdr){ + .doff = sizeof(struct tcphdr) / 4, + .ack = 1 + }; + + *iph = (struct iphdr)L2_BUF_IP4_INIT(IPPROTO_TCP); + + ret = do_tcp_send_flag(c, conn, flags, th, data, optlen); + if (ret <= 0) { + vu_queue_rewind(vdev, vq, 1); + return ret; + } + + ip_len = ipv4_fill_headers(c, conn, iph, optlen, NULL, + conn->seq_to_tap); + + tlen = ip_len + sizeof(struct ethhdr); + + if (*c->pcap) { + uint32_t sum = proto_ipv4_header_checksum(iph, IPPROTO_TCP); + + th->check = csum(th, optlen + sizeof(struct tcphdr), sum); + } + } else { + struct ipv6hdr *ip6h = (struct ipv6hdr *)(eh + 1); + struct tcphdr *th = (struct tcphdr *)(ip6h + 1); + char *data = (char *)(th + 1); + + eh->h_proto = htons(ETH_P_IPV6); + + *th = (struct tcphdr){ + .doff = sizeof(struct tcphdr) / 4, + .ack = 1 + }; + + *ip6h = (struct ipv6hdr)L2_BUF_IP6_INIT(IPPROTO_TCP); + + ret = do_tcp_send_flag(c, conn, flags, th, data, optlen); + if (ret <= 0) { + vu_queue_rewind(vdev, vq, 1); + return ret; + } + + ip_len = ipv6_fill_headers(c, conn, ip6h, optlen, + conn->seq_to_tap); + + tlen = ip_len + sizeof(struct ethhdr); + + if (*c->pcap) { + uint32_t sum = proto_ipv6_header_checksum(ip6h, IPPROTO_TCP); + + th->check = csum(th, optlen + sizeof(struct tcphdr), sum); + } + } + + pcap((void *)eh, tlen); + + tlen += vnet_hdrlen; + vu_queue_fill(vdev, vq, elem, tlen, 0); + nb_ack = 1; + + if (flags & DUP_ACK) { + elem = vu_queue_pop(vdev, vq, sizeof(VuVirtqElement), buffer); + if (elem) { + if (elem->in_num < 1 || elem->in_sg[0].iov_len < tlen) { + vu_queue_rewind(vdev, vq, 1); + } else { + memcpy(elem->in_sg[0].iov_base, vh, tlen); + nb_ack++; + } + } + } + + vu_queue_flush(vdev, vq, nb_ack); + vu_queue_notify(vdev, vq); + + return 0; +} + +int tcp_vu_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) +{ + uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap; + uint32_t already_sent; + VuDev *vdev = (VuDev *)&c->vdev; + VuVirtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE]; + int s = conn->sock, v4 = CONN_V4(conn); + int i, ret = 0, iov_count, iov_used; + struct msghdr mh_sock = { 0 }; + size_t l2_hdrlen, vnet_hdrlen, fillsize; + ssize_t len; + uint16_t *check; + uint16_t mss = MSS_GET(conn); + int num_buffers; + int segment_size; + struct iovec *first; + bool has_mrg_rxbuf; + + if (!vu_queue_enabled(vq) || !vu_queue_started(vq)) { + err("Got packet, but no available descriptors on RX virtq."); + return 0; + } + + already_sent = conn->seq_to_tap - conn->seq_ack_from_tap; + + if (SEQ_LT(already_sent, 0)) { + /* RFC 761, section 2.1. */ + flow_trace(conn, "ACK sequence gap: ACK for %u, sent: %u", + conn->seq_ack_from_tap, conn->seq_to_tap); + conn->seq_to_tap = conn->seq_ack_from_tap; + already_sent = 0; + } + + if (!wnd_scaled || already_sent >= wnd_scaled) { + conn_flag(c, conn, STALLED); + conn_flag(c, conn, ACK_FROM_TAP_DUE); + return 0; + } + + /* Set up buffer descriptors we'll fill completely and partially. */ + + fillsize = wnd_scaled; + + iov_vu[0].iov_base = tcp_buf_discard; + iov_vu[0].iov_len = already_sent; + fillsize -= already_sent; + + has_mrg_rxbuf = vu_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF); + if (has_mrg_rxbuf) { + vnet_hdrlen = sizeof(struct virtio_net_hdr_mrg_rxbuf); + } else { + vnet_hdrlen = sizeof(struct virtio_net_hdr); + } + l2_hdrlen = vnet_hdrlen + sizeof(struct ethhdr) + sizeof(struct tcphdr); + if (v4) { + l2_hdrlen += sizeof(struct iphdr); + } else { + l2_hdrlen += sizeof(struct ipv6hdr); + } + + iov_count = 0; + segment_size = 0; + while (fillsize > 0 && iov_count < VIRTQUEUE_MAX_SIZE - 1) { + VuVirtqElement *elem; + + elem = vu_queue_pop(vdev, vq, sizeof(VuVirtqElement), buffer); + if (!elem) + break; + + if (elem->in_num < 1) { + err("virtio-net receive queue contains no in buffers"); + goto err; + } + + ASSERT(elem->in_num == 1); + ASSERT(elem->in_sg[0].iov_len >= l2_hdrlen); + + indexes[iov_count] = elem->index; + + if (segment_size == 0) { + iov_vu[iov_count + 1].iov_base = + (char *)elem->in_sg[0].iov_base + l2_hdrlen; + iov_vu[iov_count + 1].iov_len = + elem->in_sg[0].iov_len - l2_hdrlen; + } else { + iov_vu[iov_count + 1].iov_base = elem->in_sg[0].iov_base; + iov_vu[iov_count + 1].iov_len = elem->in_sg[0].iov_len; + } + + if (iov_vu[iov_count + 1].iov_len > fillsize) + iov_vu[iov_count + 1].iov_len = fillsize; + + segment_size += iov_vu[iov_count + 1].iov_len; + if (!has_mrg_rxbuf) { + segment_size = 0; + } else if (segment_size >= mss) { + iov_vu[iov_count + 1].iov_len -= segment_size - mss; + segment_size = 0; + } + fillsize -= iov_vu[iov_count + 1].iov_len; + + iov_count++; + } + if (iov_count == 0) + return 0; + + mh_sock.msg_iov = iov_vu; + mh_sock.msg_iovlen = iov_count + 1; + + do + len = recvmsg(s, &mh_sock, MSG_PEEK); + while (len < 0 && errno == EINTR); + + if (len < 0) + goto err; + + if (!len) { + vu_queue_rewind(vdev, vq, iov_count); + if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) == SOCK_FIN_RCVD) { + if ((ret = tcp_vu_send_flag(c, conn, FIN | ACK))) { + tcp_rst(c, conn); + return ret; + } + + conn_event(c, conn, TAP_FIN_SENT); + } + + return 0; + } + + len -= already_sent; + if (len <= 0) { + conn_flag(c, conn, STALLED); + vu_queue_rewind(vdev, vq, iov_count); + return 0; + } + + conn_flag(c, conn, ~STALLED); + + /* Likely, some new data was acked too. */ + tcp_update_seqack_wnd(c, conn, 0, NULL); + + /* initialize headers */ + iov_used = 0; + num_buffers = 0; + check = NULL; + segment_size = 0; + for (i = 0; i < iov_count && len; i++) { + + if (segment_size == 0) + first = &iov_vu[i + 1]; + + if (iov_vu[i + 1].iov_len > (size_t)len) + iov_vu[i + 1].iov_len = len; + + len -= iov_vu[i + 1].iov_len; + iov_used++; + + segment_size += iov_vu[i + 1].iov_len; + num_buffers++; + + if (segment_size >= mss || len == 0 || + i + 1 == iov_count || !has_mrg_rxbuf) { + + struct ethhdr *eh; + struct virtio_net_hdr_mrg_rxbuf *vh; + char *base = (char *)first->iov_base - l2_hdrlen; + size_t size = first->iov_len + l2_hdrlen; + + vh = (struct virtio_net_hdr_mrg_rxbuf *)base; + + vh->hdr = vu_header; + if (has_mrg_rxbuf) + vh->num_buffers = htole16(num_buffers); + + eh = (struct ethhdr *)((char *)base + vnet_hdrlen); + + memcpy(eh->h_dest, c->mac_guest, sizeof(eh->h_dest)); + memcpy(eh->h_source, c->mac, sizeof(eh->h_source)); + + /* initialize header */ + if (v4) { + struct iphdr *iph = (struct iphdr *)(eh + 1); + struct tcphdr *th = (struct tcphdr *)(iph + 1); + + eh->h_proto = htons(ETH_P_IP); + + *th = (struct tcphdr){ + .doff = sizeof(struct tcphdr) / 4, + .ack = 1 + }; + + *iph = (struct iphdr)L2_BUF_IP4_INIT(IPPROTO_TCP); + + ipv4_fill_headers(c, conn, iph, segment_size, + len ? check : NULL, conn->seq_to_tap); + + if (*c->pcap) { + uint32_t sum = proto_ipv4_header_checksum(iph, IPPROTO_TCP); + + first->iov_base = th; + first->iov_len = size - l2_hdrlen + sizeof(*th); + + th->check = csum_iov(first, num_buffers, sum); + } + + check = &iph->check; + } else { + struct ipv6hdr *ip6h = (struct ipv6hdr *)(eh + 1); + struct tcphdr *th = (struct tcphdr *)(ip6h + 1); + + eh->h_proto = htons(ETH_P_IPV6); + + *th = (struct tcphdr){ + .doff = sizeof(struct tcphdr) / 4, + .ack = 1 + }; + + *ip6h = (struct ipv6hdr)L2_BUF_IP6_INIT(IPPROTO_TCP); + + ipv6_fill_headers(c, conn, ip6h, segment_size, + conn->seq_to_tap); + if (*c->pcap) { + uint32_t sum = proto_ipv6_header_checksum(ip6h, IPPROTO_TCP); + + first->iov_base = th; + first->iov_len = size - l2_hdrlen + sizeof(*th); + + th->check = csum_iov(first, num_buffers, sum); + } + } + + /* set iov for pcap logging */ + first->iov_base = eh; + first->iov_len = size - vnet_hdrlen; + + pcap_iov(first, num_buffers); + + /* set iov_len for vu_queue_fill_by_index(); */ + + first->iov_base = base; + first->iov_len = size; + + conn->seq_to_tap += segment_size; + + segment_size = 0; + num_buffers = 0; + } + } + + /* release unused buffers */ + vu_queue_rewind(vdev, vq, iov_count - iov_used); + + /* send packets */ + for (i = 0; i < iov_used; i++) { + vu_queue_fill_by_index(vdev, vq, indexes[i], + iov_vu[i + 1].iov_len, i); + } + + vu_queue_flush(vdev, vq, iov_used); + vu_queue_notify(vdev, vq); + + conn_flag(c, conn, ACK_FROM_TAP_DUE); + + return 0; +err: + vu_queue_rewind(vdev, vq, iov_count); + + if (errno != EAGAIN && errno != EWOULDBLOCK) { + ret = -errno; + tcp_rst(c, conn); + } + + return ret; +} diff --git a/tcp_vu.h b/tcp_vu.h new file mode 100644 index 000000000000..8045a6e3edb8 --- /dev/null +++ b/tcp_vu.h @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#ifndef TCP_VU_H +#define TCP_VU_H + +uint16_t tcp_vu_conn_tap_mss(const struct tcp_tap_conn *conn); +int tcp_vu_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags); +int tcp_vu_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn); + +#endif /*TCP_VU_H */ -- 2.42.0