// SPDX-License-Identifier: GPL-2.0-or-later
/* tcp_vu.c - TCP L2 vhost-user management functions
 *
 * Copyright Red Hat
 * Author: Laurent Vivier
 */

#include <errno.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#include <netinet/ip.h>

#include <sys/socket.h>

#include <linux/tcp.h>
#include <linux/virtio_net.h>

#include "util.h"
#include "ip.h"
#include "passt.h"
#include "siphash.h"
#include "inany.h"
#include "vhost_user.h"
#include "tcp.h"
#include "pcap.h"
#include "flow.h"
#include "tcp_conn.h"
#include "flow_table.h"
#include "tcp_vu.h"
#include "tcp_internal.h"
#include "checksum.h"
#include "vu_common.h"

/**
 * struct tcp_payload_t - TCP header and data to send segments with payload
 * @th:		TCP header
 * @data:	TCP data
 */
struct tcp_payload_t {
	struct tcphdr th;
	uint8_t data[IP_MAX_MTU - sizeof(struct tcphdr)];
};

/**
 * struct tcp_flags_t - TCP header and data to send zero-length
 *                      segments (flags)
 * @th:		TCP header
 * @opts:	TCP options
 */
struct tcp_flags_t {
	struct tcphdr th;
	char opts[OPT_MSS_LEN + OPT_WS_LEN + 1];
};

/* vhost-user */
static const struct virtio_net_hdr vu_header = {
	.flags = VIRTIO_NET_HDR_F_DATA_VALID,
	.gso_type = VIRTIO_NET_HDR_GSO_NONE,
};

static struct iovec iov_vu[VIRTQUEUE_MAX_SIZE];
static struct vu_virtq_element elem[VIRTQUEUE_MAX_SIZE];

/**
 * tcp_vu_l2_hdrlen() - Return the size of the headers of a level 2 frame (TCP)
 * @vdev:	vhost-user device
 * @v6:		Set for IPv6 packets
 *
 * Return: the size of the virtio-net, Ethernet, IP and TCP headers, in bytes
 */
static size_t tcp_vu_l2_hdrlen(const struct vu_dev *vdev, bool v6)
{
	size_t l2_hdrlen;

	l2_hdrlen = vdev->hdrlen + sizeof(struct ethhdr) +
		    sizeof(struct tcphdr);

	if (v6)
		l2_hdrlen += sizeof(struct ipv6hdr);
	else
		l2_hdrlen += sizeof(struct iphdr);

	return l2_hdrlen;
}

/**
 * tcp_vu_pcap() - Capture a single frame to pcap file (TCP)
 * @c:		Execution context
 * @tapside:	Address information for one side of the flow
 * @iov:	Pointer to the array of IO vectors
 * @iov_used:	Length of the array
 * @l4len:	L4 (TCP header and payload) length
 */
static void tcp_vu_pcap(const struct ctx *c, const struct flowside *tapside,
			struct iovec *iov, int iov_used, size_t l4len)
{
	const struct in_addr *src = inany_v4(&tapside->oaddr);
	const struct in_addr *dst = inany_v4(&tapside->eaddr);
	const struct vu_dev *vdev = c->vdev;
	char *base = iov[0].iov_base;
	size_t size = iov[0].iov_len;
	struct tcp_payload_t *bp;
	uint32_t sum;

	if (!*c->pcap)
		return;

	if (src && dst) {
		bp = vu_payloadv4(vdev, base);
		sum = proto_ipv4_header_psum(l4len, IPPROTO_TCP, *src, *dst);
	} else {
		bp = vu_payloadv6(vdev, base);
		sum = proto_ipv6_header_psum(l4len, IPPROTO_TCP,
					     &tapside->oaddr.a6,
					     &tapside->eaddr.a6);
	}

	iov[0].iov_base = &bp->th;
	iov[0].iov_len = size - ((char *)iov[0].iov_base - base);

	bp->th.check = 0;
	bp->th.check = csum_iov(iov, iov_used, sum);

	/* set iov for pcap logging */
	iov[0].iov_base = base + vdev->hdrlen;
	iov[0].iov_len = size - vdev->hdrlen;

	pcap_iov(iov, iov_used);

	/* restore iov[0] */
	iov[0].iov_base = base;
	iov[0].iov_len = size;
}

/**
 * tcp_vu_send_flag() - Send segment with flags to vhost-user (no payload)
 * @c:		Execution context
 * @conn:	Connection pointer
 * @flags:	TCP flags: if not set, send segment only if ACK is due
 *
 * Return: negative error code on connection reset, 0 otherwise
 */
int tcp_vu_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
{
	struct vu_dev *vdev = c->vdev;
	struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
	const struct flowside *tapside = TAPFLOW(conn);
	struct virtio_net_hdr_mrg_rxbuf *vh;
	struct iovec l2_iov[TCP_NUM_IOVS];
	size_t l2len, l4len, optlen;
	struct iovec in_sg;
	struct ethhdr *eh;
	int nb_ack;
	int ret;

	elem[0].out_num = 0;
	elem[0].out_sg = NULL;
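	/* Ask the guest for a single buffer from the RX virtqueue: the
	 * flags-only frame (virtio-net, Ethernet, IP and TCP headers plus
	 * options) is written directly into the guest memory it describes.
	 */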
	elem[0].in_num = 1;
	elem[0].in_sg = &in_sg;
	ret = vu_queue_pop(vdev, vq, &elem[0]);
	if (ret < 0)
		return 0;

	if (elem[0].in_num < 1) {
		debug("virtio-net receive queue contains no in buffers");
		vu_queue_rewind(vq, 1);
		return 0;
	}

	vh = elem[0].in_sg[0].iov_base;

	vh->hdr = vu_header;
	if (vdev->hdrlen == sizeof(struct virtio_net_hdr_mrg_rxbuf))
		vh->num_buffers = htole16(1);

	l2_iov[TCP_IOV_TAP].iov_base = NULL;
	l2_iov[TCP_IOV_TAP].iov_len = 0;
	l2_iov[TCP_IOV_ETH].iov_base = (char *)elem[0].in_sg[0].iov_base +
				       vdev->hdrlen;
	l2_iov[TCP_IOV_ETH].iov_len = sizeof(struct ethhdr);

	eh = l2_iov[TCP_IOV_ETH].iov_base;

	memcpy(eh->h_dest, c->guest_mac, sizeof(eh->h_dest));
	memcpy(eh->h_source, c->our_tap_mac, sizeof(eh->h_source));

	if (CONN_V4(conn)) {
		struct tcp_flags_t *payload;
		struct iphdr *iph;
		uint32_t seq;

		l2_iov[TCP_IOV_IP].iov_base = (char *)l2_iov[TCP_IOV_ETH].iov_base +
					      l2_iov[TCP_IOV_ETH].iov_len;
		l2_iov[TCP_IOV_IP].iov_len = sizeof(struct iphdr);
		l2_iov[TCP_IOV_PAYLOAD].iov_base = (char *)l2_iov[TCP_IOV_IP].iov_base +
						   l2_iov[TCP_IOV_IP].iov_len;

		eh->h_proto = htons(ETH_P_IP);

		iph = l2_iov[TCP_IOV_IP].iov_base;
		*iph = (struct iphdr)L2_BUF_IP4_INIT(IPPROTO_TCP);

		payload = l2_iov[TCP_IOV_PAYLOAD].iov_base;
		payload->th = (struct tcphdr){
			.doff = offsetof(struct tcp_flags_t, opts) / 4,
			.ack = 1
		};

		seq = conn->seq_to_tap;
		ret = tcp_prepare_flags(c, conn, flags, &payload->th,
					payload->opts, &optlen);
		if (ret <= 0) {
			vu_queue_rewind(vq, 1);
			return ret;
		}

		l4len = tcp_l2_buf_fill_headers(conn, l2_iov, optlen, NULL,
						seq, true);
		/* keep the following assignment for clarity */
		/* cppcheck-suppress unreadVariable */
		l2_iov[TCP_IOV_PAYLOAD].iov_len = l4len;

		l2len = l4len + sizeof(*iph) + sizeof(struct ethhdr);
	} else {
		struct tcp_flags_t *payload;
		struct ipv6hdr *ip6h;
		uint32_t seq;

		l2_iov[TCP_IOV_IP].iov_base = (char *)l2_iov[TCP_IOV_ETH].iov_base +
					      l2_iov[TCP_IOV_ETH].iov_len;
		l2_iov[TCP_IOV_IP].iov_len = sizeof(struct ipv6hdr);
		l2_iov[TCP_IOV_PAYLOAD].iov_base = (char *)l2_iov[TCP_IOV_IP].iov_base +
						   l2_iov[TCP_IOV_IP].iov_len;

		eh->h_proto = htons(ETH_P_IPV6);

		ip6h = l2_iov[TCP_IOV_IP].iov_base;
		*ip6h = (struct ipv6hdr)L2_BUF_IP6_INIT(IPPROTO_TCP);

		payload = l2_iov[TCP_IOV_PAYLOAD].iov_base;
		payload->th = (struct tcphdr){
			.doff = offsetof(struct tcp_flags_t, opts) / 4,
			.ack = 1
		};

		seq = conn->seq_to_tap;
		ret = tcp_prepare_flags(c, conn, flags, &payload->th,
					payload->opts, &optlen);
		if (ret <= 0) {
			vu_queue_rewind(vq, 1);
			return ret;
		}

		l4len = tcp_l2_buf_fill_headers(conn, l2_iov, optlen, NULL,
						seq, true);
		/* keep the following assignment for clarity */
		/* cppcheck-suppress unreadVariable */
		l2_iov[TCP_IOV_PAYLOAD].iov_len = l4len;

		l2len = l4len + sizeof(*ip6h) + sizeof(struct ethhdr);
	}

	l2len += vdev->hdrlen;
	ASSERT(l2len <= elem[0].in_sg[0].iov_len);

	elem[0].in_sg[0].iov_len = l2len;
	tcp_vu_pcap(c, tapside, &elem[0].in_sg[0], 1, l4len);

	vu_queue_fill(vq, &elem[0], l2len, 0);
	nb_ack = 1;

	if (flags & DUP_ACK) {
		struct iovec in_sg_dup;

		elem[1].out_num = 0;
		elem[1].out_sg = NULL;
		elem[1].in_num = 1;
		elem[1].in_sg = &in_sg_dup;
		ret = vu_queue_pop(vdev, vq, &elem[1]);
		if (ret == 0) {
			if (elem[1].in_num < 1 ||
			    elem[1].in_sg[0].iov_len < l2len) {
				vu_queue_rewind(vq, 1);
			} else {
				memcpy(elem[1].in_sg[0].iov_base, vh, l2len);
				nb_ack++;

				tcp_vu_pcap(c, tapside, &elem[1].in_sg[0], 1,
					    l4len);

				vu_queue_fill(vq, &elem[1], l2len, 1);
			}
		}
	}

	vu_queue_flush(vq, nb_ack);
	vu_queue_notify(vdev, vq);

	return 0;
}
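/*
 * Data path towards the guest for an established connection:
 *
 *   tcp_vu_data_from_sock()
 *     tcp_vu_sock_recv()	pop guest buffers from the RX virtqueue and
 *				recvmsg(MSG_PEEK) socket data straight into
 *				them, leaving room for the headers
 *     tcp_vu_prepare()		write the Ethernet/IP/TCP headers of each
 *				segment in place, in its first buffer
 *     tcp_vu_pcap()		optionally capture the frame to the pcap file,
 *				filling in the TCP checksum
 *     vu_send_frame()		hand the filled descriptors back to the guest
 */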
/**
 * tcp_vu_sock_recv() - Receive data stream from socket into vhost-user buffers
 * @c:		Execution context
 * @conn:	Connection pointer
 * @v4:		Set for IPv4 connections
 * @fillsize:	Number of bytes we can receive
 * @data_len:	Size of received data (output)
 *
 * Return: number of iov entries used to store the data, negative on error
 */
static ssize_t tcp_vu_sock_recv(struct ctx *c, struct tcp_tap_conn *conn,
				bool v4, size_t fillsize, ssize_t *data_len)
{
	struct vu_dev *vdev = c->vdev;
	struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
	static struct iovec in_sg[VIRTQUEUE_MAX_SIZE];
	struct msghdr mh_sock = { 0 };
	uint16_t mss = MSS_GET(conn);
	static int in_sg_count;
	int s = conn->sock;
	size_t l2_hdrlen;
	int segment_size;
	int iov_cnt;
	ssize_t ret;

	l2_hdrlen = tcp_vu_l2_hdrlen(vdev, !v4);

	iov_cnt = 0;
	in_sg_count = 0;
	segment_size = 0;
	*data_len = 0;
	while (fillsize > 0 && iov_cnt < VIRTQUEUE_MAX_SIZE - 1 &&
	       in_sg_count < ARRAY_SIZE(in_sg)) {
		elem[iov_cnt].out_num = 0;
		elem[iov_cnt].out_sg = NULL;
		elem[iov_cnt].in_num = ARRAY_SIZE(in_sg) - in_sg_count;
		elem[iov_cnt].in_sg = &in_sg[in_sg_count];
		ret = vu_queue_pop(vdev, vq, &elem[iov_cnt]);
		if (ret < 0)
			break;

		if (elem[iov_cnt].in_num < 1) {
			warn("virtio-net receive queue contains no in buffers");
			break;
		}

		in_sg_count += elem[iov_cnt].in_num;

		ASSERT(elem[iov_cnt].in_num == 1);
		ASSERT(elem[iov_cnt].in_sg[0].iov_len >= l2_hdrlen);

		if (segment_size == 0) {
			/* new segment: reserve room for the headers */
			iov_vu[iov_cnt + 1].iov_base =
				(char *)elem[iov_cnt].in_sg[0].iov_base + l2_hdrlen;
			iov_vu[iov_cnt + 1].iov_len =
				elem[iov_cnt].in_sg[0].iov_len - l2_hdrlen;
		} else {
			iov_vu[iov_cnt + 1].iov_base = elem[iov_cnt].in_sg[0].iov_base;
			iov_vu[iov_cnt + 1].iov_len = elem[iov_cnt].in_sg[0].iov_len;
		}

		if (iov_vu[iov_cnt + 1].iov_len > fillsize)
			iov_vu[iov_cnt + 1].iov_len = fillsize;

		segment_size += iov_vu[iov_cnt + 1].iov_len;
		if (vdev->hdrlen != sizeof(struct virtio_net_hdr_mrg_rxbuf)) {
			segment_size = 0;
		} else if (segment_size >= mss) {
			iov_vu[iov_cnt + 1].iov_len -= segment_size - mss;
			segment_size = 0;
		}
		fillsize -= iov_vu[iov_cnt + 1].iov_len;

		iov_cnt++;
	}
	if (iov_cnt == 0)
		return 0;

	mh_sock.msg_iov = iov_vu;
	mh_sock.msg_iovlen = iov_cnt + 1;

	do
		ret = recvmsg(s, &mh_sock, MSG_PEEK);
	while (ret < 0 && errno == EINTR);

	if (ret < 0) {
		vu_queue_rewind(vq, iov_cnt);
		if (errno != EAGAIN && errno != EWOULDBLOCK) {
			ret = -errno;
			tcp_rst(c, conn);
		}
		return ret;
	}

	if (!ret) {
		vu_queue_rewind(vq, iov_cnt);

		if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) == SOCK_FIN_RCVD) {
			int retf = tcp_vu_send_flag(c, conn, FIN | ACK);

			if (retf) {
				tcp_rst(c, conn);
				return retf;
			}

			conn_event(c, conn, TAP_FIN_SENT);
		}
		return 0;
	}

	*data_len = ret;
	return iov_cnt;
}
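/*
 * Layout of the iov_vu[] array filled by tcp_vu_sock_recv():
 *
 *   iov_vu[0]		discard buffer: bytes already sent to the guest but
 *			not yet acknowledged are skipped on MSG_PEEK (unless
 *			peek offsets are supported by the kernel)
 *   iov_vu[i + 1]	the single guest buffer popped into elem[i]; for the
 *			first buffer of each segment, iov_base is advanced by
 *			l2_hdrlen so the headers can later be written in front
 *			of the payload
 */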
/**
 * tcp_vu_prepare() - Prepare the packet header
 * @c:		Execution context
 * @conn:	Connection pointer
 * @first:	Pointer to the first iovec of the segment
 * @data_len:	Packet data length
 * @check:	Checksum, if already known
 *
 * Return: Level-4 length
 */
static size_t tcp_vu_prepare(const struct ctx *c,
			     struct tcp_tap_conn *conn, struct iovec *first,
			     size_t data_len, const uint16_t **check)
{
	const struct flowside *toside = TAPFLOW(conn);
	const struct vu_dev *vdev = c->vdev;
	struct iovec l2_iov[TCP_NUM_IOVS];
	char *base = first->iov_base;
	struct ethhdr *eh;
	size_t l4len;

	/* we assume the first iovec provided by the guest is large enough
	 * to hold all the headers of the L2 frame
	 */

	l2_iov[TCP_IOV_TAP].iov_base = NULL;
	l2_iov[TCP_IOV_TAP].iov_len = 0;
	l2_iov[TCP_IOV_ETH].iov_base = base + vdev->hdrlen;
	l2_iov[TCP_IOV_ETH].iov_len = sizeof(struct ethhdr);

	eh = l2_iov[TCP_IOV_ETH].iov_base;

	memcpy(eh->h_dest, c->guest_mac, sizeof(eh->h_dest));
	memcpy(eh->h_source, c->our_tap_mac, sizeof(eh->h_source));

	/* initialize the IP and TCP headers */
	if (inany_v4(&toside->eaddr) && inany_v4(&toside->oaddr)) {
		struct tcp_payload_t *payload;
		struct iphdr *iph;

		ASSERT(first[0].iov_len >= vdev->hdrlen +
		       sizeof(struct ethhdr) + sizeof(struct iphdr) +
		       sizeof(struct tcphdr));

		l2_iov[TCP_IOV_IP].iov_base = (char *)l2_iov[TCP_IOV_ETH].iov_base +
					      l2_iov[TCP_IOV_ETH].iov_len;
		l2_iov[TCP_IOV_IP].iov_len = sizeof(struct iphdr);
		l2_iov[TCP_IOV_PAYLOAD].iov_base = (char *)l2_iov[TCP_IOV_IP].iov_base +
						   l2_iov[TCP_IOV_IP].iov_len;

		eh->h_proto = htons(ETH_P_IP);

		iph = l2_iov[TCP_IOV_IP].iov_base;
		*iph = (struct iphdr)L2_BUF_IP4_INIT(IPPROTO_TCP);

		payload = l2_iov[TCP_IOV_PAYLOAD].iov_base;
		payload->th = (struct tcphdr){
			.doff = offsetof(struct tcp_payload_t, data) / 4,
			.ack = 1
		};

		l4len = tcp_l2_buf_fill_headers(conn, l2_iov, data_len, *check,
						conn->seq_to_tap, true);
		/* keep the following assignment for clarity */
		/* cppcheck-suppress unreadVariable */
		l2_iov[TCP_IOV_PAYLOAD].iov_len = l4len;

		*check = &iph->check;
	} else {
		struct tcp_payload_t *payload;
		struct ipv6hdr *ip6h;

		ASSERT(first[0].iov_len >= vdev->hdrlen +
		       sizeof(struct ethhdr) + sizeof(struct ipv6hdr) +
		       sizeof(struct tcphdr));

		l2_iov[TCP_IOV_IP].iov_base = (char *)l2_iov[TCP_IOV_ETH].iov_base +
					      l2_iov[TCP_IOV_ETH].iov_len;
		l2_iov[TCP_IOV_IP].iov_len = sizeof(struct ipv6hdr);
		l2_iov[TCP_IOV_PAYLOAD].iov_base = (char *)l2_iov[TCP_IOV_IP].iov_base +
						   l2_iov[TCP_IOV_IP].iov_len;

		eh->h_proto = htons(ETH_P_IPV6);

		ip6h = l2_iov[TCP_IOV_IP].iov_base;
		*ip6h = (struct ipv6hdr)L2_BUF_IP6_INIT(IPPROTO_TCP);

		payload = l2_iov[TCP_IOV_PAYLOAD].iov_base;
		payload->th = (struct tcphdr){
			.doff = offsetof(struct tcp_payload_t, data) / 4,
			.ack = 1
		};

		l4len = tcp_l2_buf_fill_headers(conn, l2_iov, data_len, NULL,
						conn->seq_to_tap, true);
		/* keep the following assignment for clarity */
		/* cppcheck-suppress unreadVariable */
		l2_iov[TCP_IOV_PAYLOAD].iov_len = l4len;
	}

	return l4len;
}
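/*
 * For IPv4, tcp_vu_prepare() stores in *check a pointer to the IPv4 header
 * checksum it just filled in: the caller passes it back for the following
 * segments so the value can be reused instead of recomputed, and resets it
 * to NULL before the last segment, which can be shorter than the others.
 */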
/**
 * tcp_vu_data_from_sock() - Handle new data from socket, queue to vhost-user,
 *			     in window
 * @c:		Execution context
 * @conn:	Connection pointer
 *
 * Return: Negative on connection reset, 0 otherwise
 */
int tcp_vu_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
{
	uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap;
	struct vu_dev *vdev = c->vdev;
	struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
	const struct flowside *tapside = TAPFLOW(conn);
	uint16_t mss = MSS_GET(conn);
	size_t l2_hdrlen, fillsize;
	int i, iov_cnt, iov_used;
	int v4 = CONN_V4(conn);
	uint32_t already_sent = 0;
	const uint16_t *check;
	struct iovec *first;
	int segment_size;
	int num_buffers;
	ssize_t len;

	if (!vu_queue_enabled(vq) || !vu_queue_started(vq)) {
		flow_err(conn, "Got packet, but RX virtqueue not usable yet");
		return 0;
	}

	already_sent = conn->seq_to_tap - conn->seq_ack_from_tap;

	if (SEQ_LT(already_sent, 0)) {
		/* RFC 761, section 2.1. */
		flow_trace(conn, "ACK sequence gap: ACK for %u, sent: %u",
			   conn->seq_ack_from_tap, conn->seq_to_tap);
		conn->seq_to_tap = conn->seq_ack_from_tap;
		already_sent = 0;
	}

	if (!wnd_scaled || already_sent >= wnd_scaled) {
		conn_flag(c, conn, STALLED);
		conn_flag(c, conn, ACK_FROM_TAP_DUE);
		return 0;
	}

	/* Set up buffer descriptors we'll fill completely and partially. */
	fillsize = wnd_scaled;

	if (peek_offset_cap)
		already_sent = 0;

	iov_vu[0].iov_base = tcp_buf_discard;
	iov_vu[0].iov_len = already_sent;
	fillsize -= already_sent;

	/* collect the buffers from vhost-user and fill them with the
	 * data from the socket
	 */
	iov_cnt = tcp_vu_sock_recv(c, conn, v4, fillsize, &len);
	if (iov_cnt <= 0)
		return iov_cnt;

	len -= already_sent;
	if (len <= 0) {
		conn_flag(c, conn, STALLED);
		vu_queue_rewind(vq, iov_cnt);
		return 0;
	}

	conn_flag(c, conn, ~STALLED);

	/* Likely, some new data was acked too. */
	tcp_update_seqack_wnd(c, conn, 0, NULL);

	/* initialize headers */
	l2_hdrlen = tcp_vu_l2_hdrlen(vdev, !v4);
	iov_used = 0;
	num_buffers = 0;
	check = NULL;
	segment_size = 0;

	/* iov_vu is an array of buffers, and a single buffer can be smaller
	 * than the segment size we want to use.  With mergeable receive
	 * buffers, several virtio buffers can be combined into one packet:
	 * the headers only need to be written into the first buffer of each
	 * segment, with num_buffers set to the number of buffers it spans.
	 */
	for (i = 0; i < iov_cnt && len; i++) {
		if (segment_size == 0)
			first = &iov_vu[i + 1];

		if (iov_vu[i + 1].iov_len > (size_t)len)
			iov_vu[i + 1].iov_len = len;

		len -= iov_vu[i + 1].iov_len;
		iov_used++;

		segment_size += iov_vu[i + 1].iov_len;
		num_buffers++;

		if (segment_size >= mss || len == 0 ||
		    i + 1 == iov_cnt ||
		    vdev->hdrlen != sizeof(struct virtio_net_hdr_mrg_rxbuf)) {
			struct virtio_net_hdr_mrg_rxbuf *vh;
			size_t l4len;

			if (i + 1 == iov_cnt)
				check = NULL;

			/* restore first iovec base: point to vnet header */
			first->iov_base = (char *)first->iov_base - l2_hdrlen;
			first->iov_len = first->iov_len + l2_hdrlen;

			vh = first->iov_base;

			vh->hdr = vu_header;
			if (vdev->hdrlen == sizeof(struct virtio_net_hdr_mrg_rxbuf))
				vh->num_buffers = htole16(num_buffers);

			l4len = tcp_vu_prepare(c, conn, first, segment_size,
					       &check);

			tcp_vu_pcap(c, tapside, first, num_buffers, l4len);

			conn->seq_to_tap += segment_size;

			segment_size = 0;
			num_buffers = 0;
		}
	}

	/* release unused buffers */
	vu_queue_rewind(vq, iov_cnt - iov_used);

	/* send packets */
	vu_send_frame(vdev, vq, elem, &iov_vu[1], iov_used);

	conn_flag(c, conn, ACK_FROM_TAP_DUE);

	return 0;
}