From mboxrd@z Thu Jan 1 00:00:00 1970 Received: from mail.ozlabs.org (mail.ozlabs.org [IPv6:2404:9400:2221:ea00::3]) by passt.top (Postfix) with ESMTPS id 324A35A0332 for ; Fri, 05 Jul 2024 04:07:44 +0200 (CEST) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gibson.dropbear.id.au; s=202312; t=1720145251; bh=Otv9hn4GeLvLoY2ql/6+qUGH0frWoUvVG4XD6P3MoUQ=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=Fz2vYFOG0lxlKyEzTEwuw37fuXnoTSFp/LbwQfj7LgWCiyUUNuOj3H7G/QPX//Iwm 6WAggfejZIIFEWLziaRIPaGcWyV8+mGxpGD/FihxjmlH3DtF/xB0nfIRXXYioTtTtX Vjq1SlnlFsJ0PYdTRQXRm256EyPhQNqp7OPNA0W1pMy3yUjWgzI+tb0kUkOCzu61w2 NNQ54L3KWjlG6ECLWpb3qiWqjn8r42fkztolx+FW7LTQLVvwWvtcu25l7HGGuk+yf5 Pwji7tALHdYB9ha0SKm4Rq1FlX8LsgemQLB4Bey/WZcAq6Vr5NHr+mTYEEamFzn6Aa /MkNiuafrqo9w== Received: by gandalf.ozlabs.org (Postfix, from userid 1007) id 4WFcNM2Qgyz4xQN; Fri, 5 Jul 2024 12:07:31 +1000 (AEST) From: David Gibson To: Stefano Brivio , passt-dev@passt.top Subject: [PATCH v7 24/27] udp: Direct datagrams from host to guest via flow table Date: Fri, 5 Jul 2024 12:07:21 +1000 Message-ID: <20240705020724.3447719-25-david@gibson.dropbear.id.au> X-Mailer: git-send-email 2.45.2 In-Reply-To: <20240705020724.3447719-1-david@gibson.dropbear.id.au> References: <20240705020724.3447719-1-david@gibson.dropbear.id.au> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit Message-ID-Hash: ALGJOKJXF624LHHVYLNWPW5JIHRENMHY X-Message-ID-Hash: ALGJOKJXF624LHHVYLNWPW5JIHRENMHY X-MailFrom: dgibson@gandalf.ozlabs.org X-Mailman-Rule-Misses: dmarc-mitigation; no-senders; approved; emergency; loop; banned-address; member-moderation; nonmember-moderation; administrivia; implicit-dest; max-recipients; max-size; news-moderation; no-subject; digests; suspicious-header CC: jmaloy@redhat.com, David Gibson X-Mailman-Version: 3.3.8 Precedence: list List-Id: Development discussion and patches for passt Archived-At: Archived-At: List-Archive: List-Archive: List-Help: List-Owner: List-Post: List-Subscribe: List-Unsubscribe: 
This replaces the last piece of existing UDP port tracking with the common flow table. Specifically use the flow table to direct datagrams from host sockets to the guest tap interface. Since this now requires a flow for every datagram, we add some logging if we encounter any datagrams for which we can't find or create a flow. Signed-off-by: David Gibson --- flow_table.h | 14 ++++ udp.c | 188 +++++++++++++++------------------------------------ 2 files changed, 67 insertions(+), 135 deletions(-) diff --git a/flow_table.h b/flow_table.h index 1faac4a7..da9483b3 100644 --- a/flow_table.h +++ b/flow_table.h @@ -106,6 +106,20 @@ static inline uint8_t pif_at_sidx(flow_sidx_t sidx) return flow->f.pif[sidx.side]; } +/** flowside_at_sidx - Retrieve a specific flowside + * @sidx: Flow & side index + * + * Return: Flowside for the flow & side given by @sidx, NULL if flow_at_sidx() finds no flow for @sidx + */ +static inline const struct flowside *flowside_at_sidx(flow_sidx_t sidx) +{ + const union flow *flow = flow_at_sidx(sidx); + + if (!flow) + return NULL; + return &flow->f.side[sidx.side]; +} + /** flow_sidx_t - Index of one side of a flow from common structure * @f: Common flow fields pointer * @side: Which side to refer to (0 or 1) diff --git a/udp.c b/udp.c index a26ffe0c..7d63faf6 100644 --- a/udp.c +++ b/udp.c @@ -60,26 +60,6 @@ * flow will come to the reply socket in preference to a listening socket. The * sample program contrib/udp-reuseaddr/reuseaddr-priority.c documents and tests * that assumption. 
- * - * Port tracking - * ============= - * - * For UDP, a reduced version of port-based connection tracking is implemented - * with two purposes: - * - binding ephemeral ports when they're used as source port by the guest, so - * that replies on those ports can be forwarded back to the guest, with a - * fixed timeout for this binding - * - packets received from the local host get their source changed to a local - * address (gateway address) so that they can be forwarded to the guest, and - * packets sent as replies by the guest need their destination address to - * be changed back to the address of the local host. This is dynamic to allow - * connections from the gateway as well, and uses the same fixed 180s timeout - * - * Sockets for bound ports are created at initialisation time, one set for IPv4 - * and one for IPv6. - * - * Packets are forwarded back and forth, by prepending and stripping UDP headers - * in the obvious way, with no port translation. */ #include @@ -498,7 +478,6 @@ static flow_sidx_t udp_flow_from_sock(const struct ctx *c, union epoll_ref ref, ASSERT(ref.type == EPOLL_TYPE_UDP); - /* FIXME: Match reply packets to their flow as well */ if (!ref.udp.orig) return FLOW_SIDX_NONE; @@ -558,160 +537,87 @@ static void udp_splice_send(const struct ctx *c, size_t start, size_t n, /** * udp_update_hdr4() - Update headers for one IPv4 datagram - * @c: Execution context * @ip4h: Pre-filled IPv4 header (except for tot_len and saddr) - * @s_in: Source socket address, filled in by recvmmsg() * @bp: Pointer to udp_payload_t to update - * @dstport: Destination port number + * @fside: Flowside with relevant addresses * @dlen: Length of UDP payload - * @now: Current timestamp * * Return: size of IPv4 payload (UDP header + data) */ -static size_t udp_update_hdr4(const struct ctx *c, - struct iphdr *ip4h, const struct sockaddr_in *s_in, - struct udp_payload_t *bp, - in_port_t dstport, size_t dlen, - const struct timespec *now) +static size_t udp_update_hdr4(struct 
iphdr *ip4h, struct udp_payload_t *bp, + const struct flowside *fside, size_t dlen) { - const struct in_addr dst = c->ip4.addr_seen; - in_port_t srcport = ntohs(s_in->sin_port); + const struct in_addr *src = inany_v4(&fside->faddr); + const struct in_addr *dst = inany_v4(&fside->eaddr); size_t l4len = dlen + sizeof(bp->uh); size_t l3len = l4len + sizeof(*ip4h); - struct in_addr src = s_in->sin_addr; - - if (!IN4_IS_ADDR_UNSPECIFIED(&c->ip4.dns_match) && - IN4_ARE_ADDR_EQUAL(&src, &c->ip4.dns_host) && srcport == 53 && - (udp_tap_map[V4][dstport].flags & PORT_DNS_FWD)) { - src = c->ip4.dns_match; - } else if (IN4_IS_ADDR_LOOPBACK(&src) || - IN4_ARE_ADDR_EQUAL(&src, &c->ip4.addr_seen)) { - udp_tap_map[V4][srcport].ts = now->tv_sec; - udp_tap_map[V4][srcport].flags |= PORT_LOCAL; - if (IN4_IS_ADDR_LOOPBACK(&src)) - udp_tap_map[V4][srcport].flags |= PORT_LOOPBACK; - else - udp_tap_map[V4][srcport].flags &= ~PORT_LOOPBACK; - - bitmap_set(udp_act[V4][UDP_ACT_TAP], srcport); - - src = c->ip4.gw; - } + ASSERT(src && dst); ip4h->tot_len = htons(l3len); - ip4h->daddr = dst.s_addr; - ip4h->saddr = src.s_addr; - ip4h->check = csum_ip4_header(l3len, IPPROTO_UDP, src, dst); + ip4h->daddr = dst->s_addr; + ip4h->saddr = src->s_addr; + ip4h->check = csum_ip4_header(l3len, IPPROTO_UDP, *src, *dst); - bp->uh.source = s_in->sin_port; - bp->uh.dest = htons(dstport); + bp->uh.source = htons(fside->fport); + bp->uh.dest = htons(fside->eport); bp->uh.len = htons(l4len); - csum_udp4(&bp->uh, src, dst, bp->data, dlen); + csum_udp4(&bp->uh, *src, *dst, bp->data, dlen); return l4len; } /** * udp_update_hdr6() - Update headers for one IPv6 datagram - * @c: Execution context * @ip6h: Pre-filled IPv6 header (except for payload_len and addresses) - * @s_in: Source socket address, filled in by recvmmsg() * @bp: Pointer to udp_payload_t to update - * @dstport: Destination port number + * @fside: Flowside with relevant addresses * @dlen: Length of UDP payload - * @now: Current timestamp * * Return: 
size of IPv6 payload (UDP header + data) */ -static size_t udp_update_hdr6(const struct ctx *c, - struct ipv6hdr *ip6h, struct sockaddr_in6 *s_in6, - struct udp_payload_t *bp, - in_port_t dstport, size_t dlen, - const struct timespec *now) +static size_t udp_update_hdr6(struct ipv6hdr *ip6h, struct udp_payload_t *bp, + const struct flowside *fside, size_t dlen) { - const struct in6_addr *src = &s_in6->sin6_addr; - const struct in6_addr *dst = &c->ip6.addr_seen; - in_port_t srcport = ntohs(s_in6->sin6_port); uint16_t l4len = dlen + sizeof(bp->uh); - if (IN6_IS_ADDR_LINKLOCAL(src)) { - dst = &c->ip6.addr_ll_seen; - } else if (!IN6_IS_ADDR_UNSPECIFIED(&c->ip6.dns_match) && - IN6_ARE_ADDR_EQUAL(src, &c->ip6.dns_host) && - srcport == 53 && - (udp_tap_map[V4][dstport].flags & PORT_DNS_FWD)) { - src = &c->ip6.dns_match; - } else if (IN6_IS_ADDR_LOOPBACK(src) || - IN6_ARE_ADDR_EQUAL(src, &c->ip6.addr_seen) || - IN6_ARE_ADDR_EQUAL(src, &c->ip6.addr)) { - udp_tap_map[V6][srcport].ts = now->tv_sec; - udp_tap_map[V6][srcport].flags |= PORT_LOCAL; - - if (IN6_IS_ADDR_LOOPBACK(src)) - udp_tap_map[V6][srcport].flags |= PORT_LOOPBACK; - else - udp_tap_map[V6][srcport].flags &= ~PORT_LOOPBACK; - - if (IN6_ARE_ADDR_EQUAL(src, &c->ip6.addr)) - udp_tap_map[V6][srcport].flags |= PORT_GUA; - else - udp_tap_map[V6][srcport].flags &= ~PORT_GUA; - - bitmap_set(udp_act[V6][UDP_ACT_TAP], srcport); - - dst = &c->ip6.addr_ll_seen; - - if (IN6_IS_ADDR_LINKLOCAL(&c->ip6.gw)) - src = &c->ip6.gw; - else - src = &c->ip6.addr_ll; - - } - ip6h->payload_len = htons(l4len); - ip6h->daddr = *dst; - ip6h->saddr = *src; + ip6h->daddr = fside->eaddr.a6; + ip6h->saddr = fside->faddr.a6; ip6h->version = 6; ip6h->nexthdr = IPPROTO_UDP; ip6h->hop_limit = 255; - bp->uh.source = s_in6->sin6_port; - bp->uh.dest = htons(dstport); + bp->uh.source = htons(fside->fport); + bp->uh.dest = htons(fside->eport); bp->uh.len = ip6h->payload_len; - csum_udp6(&bp->uh, src, dst, bp->data, dlen); + csum_udp6(&bp->uh, 
&fside->faddr.a6, &fside->eaddr.a6, bp->data, dlen); return l4len; } /** * udp_tap_prepare() - Convert one datagram into a tap frame - * @c: Execution context * @mmh: Receiving mmsghdr array * @idx: Index of the datagram to prepare - * @dstport: Destination port - * @v6: Prepare for IPv6? - * @now: Current timestamp + * @fside: flowside for destination side */ -static void udp_tap_prepare(const struct ctx *c, const struct mmsghdr *mmh, - unsigned idx, in_port_t dstport, bool v6, - const struct timespec *now) +static void udp_tap_prepare(const struct mmsghdr *mmh, unsigned idx, + const struct flowside *fside) { struct iovec (*tap_iov)[UDP_NUM_IOVS] = &udp_l2_iov[idx]; struct udp_payload_t *bp = &udp_payload[idx]; struct udp_meta_t *bm = &udp_meta[idx]; size_t l4len; - if (v6) { - l4len = udp_update_hdr6(c, &bm->ip6h, &bm->s_in.sa6, bp, - dstport, mmh[idx].msg_len, now); + if (!inany_v4(&fside->eaddr) || !inany_v4(&fside->faddr)) { + l4len = udp_update_hdr6(&bm->ip6h, bp, fside, mmh[idx].msg_len); tap_hdr_update(&bm->taph, l4len + sizeof(bm->ip6h) + sizeof(udp6_eth_hdr)); (*tap_iov)[UDP_IOV_ETH] = IOV_OF_LVALUE(udp6_eth_hdr); (*tap_iov)[UDP_IOV_IP] = IOV_OF_LVALUE(bm->ip6h); } else { - l4len = udp_update_hdr4(c, &bm->ip4h, &bm->s_in.sa4, bp, - dstport, mmh[idx].msg_len, now); + l4len = udp_update_hdr4(&bm->ip4h, bp, fside, mmh[idx].msg_len); tap_hdr_update(&bm->taph, l4len + sizeof(bm->ip4h) + sizeof(udp4_eth_hdr)); (*tap_iov)[UDP_IOV_ETH] = IOV_OF_LVALUE(udp4_eth_hdr); @@ -766,17 +672,11 @@ void udp_buf_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t eve const struct timespec *now) { struct mmsghdr *mmh_recv = ref.udp.v6 ? 
udp6_mh_recv : udp4_mh_recv; - in_port_t dstport = ref.udp.port; int n, i; if ((n = udp_sock_recv(c, ref.fd, events, mmh_recv)) <= 0) return; - if (ref.udp.pif == PIF_SPLICE) - dstport += c->udp.fwd_out.f.delta[dstport]; - else if (ref.udp.pif == PIF_HOST) - dstport += c->udp.fwd_in.f.delta[dstport]; - /* We divide datagrams into batches based on how we need to send them, * determined by udp_meta[i].tosidx. To avoid either two passes through * the array, or recalculating tosidx for a single entry, we have to @@ -791,9 +691,9 @@ void udp_buf_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t eve do { if (pif_is_socket(batchpif)) udp_splice_prepare(mmh_recv, i); - else - udp_tap_prepare(c, mmh_recv, i, dstport, - ref.udp.v6, now); + else if (batchpif == PIF_TAP) + udp_tap_prepare(mmh_recv, i, + flowside_at_sidx(batchsidx)); if (++i >= n) break; @@ -803,12 +703,24 @@ void udp_buf_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t eve now); } while (flow_sidx_eq(udp_meta[i].tosidx, batchsidx)); - if (pif_is_socket(batchpif)) + if (pif_is_socket(batchpif)) { udp_splice_send(c, batchstart, i - batchstart, batchsidx); - else + } else if (batchpif == PIF_TAP) { tap_send_frames(c, &udp_l2_iov[batchstart][0], UDP_NUM_IOVS, i - batchstart); + } else if (flow_sidx_valid(batchsidx)) { + flow_sidx_t fromsidx = flow_sidx_opposite(batchsidx); + struct udp_flow *uflow = udp_at_sidx(batchsidx); + + flow_err(uflow, + "No support for forwarding UDP from %s to %s", + pif_name(pif_at_sidx(fromsidx)), + pif_name(batchpif)); + } else { + debug("Discarding %d datagrams without flow", + i - batchstart); + } } } @@ -845,14 +757,20 @@ void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref, for (i = 0; i < n; i++) { if (pif_is_socket(topif)) udp_splice_prepare(mmh_recv, i); - else - udp_tap_prepare(c, mmh_recv, i, toside->eport, v6, now); + else if (topif == PIF_TAP) + udp_tap_prepare(mmh_recv, i, toside); } - if (pif_is_socket(topif)) + if 
(pif_is_socket(topif)) { udp_splice_send(c, 0, n, tosidx); - else + } else if (topif == PIF_TAP) { tap_send_frames(c, &udp_l2_iov[0][0], UDP_NUM_IOVS, n); + } else { + uint8_t frompif = uflow->f.pif[ref.flowside.side]; + + flow_err(uflow, "No support for forwarding UDP from %s to %s", + pif_name(frompif), pif_name(topif)); + } } /** -- 2.45.2