// SPDX-License-Identifier: GPL-2.0-or-later /* PASST - Plug A Simple Socket Transport * for qemu/UNIX domain socket mode * * PASTA - Pack A Subtle Tap Abstraction * for network namespace/tap device mode * * udp.c - UDP L2-L4 translation routines * * Copyright (c) 2020-2021 Red Hat GmbH * Author: Stefano Brivio */ /** * DOC: Theory of Operation * * Flow Table * ========== * * UDP does not have connections, but to reliably forward reply packets back to * the original requester, we must keep track of pseudo-connections. We do this * via the generic flow table. * * - Finding an existing flow * * When we receive a datagram we attempt to match it to an existing flow: one * with matching interface, addresses and ports (both forwarding and * endpoint). For socket interfaces, we treat the forwarding address as the * bound address of the receiving socket, which may be unspecified, rather * than the datagram's actual destination address (which is awkward to * determine for unbound sockets). * * - Creating a new flow * * If no matching flow exists, and the datagram comes either from the tap * interface, or from a socket with the 'orig' flag set, we create a new one. * The initiating side records the interface, endpoint and forwarding * addresses and ports of this first datagram. Again, we treat the forwarding * address for sockets as the socket's bound address, regardless of the * datagram's actual destination. * * The target side interface and addresses are assigned by the general code in * fwd.c. When the target is a socket interface, the target forwarding * address may be left unspecified - in this case, the kernel will determine * the source address when we send the datagram. * * - Flow expiry * * Every time a datagram is received that matches a flow (or creates a new * one), we update the flow's timestamp to the current time. Periodically we * scan flows and those which are older than UDP_CONN_TIMEOUT (180s) are * removed. 
 * * - Locating or creating an outgoing socket * * When forwarding to a socket based interface, we need to find a suitable * socket to send via. Generally this should have a bound address and port * matching the forwarding address and port of the flowside for the outgoing * datagram. However, if we have an existing socket with a matching port and * an "any" address, we need to use that (in that case a socket with a * specific bound address would conflict). * * FIXME: currently we don't perform this lookup correctly. Instead we abuse * the fact that it's rare to have multiple flows with the same forwarding * address but different forwarding port. We store at most a single socket * per bound port number (and IP version). For datagrams forwarded from * PIF_TAP to PIF_HOST these are in udp_tap_map[]. * * For ports where port forwarding is configured (-u option) a socket is * opened during start up, bound to the specified forwarding address and * stored in udp_tap_map[]. For other ports we open a socket when we first * need to forward a datagram from that port, bound to the configured outbound * address (which may be "any"). * * Port Tracking * ============= * * For datagrams not handled by the flow table, a reduced version of port-based * connection tracking is implemented with two purposes: * - binding ephemeral ports when they're used as source port by the guest, so * that replies on those ports can be forwarded back to the guest, with a * fixed timeout for this binding * - packets received from the local host get their source changed to a local * address (gateway address) so that they can be forwarded to the guest, and * packets sent as replies by the guest need their destination address to * be changed back to the address of the local host. This is dynamic to allow * connections from the gateway as well, and uses the same fixed 180s timeout * * Sockets for bound ports are created at initialisation time, one set for IPv4 * and one for IPv6. 
 * * Packets are forwarded back and forth, by prepending and stripping UDP headers * in the obvious way, with no port translation. * * In PASTA mode, the L2-L4 translation is skipped for connections to ports * bound between namespaces using the loopback interface, messages are directly * transferred between L4 sockets instead. These are called spliced connections * for consistency with the TCP implementation, but the splice() syscall isn't * actually used as it wouldn't make sense for datagram-based connections: a * pair of recvmmsg() and sendmmsg() deals with this case. * * The connection tracking for PASTA mode is slightly complicated by the absence * of actual connections, see struct udp_splice_port, and these examples: * * - from init to namespace: * * - forward direction: 127.0.0.1:5000 -> 127.0.0.1:80 in init from socket s, * with epoll reference: index = 80, splice = 1, orig = 1, ns = 0 * - if udp_splice_ns[V4][5000].sock: * - send packet to udp_splice_ns[V4][5000].sock, with destination port * 80 * - otherwise: * - create new socket udp_splice_ns[V4][5000].sock * - bind in namespace to 127.0.0.1:5000 * - add to epoll with reference: index = 5000, splice = 1, orig = 0, * ns = 1 * - update udp_splice_init[V4][80].ts and udp_splice_ns[V4][5000].ts with * current time * * - reverse direction: 127.0.0.1:80 -> 127.0.0.1:5000 in namespace socket s, * having epoll reference: index = 5000, splice = 1, orig = 0, ns = 1 * - if udp_splice_init[V4][80].sock: * - send to udp_splice_init[V4][80].sock, with destination port 5000 * - update udp_splice_init[V4][80].ts and udp_splice_ns[V4][5000].ts with * current time * - otherwise, discard * * - from namespace to init: * * - forward direction: 127.0.0.1:2000 -> 127.0.0.1:22 in namespace from * socket s, with epoll reference: index = 22, splice = 1, orig = 1, ns = 1 * - if udp_splice_init[V4][2000].sock: * - send packet to udp_splice_init[V4][2000].sock, with destination * port 22 * - otherwise: * - create new socket 
udp_splice_init[V4][2000].sock * - bind in init to 127.0.0.1:2000 * - add to epoll with reference: index = 2000, splice = 1, orig = 0, * ns = 0 * - update udp_splice_ns[V4][22].ts and udp_splice_init[V4][2000].ts with * current time * * - reverse direction: 127.0.0.1:22 -> 127.0.0.1:2000 in init from socket s, * having epoll reference: index = 2000, splice = 1, orig = 0, ns = 0 * - if udp_splice_ns[V4][22].sock: * - send to udp_splice_ns[V4][22].sock, with destination port 2000 * - update udp_splice_ns[V4][22].ts and udp_splice_init[V4][2000].ts with * current time * - otherwise, discard */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "checksum.h" #include "util.h" #include "iov.h" #include "ip.h" #include "siphash.h" #include "inany.h" #include "passt.h" #include "tap.h" #include "pcap.h" #include "log.h" #include "flow_table.h" #define UDP_CONN_TIMEOUT 180 /* s, timeout for ephemeral or local bind */ #define UDP_MAX_FRAMES 32 /* max # of frames to receive at once */ /** * struct udp_tap_port - Port tracking based on tap-facing source port * @sock: Socket bound to source port used as index * @flags: Flags for recent activity type seen from/to port * @ts: Activity timestamp from tap, used for socket aging */ struct udp_tap_port { int sock; uint8_t flags; #define PORT_LOCAL BIT(0) /* Port was contacted from local address */ #define PORT_LOOPBACK BIT(1) /* Port was contacted from loopback address */ #define PORT_GUA BIT(2) /* Port was contacted from global unicast */ #define PORT_DNS_FWD BIT(3) /* Port used as source for DNS remapped query */ time_t ts; }; /** * struct udp_splice_port - Bound socket for spliced communication * @sock: Socket bound to index port * @ts: Activity timestamp */ struct udp_splice_port { int sock; time_t ts; }; /* Port tracking, arrays indexed by packet source port (host order) */ static 
struct udp_tap_port udp_tap_map [IP_VERSIONS][NUM_PORTS]; /* "Spliced" sockets indexed by bound port (host order) */ static struct udp_splice_port udp_splice_ns [IP_VERSIONS][NUM_PORTS]; static struct udp_splice_port udp_splice_init[IP_VERSIONS][NUM_PORTS]; enum udp_act_type { UDP_ACT_TAP, UDP_ACT_SPLICE_NS, UDP_ACT_SPLICE_INIT, UDP_ACT_TYPE_MAX, }; /* Activity-based aging for bindings */ static uint8_t udp_act[IP_VERSIONS][UDP_ACT_TYPE_MAX][DIV_ROUND_UP(NUM_PORTS, 8)]; /* Static buffers */ /** * struct udp_payload_t - UDP header and data for inbound messages * @uh: UDP header * @data: UDP data */ static struct udp_payload_t { struct udphdr uh; char data[USHRT_MAX - sizeof(struct udphdr)]; #ifdef __AVX2__ } __attribute__ ((packed, aligned(32))) #else } __attribute__ ((packed, aligned(__alignof__(unsigned int)))) #endif udp_payload[UDP_MAX_FRAMES]; /* Ethernet header for IPv4 frames */ static struct ethhdr udp4_eth_hdr; /* Ethernet header for IPv6 frames */ static struct ethhdr udp6_eth_hdr; /** * struct udp_meta_t - Pre-cooked headers and metadata for UDP packets * @ip6h: Pre-filled IPv6 header (except for payload_len and addresses) * @ip4h: Pre-filled IPv4 header (except for tot_len and saddr) * @taph: Tap backend specific header * @s_in: Source socket address, filled in by recvmmsg() * @splicesrc: Source port for splicing, or -1 if not spliceable * @tosidx: sidx for the destination side of this datagram's flow */ static struct udp_meta_t { struct ipv6hdr ip6h; struct iphdr ip4h; struct tap_hdr taph; union sockaddr_inany s_in; int splicesrc; flow_sidx_t tosidx; } #ifdef __AVX2__ __attribute__ ((aligned(32))) #endif udp_meta[UDP_MAX_FRAMES]; /** * enum udp_iov_idx - Indices for the buffers making up a single UDP frame * @UDP_IOV_TAP tap specific header * @UDP_IOV_ETH Ethernet header * @UDP_IOV_IP IP (v4/v6) header * @UDP_IOV_PAYLOAD IP payload (UDP header + data) * @UDP_NUM_IOVS the number of entries in the iovec array */ enum udp_iov_idx { UDP_IOV_TAP = 0, 
UDP_IOV_ETH = 1, UDP_IOV_IP = 2, UDP_IOV_PAYLOAD = 3, UDP_NUM_IOVS }; /* recvmmsg()/sendmmsg() data for tap */ static struct iovec udp_l2_iov_sock [UDP_MAX_FRAMES]; static struct iovec udp4_l2_iov_tap [UDP_MAX_FRAMES][UDP_NUM_IOVS]; static struct iovec udp6_l2_iov_tap [UDP_MAX_FRAMES][UDP_NUM_IOVS]; static struct mmsghdr udp4_l2_mh_sock [UDP_MAX_FRAMES]; static struct mmsghdr udp6_l2_mh_sock [UDP_MAX_FRAMES]; /* recvmmsg()/sendmmsg() data for "spliced" connections */ static struct iovec udp_iov_splice [UDP_MAX_FRAMES]; static struct sockaddr_in udp4_localname = { .sin_family = AF_INET, .sin_addr = IN4ADDR_LOOPBACK_INIT, }; static struct sockaddr_in6 udp6_localname = { .sin6_family = AF_INET6, .sin6_addr = IN6ADDR_LOOPBACK_INIT, }; static struct mmsghdr udp4_mh_splice [UDP_MAX_FRAMES]; static struct mmsghdr udp6_mh_splice [UDP_MAX_FRAMES]; struct udp_flow *udp_at_sidx(flow_sidx_t sidx) { union flow *flow = flow_at_sidx(sidx); if (!flow) return NULL; ASSERT(flow->f.type == FLOW_UDP); return &flow->udp; } /** * udp_portmap_clear() - Clear UDP port map before configuration */ void udp_portmap_clear(void) { unsigned i; for (i = 0; i < NUM_PORTS; i++) { udp_tap_map[V4][i].sock = udp_tap_map[V6][i].sock = -1; udp_splice_ns[V4][i].sock = udp_splice_ns[V6][i].sock = -1; udp_splice_init[V4][i].sock = udp_splice_init[V6][i].sock = -1; } } /** * udp_invert_portmap() - Compute reverse port translations for return packets * @fwd: Port forwarding configuration to compute reverse map for */ static void udp_invert_portmap(struct udp_fwd_ports *fwd) { unsigned int i; static_assert(ARRAY_SIZE(fwd->f.delta) == ARRAY_SIZE(fwd->rdelta), "Forward and reverse delta arrays must have same size"); for (i = 0; i < ARRAY_SIZE(fwd->f.delta); i++) { in_port_t delta = fwd->f.delta[i]; if (delta) fwd->rdelta[i + delta] = NUM_PORTS - delta; } } /** * udp_update_l2_buf() - Update L2 buffers with Ethernet and IPv4 addresses * @eth_d: Ethernet destination address, NULL if unchanged * @eth_s: Ethernet 
source address, NULL if unchanged */ void udp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s) { eth_update_mac(&udp4_eth_hdr, eth_d, eth_s); eth_update_mac(&udp6_eth_hdr, eth_d, eth_s); } /** * udp_iov_init_one() - Initialise scatter-gather lists for one buffer * @c: Execution context * @i: Index of buffer to initialize */ static void udp_iov_init_one(const struct ctx *c, size_t i) { struct udp_payload_t *payload = &udp_payload[i]; struct iovec *siov = &udp_l2_iov_sock[i]; struct udp_meta_t *meta = &udp_meta[i]; *meta = (struct udp_meta_t) { .ip4h = L2_BUF_IP4_INIT(IPPROTO_UDP), .ip6h = L2_BUF_IP6_INIT(IPPROTO_UDP), }; *siov = IOV_OF_LVALUE(payload->data); udp4_eth_hdr.h_proto = htons_constant(ETH_P_IP); udp6_eth_hdr.h_proto = htons_constant(ETH_P_IPV6); if (c->ifi4) { struct msghdr *mh = &udp4_l2_mh_sock[i].msg_hdr; struct iovec *tiov = udp4_l2_iov_tap[i]; mh->msg_name = &meta->s_in; mh->msg_namelen = sizeof(struct sockaddr_in); mh->msg_iov = siov; mh->msg_iovlen = 1; tiov[UDP_IOV_TAP] = tap_hdr_iov(c, &meta->taph); tiov[UDP_IOV_ETH] = IOV_OF_LVALUE(udp4_eth_hdr); tiov[UDP_IOV_IP] = IOV_OF_LVALUE(meta->ip4h); tiov[UDP_IOV_PAYLOAD].iov_base = payload; } if (c->ifi6) { struct msghdr *mh = &udp6_l2_mh_sock[i].msg_hdr; struct iovec *tiov = udp6_l2_iov_tap[i]; mh->msg_name = &meta->s_in; mh->msg_namelen = sizeof(struct sockaddr_in6); mh->msg_iov = siov; mh->msg_iovlen = 1; tiov[UDP_IOV_TAP] = tap_hdr_iov(c, &meta->taph); tiov[UDP_IOV_ETH] = IOV_OF_LVALUE(udp6_eth_hdr); tiov[UDP_IOV_IP] = IOV_OF_LVALUE(meta->ip6h); tiov[UDP_IOV_PAYLOAD].iov_base = payload; } } /** * udp_iov_init() - Initialise scatter-gather L2 buffers * @c: Execution context */ static void udp_iov_init(const struct ctx *c) { size_t i; for (i = 0; i < UDP_MAX_FRAMES; i++) udp_iov_init_one(c, i); } /** * udp_splice_new() - Create and prepare socket for "spliced" binding * @c: Execution context * @v6: Set for IPv6 sockets * @src: Source port of original connection, host order * @ns: 
Does the splice originate in the ns or not * * Return: prepared socket, negative error code on failure * * #syscalls:pasta getsockname */ int udp_splice_new(const struct ctx *c, int v6, in_port_t src, bool ns) { struct epoll_event ev = { .events = EPOLLIN | EPOLLRDHUP | EPOLLHUP }; union epoll_ref ref = { .type = EPOLL_TYPE_UDP, .udp = { .splice = true, .v6 = v6, .port = src } }; struct udp_splice_port *sp; int act, s; if (ns) { ref.udp.pif = PIF_SPLICE; sp = &udp_splice_ns[v6 ? V6 : V4][src]; act = UDP_ACT_SPLICE_NS; } else { ref.udp.pif = PIF_HOST; sp = &udp_splice_init[v6 ? V6 : V4][src]; act = UDP_ACT_SPLICE_INIT; } s = socket(v6 ? AF_INET6 : AF_INET, SOCK_DGRAM | SOCK_NONBLOCK, IPPROTO_UDP); if (s > FD_REF_MAX) { close(s); return -EIO; } if (s < 0) return s; ref.fd = s; if (v6) { struct sockaddr_in6 addr6 = { .sin6_family = AF_INET6, .sin6_port = htons(src), .sin6_addr = IN6ADDR_LOOPBACK_INIT, }; if (bind(s, (struct sockaddr *)&addr6, sizeof(addr6))) goto fail; } else { struct sockaddr_in addr4 = { .sin_family = AF_INET, .sin_port = htons(src), .sin_addr = IN4ADDR_LOOPBACK_INIT, }; if (bind(s, (struct sockaddr *)&addr4, sizeof(addr4))) goto fail; } sp->sock = s; bitmap_set(udp_act[v6 ? 
V6 : V4][act], src); ev.data.u64 = ref.u64; epoll_ctl(c->epollfd, EPOLL_CTL_ADD, s, &ev); return s; fail: close(s); return -1; } /** * struct udp_splice_new_ns_arg - Arguments for udp_splice_new_ns() * @c: Execution context * @v6: Set for IPv6 * @src: Source port of originating datagram, host order * @dst: Destination port of originating datagram, host order * @s: Newly created socket or negative error code */ struct udp_splice_new_ns_arg { const struct ctx *c; int v6; in_port_t src; int s; }; /** * udp_splice_new_ns() - Enter namespace and call udp_splice_new() * @arg: See struct udp_splice_new_ns_arg * * Return: 0 */ static int udp_splice_new_ns(void *arg) { struct udp_splice_new_ns_arg *a; a = (struct udp_splice_new_ns_arg *)arg; ns_enter(a->c); a->s = udp_splice_new(a->c, a->v6, a->src, true); return 0; } /** * udp_mmh_splice_port() - Is source address of message suitable for splicing? * @uref: UDP epoll reference for incoming message's origin socket * @mmh: mmsghdr of incoming message * * Return: if source address of message in @mmh refers to localhost (127.0.0.1 * or ::1) its source port (host order), otherwise -1. */ static int udp_mmh_splice_port(union udp_epoll_ref uref, const struct mmsghdr *mmh) { const struct sockaddr_in6 *sa6 = mmh->msg_hdr.msg_name; const struct sockaddr_in *sa4 = mmh->msg_hdr.msg_name; if (!uref.splice) return -1; if (uref.v6 && IN6_IS_ADDR_LOOPBACK(&sa6->sin6_addr)) return ntohs(sa6->sin6_port); if (!uref.v6 && IN4_IS_ADDR_LOOPBACK(&sa4->sin_addr)) return ntohs(sa4->sin_port); return -1; } /** * udp_flow_from_sock() - Find or create UDP flow for datagrams from socket * @c: Execution context * @uref: UDP epoll reference of the originating socket * @meta: Metadata buffer for the datagram * * Return: sidx for the destination side of the flow for this packet, or * FLOW_SIDX_NONE if we couldn't find or create a flow. 
*/
flow_sidx_t udp_flow_from_sock(const struct ctx *c, union udp_epoll_ref uref,
			       struct udp_meta_t *meta)
{
	const struct flowside *ini;
	struct udp_flow *uflow;
	union flow *flow;
	flow_sidx_t sidx;

	/* Existing flow: hand back the side opposite to the one that matched */
	sidx = flow_lookup_sa(c, IPPROTO_UDP, uref.pif, &meta->s_in, uref.port);
	flow = flow_at_sidx(sidx);
	if (flow)
		return FLOW_SIDX(flow, !sidx.side);

	/* Only sockets flagged 'orig' may start a new flow */
	if (!uref.orig)
		return FLOW_SIDX_NONE;

	flow = flow_alloc();
	if (!flow) {
		char sastr[SOCKADDR_STRLEN];

		debug("Couldn't allocate flow for UDP datagram from %s %s",
		      pif_name(uref.pif),
		      sockaddr_ntop(&meta->s_in, sastr, sizeof(sastr)));
		return FLOW_SIDX_NONE;
	}

	ini = flow_initiate_sa(flow, uref.pif, &meta->s_in, uref.port);

	if (!inany_is_unicast(&ini->eaddr) || ini->eport == 0) {
		/* Invalid endpoint */
		flow_dbg(flow, "Invalid endpoint on UDP recv()");
	} else if (flow_target(c, flow, IPPROTO_UDP)) {
		uflow = FLOW_SET_TYPE(flow, FLOW_UDP, udp);
		flow_hash_insert(c, FLOW_SIDX(uflow, INISIDE));
		flow_hash_insert(c, FLOW_SIDX(uflow, TGTSIDE));
		FLOW_ACTIVATE(uflow);

		return FLOW_SIDX(uflow, TGTSIDE);
	}

	/* Either the endpoint was invalid or no target could be assigned:
	 * report it and give the flow entry back
	 */
	{
		char sstr[INANY_ADDRSTRLEN];

		flow_dbg(flow,
			 "Couldn't create UDP flow for %s [%s]:%hu -> ?:%hu",
			 pif_name(uref.pif),
			 inany_ntop(&ini->eaddr, sstr, sizeof(sstr)),
			 ini->eport, ini->fport);
	}
	flow_alloc_cancel(flow);
	return FLOW_SIDX_NONE;
}

/**
 * udp_splice_send() - Send datagrams from socket to socket
 * @c:		Execution context
 * @start:	Index of first datagram in udp[46]_l2_buf
 * @n:		Total number of datagrams in udp[46]_l2_buf pool
 * @dst:	Datagrams will be sent to this port (on destination side)
 * @uref:	UDP epoll reference for origin socket
 * @now:	Timestamp
 *
 * This consumes as many datagrams as are sendable via a single socket.  It
 * requires that udp_meta[@start].splicesrc is initialised, and will initialise
 * udp_meta[].splicesrc for each datagram it consumes *and one more* (if
 * present).
* * Return: Number of datagrams forwarded */ static unsigned udp_splice_send(const struct ctx *c, size_t start, size_t n, in_port_t dst, union udp_epoll_ref uref, const struct timespec *now) { in_port_t src = udp_meta[start].splicesrc; struct mmsghdr *mmh_recv, *mmh_send; unsigned int i = start; int s; ASSERT(udp_meta[start].splicesrc >= 0); if (uref.v6) { mmh_recv = udp6_l2_mh_sock; mmh_send = udp6_mh_splice; udp6_localname.sin6_port = htons(dst); } else { mmh_recv = udp4_l2_mh_sock; mmh_send = udp4_mh_splice; udp4_localname.sin_port = htons(dst); } do { mmh_send[i].msg_hdr.msg_iov->iov_len = mmh_recv[i].msg_len; if (++i >= n) break; udp_meta[i].splicesrc = udp_mmh_splice_port(uref, &mmh_recv[i]); udp_meta[i].tosidx = udp_flow_from_sock(c, uref, &udp_meta[i]); } while (udp_meta[i].splicesrc == src); if (uref.pif == PIF_SPLICE) { src += c->udp.fwd_in.rdelta[src]; s = udp_splice_init[uref.v6][src].sock; if (s < 0 && uref.orig) s = udp_splice_new(c, uref.v6, src, false); if (s < 0) goto out; udp_splice_ns[uref.v6][dst].ts = now->tv_sec; udp_splice_init[uref.v6][src].ts = now->tv_sec; } else { ASSERT(uref.pif == PIF_HOST); src += c->udp.fwd_out.rdelta[src]; s = udp_splice_ns[uref.v6][src].sock; if (s < 0 && uref.orig) { struct udp_splice_new_ns_arg arg = { c, uref.v6, src, -1, }; NS_CALL(udp_splice_new_ns, &arg); s = arg.s; } if (s < 0) goto out; udp_splice_init[uref.v6][dst].ts = now->tv_sec; udp_splice_ns[uref.v6][src].ts = now->tv_sec; } sendmmsg(s, mmh_send + start, i - start, MSG_NOSIGNAL); out: return i - start; } /** * udp_update_hdr4() - Update headers for one IPv4 datagram * @c: Execution context * @ip4h: Pre-filled IPv4 header (except for tot_len and saddr) * @s_in: Source socket address, filled in by recvmmsg() * @bp: Pointer to udp_payload_t to update * @dstport: Destination port number * @dlen: Length of UDP payload * @now: Current timestamp * * Return: size of IPv4 payload (UDP header + data) */ static size_t udp_update_hdr4(const struct ctx *c, struct 
iphdr *ip4h, const struct sockaddr_in *s_in, struct udp_payload_t *bp, in_port_t dstport, size_t dlen, const struct timespec *now) { const struct in_addr dst = c->ip4.addr_seen; in_port_t srcport = ntohs(s_in->sin_port); size_t l4len = dlen + sizeof(bp->uh); size_t l3len = l4len + sizeof(*ip4h); struct in_addr src = s_in->sin_addr; if (!IN4_IS_ADDR_UNSPECIFIED(&c->ip4.dns_match) && IN4_ARE_ADDR_EQUAL(&src, &c->ip4.dns_host) && srcport == 53 && (udp_tap_map[V4][dstport].flags & PORT_DNS_FWD)) { src = c->ip4.dns_match; } else if (IN4_IS_ADDR_LOOPBACK(&src) || IN4_ARE_ADDR_EQUAL(&src, &c->ip4.addr_seen)) { udp_tap_map[V4][srcport].ts = now->tv_sec; udp_tap_map[V4][srcport].flags |= PORT_LOCAL; if (IN4_IS_ADDR_LOOPBACK(&src)) udp_tap_map[V4][srcport].flags |= PORT_LOOPBACK; else udp_tap_map[V4][srcport].flags &= ~PORT_LOOPBACK; bitmap_set(udp_act[V4][UDP_ACT_TAP], srcport); src = c->ip4.gw; } ip4h->tot_len = htons(l3len); ip4h->daddr = dst.s_addr; ip4h->saddr = src.s_addr; ip4h->check = csum_ip4_header(l3len, IPPROTO_UDP, src, dst); bp->uh.source = s_in->sin_port; bp->uh.dest = htons(dstport); bp->uh.len = htons(l4len); csum_udp4(&bp->uh, src, dst, bp->data, dlen); return l4len; } /** * udp_update_hdr6() - Update headers for one IPv6 datagram * @c: Execution context * @ip6h: Pre-filled IPv6 header (except for payload_len and addresses) * @s_in: Source socket address, filled in by recvmmsg() * @bp: Pointer to udp_payload_t to update * @dstport: Destination port number * @dlen: Length of UDP payload * @now: Current timestamp * * Return: size of IPv6 payload (UDP header + data) */ static size_t udp_update_hdr6(const struct ctx *c, struct ipv6hdr *ip6h, struct sockaddr_in6 *s_in6, struct udp_payload_t *bp, in_port_t dstport, size_t dlen, const struct timespec *now) { const struct in6_addr *src = &s_in6->sin6_addr; const struct in6_addr *dst = &c->ip6.addr_seen; in_port_t srcport = ntohs(s_in6->sin6_port); uint16_t l4len = dlen + sizeof(bp->uh); if 
(IN6_IS_ADDR_LINKLOCAL(src)) { dst = &c->ip6.addr_ll_seen; } else if (!IN6_IS_ADDR_UNSPECIFIED(&c->ip6.dns_match) && IN6_ARE_ADDR_EQUAL(src, &c->ip6.dns_host) && srcport == 53 && (udp_tap_map[V4][dstport].flags & PORT_DNS_FWD)) { src = &c->ip6.dns_match; } else if (IN6_IS_ADDR_LOOPBACK(src) || IN6_ARE_ADDR_EQUAL(src, &c->ip6.addr_seen) || IN6_ARE_ADDR_EQUAL(src, &c->ip6.addr)) { udp_tap_map[V6][srcport].ts = now->tv_sec; udp_tap_map[V6][srcport].flags |= PORT_LOCAL; if (IN6_IS_ADDR_LOOPBACK(src)) udp_tap_map[V6][srcport].flags |= PORT_LOOPBACK; else udp_tap_map[V6][srcport].flags &= ~PORT_LOOPBACK; if (IN6_ARE_ADDR_EQUAL(src, &c->ip6.addr)) udp_tap_map[V6][srcport].flags |= PORT_GUA; else udp_tap_map[V6][srcport].flags &= ~PORT_GUA; bitmap_set(udp_act[V6][UDP_ACT_TAP], srcport); dst = &c->ip6.addr_ll_seen; if (IN6_IS_ADDR_LINKLOCAL(&c->ip6.gw)) src = &c->ip6.gw; else src = &c->ip6.addr_ll; } ip6h->payload_len = htons(l4len); ip6h->daddr = *dst; ip6h->saddr = *src; ip6h->version = 6; ip6h->nexthdr = IPPROTO_UDP; ip6h->hop_limit = 255; bp->uh.source = s_in6->sin6_port; bp->uh.dest = htons(dstport); bp->uh.len = ip6h->payload_len; csum_udp6(&bp->uh, src, dst, bp->data, dlen); return l4len; } /** * udp_tap_send() - Prepare UDP datagrams and send to tap interface * @c: Execution context * @start: Index of first datagram in udp[46]_l2_buf pool * @n: Total number of datagrams in udp[46]_l2_buf pool * @dstport: Destination port number on destination side * @uref: UDP epoll reference for origin socket * @now: Current timestamp * * This consumes as many frames as are sendable via tap. It requires that * udp_meta[@start].splicesrc is initialised, and will initialise * udp_meta[].splicesrc for each frame it consumes *and one more* (if present). 
* * Return: Number of frames sent via tap */ static unsigned udp_tap_send(const struct ctx *c, size_t start, size_t n, in_port_t dstport, union udp_epoll_ref uref, const struct timespec *now) { struct iovec (*tap_iov)[UDP_NUM_IOVS]; struct mmsghdr *mmh_recv; size_t i = start; ASSERT(udp_meta[start].splicesrc == -1); if (uref.v6) { tap_iov = udp6_l2_iov_tap; mmh_recv = udp6_l2_mh_sock; } else { mmh_recv = udp4_l2_mh_sock; tap_iov = udp4_l2_iov_tap; } do { struct udp_payload_t *bp = &udp_payload[i]; struct udp_meta_t *bm = &udp_meta[i]; size_t l4len; if (uref.v6) { l4len = udp_update_hdr6(c, &bm->ip6h, &bm->s_in.sa6, bp, dstport, udp6_l2_mh_sock[i].msg_len, now); tap_hdr_update(&bm->taph, l4len + sizeof(bm->ip6h) + sizeof(udp6_eth_hdr)); } else { l4len = udp_update_hdr4(c, &bm->ip4h, &bm->s_in.sa4, bp, dstport, udp4_l2_mh_sock[i].msg_len, now); tap_hdr_update(&bm->taph, l4len + sizeof(bm->ip4h) + sizeof(udp4_eth_hdr)); } tap_iov[i][UDP_IOV_PAYLOAD].iov_len = l4len; if (++i >= n) break; udp_meta[i].splicesrc = udp_mmh_splice_port(uref, &mmh_recv[i]); udp_meta[i].tosidx = udp_flow_from_sock(c, uref, &udp_meta[i]); } while (udp_meta[i].splicesrc == -1); tap_send_frames(c, &tap_iov[start][0], UDP_NUM_IOVS, i - start); return i - start; } /** * udp_buf_sock_handler() - Handle new data from socket * @c: Execution context * @ref: epoll reference * @events: epoll events bitmap * @now: Current timestamp * * #syscalls recvmmsg */ void udp_buf_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events, const struct timespec *now) { /* For not entirely clear reasons (data locality?) pasta gets * better throughput if we receive tap datagrams one at a * atime. For small splice datagrams throughput is slightly * better if we do batch, but it's slightly worse for large * splice datagrams. Since we don't know before we receive * whether we'll use tap or splice, always go one at a time * for pasta mode. */ ssize_t n = (c->mode == MODE_PASTA ? 
1 : UDP_MAX_FRAMES); in_port_t dstport = ref.udp.port; struct mmsghdr *mmh_recv; bool v6 = ref.udp.v6; int i, m; if (c->no_udp || !(events & EPOLLIN)) return; if (ref.udp.pif == PIF_SPLICE) dstport += c->udp.fwd_out.f.delta[dstport]; else if (ref.udp.pif == PIF_HOST) dstport += c->udp.fwd_in.f.delta[dstport]; else ASSERT(0); if (v6) mmh_recv = udp6_l2_mh_sock; else mmh_recv = udp4_l2_mh_sock; n = recvmmsg(ref.fd, mmh_recv, n, 0, NULL); if (n <= 0) return; /* We divide things into batches based on how we need to send them, * determined by udp_meta[i].splicesrc. To avoid either two passes * through the array, or recalculating splicesrc and tosidx for a single * entry, we have to populate them one entry *ahead* of the loop counter * (if present). So we fill in entry 0 before the loop, then * udp_*_send() populate one entry past where they consume. */ udp_meta[0].splicesrc = udp_mmh_splice_port(ref.udp, mmh_recv); udp_meta[0].tosidx = udp_flow_from_sock(c, ref.udp, &udp_meta[0]); for (i = 0; i < n; i += m) { if (udp_meta[i].splicesrc >= 0) m = udp_splice_send(c, i, n, dstport, ref.udp, now); else m = udp_tap_send(c, i, n, dstport, ref.udp, now); } } /** * udp_flow_from_tap() - Find or create UDP flow for tap packets * @c: Execution context * @pif: pif on which the packet is arriving * @af: Address family, AF_INET or AF_INET6 * @saddr: Source address on guest side * @daddr: Destination address guest side * @srcport: Source port on guest side * @dstport: Destination port on guest side * * Return: sidx for the destination side of the flow for this packet, or * FLOW_SIDX_NONE if we couldn't find or create a flow. 
*/ flow_sidx_t udp_flow_from_tap(const struct ctx *c, uint8_t pif, sa_family_t af, const void *saddr, const void *daddr, in_port_t srcport, in_port_t dstport) { const struct flowside *ini; struct udp_flow *uflow; union flow *flow; flow_sidx_t sidx; ASSERT(pif == PIF_TAP); sidx = flow_lookup_af(c, IPPROTO_UDP, pif, af, saddr, daddr, srcport, dstport); if ((flow = flow_at_sidx(sidx))) return FLOW_SIDX(flow, !sidx.side); if (!(flow = flow_alloc())) return FLOW_SIDX_NONE; ini = flow_initiate_af(flow, PIF_TAP, af, saddr, srcport, daddr, dstport); if (!inany_is_unicast(&ini->eaddr) || ini->eport == 0 || !inany_is_unicast(&ini->faddr) || ini->fport == 0) { char sstr[INANY_ADDRSTRLEN], dstr[INANY_ADDRSTRLEN]; debug("Invalid UDP endpoint from %s: %s:%hu -> %s:%hu", pif_name(pif), inany_ntop(&ini->eaddr, sstr, sizeof(sstr)), ini->eport, inany_ntop(&ini->faddr, dstr, sizeof(dstr)), ini->fport); goto cancel; } if (!flow_target(c, flow, IPPROTO_UDP)) goto cancel; if (flow->f.pif[TGTSIDE] != PIF_HOST) { flow_err(flow, "No support for forwarding UDP from %s to %s", pif_name(flow->f.pif[INISIDE]), pif_name(flow->f.pif[TGTSIDE])); goto cancel; } uflow = FLOW_SET_TYPE(flow, FLOW_UDP, udp); flow_hash_insert(c, FLOW_SIDX(uflow, INISIDE)); flow_hash_insert(c, FLOW_SIDX(uflow, TGTSIDE)); FLOW_ACTIVATE(uflow); return FLOW_SIDX(uflow, TGTSIDE); cancel: flow_alloc_cancel(flow); return FLOW_SIDX_NONE; } /** * udp_tap_handler() - Handle packets from tap * @c: Execution context * @pif: pif on which the packet is arriving * @af: Address family, AF_INET or AF_INET6 * @saddr: Source address * @daddr: Destination address * @p: Pool of UDP packets, with UDP headers * @idx: Index of first packet to process * @now: Current timestamp * * Return: count of consumed packets * * #syscalls sendmmsg */ int udp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, const void *saddr, const void *daddr, const struct pool *p, int idx, const struct timespec *now) { const struct flowside *toside; struct 
mmsghdr mm[UIO_MAXIOV]; struct iovec m[UIO_MAXIOV]; union udp_epoll_ref uref; union sockaddr_inany sa; const struct udphdr *uh; struct udp_flow *uflow; int i, s, count = 0; flow_sidx_t sidx; in_port_t src; uint8_t topif; socklen_t sl; uh = packet_get(p, idx, 0, sizeof(*uh), NULL); if (!uh) return 1; /* The caller already checks that all the messages have the same source * and destination, so we can just take those from the first message. */ sidx = udp_flow_from_tap(c, pif, af, saddr, daddr, ntohs(uh->source), ntohs(uh->dest)); if (!(uflow = udp_at_sidx(sidx))) { char sstr[INANY_ADDRSTRLEN], dstr[INANY_ADDRSTRLEN]; debug("Dropping UDP packet without flow %s %s:%hu -> %s:%hu", pif_name(pif), inet_ntop(af, saddr, sstr, sizeof(sstr)), ntohs(uh->source), inet_ntop(af, daddr, dstr, sizeof(dstr)), ntohs(uh->dest)); return 1; } topif = uflow->f.pif[sidx.side]; toside = &uflow->f.side[sidx.side]; ASSERT(topif == PIF_HOST); uflow->ts = now->tv_sec; sockaddr_from_inany(&sa, &sl, &toside->eaddr, toside->eport, c->ifi6); src = toside->fport; uref.port = src; uref.pif = topif; if (sa.sa_family == AF_INET) { if ((s = udp_tap_map[V4][src].sock) < 0) { s = flowside_sock_l4(c, IPPROTO_UDP, topif, toside, uref.u32); if (s < 0) return p->count - idx; udp_tap_map[V4][src].sock = s; bitmap_set(udp_act[V4][UDP_ACT_TAP], src); } udp_tap_map[V4][src].ts = now->tv_sec; } else { if ((s = udp_tap_map[V6][src].sock) < 0) { uref.v6 = 1; s = flowside_sock_l4(c, IPPROTO_UDP, topif, toside, uref.u32); if (s < 0) return p->count - idx; udp_tap_map[V6][src].sock = s; bitmap_set(udp_act[V6][UDP_ACT_TAP], src); } udp_tap_map[V6][src].ts = now->tv_sec; } for (i = 0; i < (int)p->count - idx; i++) { struct udphdr *uh_send; size_t len; uh_send = packet_get(p, idx + i, 0, sizeof(*uh), &len); if (!uh_send) return p->count - idx; mm[i].msg_hdr.msg_name = &sa; mm[i].msg_hdr.msg_namelen = sl; if (len) { m[i].iov_base = (char *)(uh_send + 1); m[i].iov_len = len; mm[i].msg_hdr.msg_iov = m + i; 
mm[i].msg_hdr.msg_iovlen = 1; } else { mm[i].msg_hdr.msg_iov = NULL; mm[i].msg_hdr.msg_iovlen = 0; } mm[i].msg_hdr.msg_control = NULL; mm[i].msg_hdr.msg_controllen = 0; mm[i].msg_hdr.msg_flags = 0; count++; } count = sendmmsg(s, mm, count, MSG_NOSIGNAL); if (count < 0) return 1; return count; } /** * udp_sock_init() - Initialise listening sockets for a given port * @c: Execution context * @ns: In pasta mode, if set, bind with loopback address in namespace * @af: Address family to select a specific IP version, or AF_UNSPEC * @addr: Pointer to address for binding, NULL if not configured * @ifname: Name of interface to bind to, NULL if not configured * @port: Port, host order * * Return: 0 on (partial) success, negative error code on (complete) failure */ int udp_sock_init(const struct ctx *c, int ns, sa_family_t af, const void *addr, const char *ifname, in_port_t port) { union udp_epoll_ref uref = { .splice = (c->mode == MODE_PASTA), .orig = true, .port = port }; int s, r4 = FD_REF_MAX + 1, r6 = FD_REF_MAX + 1; if (ns) uref.pif = PIF_SPLICE; else uref.pif = PIF_HOST; if ((af == AF_INET || af == AF_UNSPEC) && c->ifi4) { uref.v6 = 0; if (!ns) { r4 = s = sock_l4(c, AF_INET, IPPROTO_UDP, addr, ifname, port, uref.u32); udp_tap_map[V4][port].sock = s < 0 ? -1 : s; udp_splice_init[V4][port].sock = s < 0 ? -1 : s; } else { r4 = s = sock_l4(c, AF_INET, IPPROTO_UDP, &in4addr_loopback, ifname, port, uref.u32); udp_splice_ns[V4][port].sock = s < 0 ? -1 : s; } } if ((af == AF_INET6 || af == AF_UNSPEC) && c->ifi6) { uref.v6 = 1; if (!ns) { r6 = s = sock_l4(c, AF_INET6, IPPROTO_UDP, addr, ifname, port, uref.u32); udp_tap_map[V6][port].sock = s < 0 ? -1 : s; udp_splice_init[V6][port].sock = s < 0 ? -1 : s; } else { r6 = s = sock_l4(c, AF_INET6, IPPROTO_UDP, &in6addr_loopback, ifname, port, uref.u32); udp_splice_ns[V6][port].sock = s < 0 ? -1 : s; } } if (IN_INTERVAL(0, FD_REF_MAX, r4) || IN_INTERVAL(0, FD_REF_MAX, r6)) return 0; return r4 < 0 ? 
r4 : r6; } /** * udp_splice_iov_init() - Set up buffers and descriptors for recvmmsg/sendmmsg */ static void udp_splice_iov_init(void) { int i; for (i = 0; i < UDP_MAX_FRAMES; i++) { struct msghdr *mh4 = &udp4_mh_splice[i].msg_hdr; struct msghdr *mh6 = &udp6_mh_splice[i].msg_hdr; mh4->msg_name = &udp4_localname; mh4->msg_namelen = sizeof(udp4_localname); mh6->msg_name = &udp6_localname; mh6->msg_namelen = sizeof(udp6_localname); udp_iov_splice[i].iov_base = udp_payload[i].data; mh4->msg_iov = &udp_iov_splice[i]; mh6->msg_iov = &udp_iov_splice[i]; mh4->msg_iovlen = mh6->msg_iovlen = 1; } } /** * udp_timer_one() - Handler for timed events on one port * @c: Execution context * @v6: Set for IPv6 connections * @type: Socket type * @port: Port number, host order * @now: Current timestamp */ static void udp_timer_one(struct ctx *c, int v6, enum udp_act_type type, in_port_t port, const struct timespec *now) { struct udp_splice_port *sp; struct udp_tap_port *tp; int *sockp = NULL; switch (type) { case UDP_ACT_TAP: tp = &udp_tap_map[v6 ? V6 : V4][port]; if (now->tv_sec - tp->ts > UDP_CONN_TIMEOUT) { sockp = &tp->sock; tp->flags = 0; } break; case UDP_ACT_SPLICE_INIT: sp = &udp_splice_init[v6 ? V6 : V4][port]; if (now->tv_sec - sp->ts > UDP_CONN_TIMEOUT) sockp = &sp->sock; break; case UDP_ACT_SPLICE_NS: sp = &udp_splice_ns[v6 ? V6 : V4][port]; if (now->tv_sec - sp->ts > UDP_CONN_TIMEOUT) sockp = &sp->sock; break; default: return; } if (sockp && *sockp >= 0) { int s = *sockp; *sockp = -1; epoll_ctl(c->epollfd, EPOLL_CTL_DEL, s, NULL); close(s); bitmap_clear(udp_act[v6 ? V6 : V4][type], port); } } /** * udp_port_rebind() - Rebind ports to match forward maps * @c: Execution context * @outbound: True to remap outbound forwards, otherwise inbound * * Must be called in namespace context if @outbound is true. */ static void udp_port_rebind(struct ctx *c, bool outbound) { const uint8_t *fmap = outbound ? c->udp.fwd_out.f.map : c->udp.fwd_in.f.map; const uint8_t *rmap = outbound ? 
c->udp.fwd_in.f.map : c->udp.fwd_out.f.map; struct udp_splice_port (*socks)[NUM_PORTS] = outbound ? udp_splice_ns : udp_splice_init; unsigned port; for (port = 0; port < NUM_PORTS; port++) { if (!bitmap_isset(fmap, port)) { if (socks[V4][port].sock >= 0) { close(socks[V4][port].sock); socks[V4][port].sock = -1; } if (socks[V6][port].sock >= 0) { close(socks[V6][port].sock); socks[V6][port].sock = -1; } continue; } /* Don't loop back our own ports */ if (bitmap_isset(rmap, port)) continue; if ((c->ifi4 && socks[V4][port].sock == -1) || (c->ifi6 && socks[V6][port].sock == -1)) udp_sock_init(c, outbound, AF_UNSPEC, NULL, NULL, port); } } /** * udp_port_rebind_outbound() - Rebind ports in namespace * @arg: Execution context * * Called with NS_CALL() * * Return: 0 */ static int udp_port_rebind_outbound(void *arg) { struct ctx *c = (struct ctx *)arg; ns_enter(c); udp_port_rebind(c, true); return 0; } bool udp_flow_timer(const struct ctx *c, const struct udp_flow *uflow, const struct timespec *now) { if (now->tv_sec - uflow->ts <= UDP_CONN_TIMEOUT) return false; flow_hash_remove(c, FLOW_SIDX(uflow, INISIDE)); flow_hash_remove(c, FLOW_SIDX(uflow, TGTSIDE)); return true; } /** * udp_timer() - Scan activity bitmaps for ports with associated timed events * @c: Execution context * @now: Current timestamp */ void udp_timer(struct ctx *c, const struct timespec *now) { int n, t, v6 = 0; unsigned int i; long *word, tmp; if (c->mode == MODE_PASTA) { if (c->udp.fwd_out.f.mode == FWD_AUTO) { fwd_scan_ports_udp(&c->udp.fwd_out.f, &c->udp.fwd_in.f, &c->tcp.fwd_out, &c->tcp.fwd_in); NS_CALL(udp_port_rebind_outbound, c); } if (c->udp.fwd_in.f.mode == FWD_AUTO) { fwd_scan_ports_udp(&c->udp.fwd_in.f, &c->udp.fwd_out.f, &c->tcp.fwd_in, &c->tcp.fwd_out); udp_port_rebind(c, false); } } if (!c->ifi4) v6 = 1; v6: for (t = 0; t < UDP_ACT_TYPE_MAX; t++) { word = (long *)udp_act[v6 ? 
V6 : V4][t]; for (i = 0; i < ARRAY_SIZE(udp_act[0][0]); i += sizeof(long), word++) { tmp = *word; while ((n = ffsl(tmp))) { tmp &= ~(1UL << (n - 1)); udp_timer_one(c, v6, t, i * 8 + n - 1, now); } } } if (!v6 && c->ifi6) { v6 = 1; goto v6; } } /** * udp_init() - Initialise per-socket data, and sockets in namespace * @c: Execution context * * Return: 0 */ int udp_init(struct ctx *c) { udp_iov_init(c); udp_invert_portmap(&c->udp.fwd_in); udp_invert_portmap(&c->udp.fwd_out); if (c->mode == MODE_PASTA) { udp_splice_iov_init(); NS_CALL(udp_port_rebind_outbound, c); } return 0; }