public inbox for passt-dev@passt.top
 help / color / mirror / code / Atom feed
From: Jon Maloy <jmaloy@redhat.com>
To: sbrivio@redhat.com, dgibson@redhat.com, jmaloy@redhat.com,
	passt-dev@passt.top
Subject: [PATCH 2/2] udp: copy incoming packet TTL from socket to tap
Date: Wed, 23 Apr 2025 22:52:27 -0400	[thread overview]
Message-ID: <20250424025227.61697-3-jmaloy@redhat.com> (raw)
In-Reply-To: <20250424025227.61697-1-jmaloy@redhat.com>

We read the TTL/hop_limit from UDP packets arriving at outbound sockets
and convey the value to the packets delivered to the internal peers
via the tap interface.

A prerequisite for this to work is that we eliminate the dual-stack
listener socket and create separate IPv4 and IPv6 sockets for each
bound UDP port. The extra memory required for that approach seems to
be a showstopper, so this patch is posted mostly as documentation
of the work done, maybe to be applied some time in the future if
new conditions permit.

Signed-off-by: Jon Maloy <jmaloy@redhat.com>
---
 udp.c      | 69 ++++++++++++++++++++++++++++++++++++++++--------------
 udp_flow.c | 18 ++++++++++++++
 udp_vu.c   |  3 +--
 util.c     |  5 ++++
 4 files changed, 76 insertions(+), 19 deletions(-)

diff --git a/udp.c b/udp.c
index 7cc050c..31b9203 100644
--- a/udp.c
+++ b/udp.c
@@ -181,7 +181,23 @@ enum udp_iov_idx {
 	UDP_NUM_IOVS,
 };
 
+/**
+ * struct udp_cmsg - Ancillary data buffer for one datagram read from a socket
+ * @buf:	Room for one int-sized item (TTL or hop limit) and one
+ *		struct in6_pktinfo
+ * @align:	Never accessed: gives @buf the alignment of struct cmsghdr,
+ *		which CMSG_FIRSTHDR() and CMSG_NXTHDR() rely on
+ */
+struct udp_cmsg {
+	union {
+		char buf[CMSG_SPACE(sizeof(int)) +
+			 CMSG_SPACE(sizeof(struct in6_pktinfo))];
+		struct cmsghdr align;
+	};
+};
+
 /* IOVs and msghdr arrays for receiving datagrams from sockets */
+static struct udp_cmsg	udp_cmsg_recv		[UDP_MAX_FRAMES];
 static struct iovec	udp_iov_recv		[UDP_MAX_FRAMES];
 static struct mmsghdr	udp_mh_recv		[UDP_MAX_FRAMES];
 
@@ -194,6 +199,57 @@ static struct mmsghdr	udp_mh_splice		[UDP_MAX_FRAMES];
 /* IOVs for L2 frames */
 static struct iovec	udp_l2_iov		[UDP_MAX_FRAMES][UDP_NUM_IOVS];
 
+/**
+ * udp4_read_ttl() - Get TTL of a received IPv4 datagram from ancillary data
+ * @mhdr:	Message header of the datagram, with control data filled in
+ *
+ * Return: TTL carried by the IP_TTL control message, DEFAULT_TTL if the
+ *	   message header has no such control message
+ */
+static uint8_t udp4_read_ttl(const struct msghdr *mhdr)
+{
+	/* The CMSG_*() macros take a non-const pointer: cast, we never write */
+	struct msghdr *mh = (struct msghdr *)mhdr;
+	struct cmsghdr *cm;
+	int ttl = DEFAULT_TTL;
+
+	for (cm = CMSG_FIRSTHDR(mh); cm != NULL; cm = CMSG_NXTHDR(mh, cm)) {
+		if (cm->cmsg_level == IPPROTO_IP && cm->cmsg_type == IP_TTL) {
+			/* CMSG_DATA() isn't necessarily aligned for int */
+			memcpy(&ttl, CMSG_DATA(cm), sizeof(ttl));
+			break;
+		}
+	}
+
+	return ttl;
+}
+
+/**
+ * udp6_read_hop_limit() - Get hop limit of a received IPv6 datagram
+ * @mhdr:	Message header of the datagram, with control data filled in
+ *
+ * Return: hop limit carried by the IPV6_HOPLIMIT control message, DEFAULT_TTL
+ *	   if the message header has no such control message
+ */
+static uint8_t udp6_read_hop_limit(const struct msghdr *mhdr)
+{
+	/* The CMSG_*() macros take a non-const pointer: cast, we never write */
+	struct msghdr *mh = (struct msghdr *)mhdr;
+	struct cmsghdr *cm;
+	int hop_limit = DEFAULT_TTL;
+
+	for (cm = CMSG_FIRSTHDR(mh); cm != NULL; cm = CMSG_NXTHDR(mh, cm)) {
+		if (cm->cmsg_level == SOL_IPV6 &&
+		    cm->cmsg_type == IPV6_HOPLIMIT) {
+			/* CMSG_DATA() isn't necessarily aligned for int */
+			memcpy(&hop_limit, CMSG_DATA(cm), sizeof(hop_limit));
+			break;
+		}
+	}
+
+	return hop_limit;
+}
+
 /**
  * udp_portmap_clear() - Clear UDP port map before configuration
  */
@@ -230,6 +269,7 @@ static void udp_iov_init_one(const struct ctx *c, size_t i)
 	struct udp_meta_t *meta = &udp_meta[i];
 	struct iovec *siov = &udp_iov_recv[i];
 	struct iovec *tiov = udp_l2_iov[i];
+	struct udp_cmsg *ucmsg = &udp_cmsg_recv[i];
 
 	*meta = (struct udp_meta_t) {
 		.ip4h = L2_BUF_IP4_INIT(IPPROTO_UDP),
@@ -243,6 +283,9 @@ static void udp_iov_init_one(const struct ctx *c, size_t i)
 
 	mh->msg_iov	= siov;
 	mh->msg_iovlen	= 1;
+
+	mh->msg_control = ucmsg;
+	mh->msg_controllen = sizeof(*ucmsg);
 }
 
 /**
@@ -271,8 +314,8 @@ static void udp_iov_init(const struct ctx *c)
  * Return: size of IPv4 payload (UDP header + data)
  */
 size_t udp_update_hdr4(struct iphdr *ip4h, struct udp_payload_t *bp,
-		       const struct flowside *toside, size_t dlen,
-		       uint8_t ttl, bool no_udp_csum)
+		       const struct flowside *toside,
+		       size_t dlen, uint8_t ttl, bool no_udp_csum)
 {
 	const struct in_addr *src = inany_v4(&toside->oaddr);
 	const struct in_addr *dst = inany_v4(&toside->eaddr);
@@ -285,6 +328,7 @@ size_t udp_update_hdr4(struct iphdr *ip4h, struct udp_payload_t *bp,
 	ip4h->ttl = ttl;
 	ip4h->daddr = dst->s_addr;
 	ip4h->saddr = src->s_addr;
+	ip4h->ttl = ttl;
 	ip4h->check = csum_ip4_header(l3len, IPPROTO_UDP, ttl, *src, *dst);
 
 	bp->uh.source = htons(toside->oport);
@@ -366,17 +410,20 @@ static void udp_tap_prepare(const struct mmsghdr *mmh,
 	size_t l4len;
 
 	if (!inany_v4(&toside->eaddr) || !inany_v4(&toside->oaddr)) {
+		uint8_t hop_limit = udp6_read_hop_limit(&mmh[idx].msg_hdr);
+
 		l4len = udp_update_hdr6(&bm->ip6h, bp, toside,
 					mmh[idx].msg_len,
-					DEFAULT_TTL, no_udp_csum);
+					hop_limit, no_udp_csum);
 		tap_hdr_update(&bm->taph, l4len + sizeof(bm->ip6h) +
 			       sizeof(udp6_eth_hdr));
 		(*tap_iov)[UDP_IOV_ETH] = IOV_OF_LVALUE(udp6_eth_hdr);
 		(*tap_iov)[UDP_IOV_IP] = IOV_OF_LVALUE(bm->ip6h);
 	} else {
+		uint8_t ttl = udp4_read_ttl(&mmh[idx].msg_hdr);
+
 		l4len = udp_update_hdr4(&bm->ip4h, bp, toside,
-					mmh[idx].msg_len,
-					DEFAULT_TTL, no_udp_csum);
+					mmh[idx].msg_len, ttl, no_udp_csum);
 		tap_hdr_update(&bm->taph, l4len + sizeof(bm->ip4h) +
 			       sizeof(udp4_eth_hdr));
 		(*tap_iov)[UDP_IOV_ETH] = IOV_OF_LVALUE(udp4_eth_hdr);
@@ -1094,18 +1141,6 @@ int udp_sock_init(const struct ctx *c, int ns, const union inany_addr *addr,
 
 	ASSERT(!c->no_udp);
 
-	if (!addr && c->ifi4 && c->ifi6 && !ns) {
-		int s;
-
-		/* Attempt to get a dual stack socket */
-		s = pif_sock_l4(c, EPOLL_TYPE_UDP_LISTEN, PIF_HOST,
-				NULL, ifname, port, uref.u32);
-		udp_splice_init[V4][port] = s < 0 ? -1 : s;
-		udp_splice_init[V6][port] = s < 0 ? -1 : s;
-		if (IN_INTERVAL(0, FD_REF_MAX, s))
-			return 0;
-	}
-
 	if ((!addr || inany_v4(addr)) && c->ifi4) {
 		if (!ns) {
 			r4 = pif_sock_l4(c, EPOLL_TYPE_UDP_LISTEN, PIF_HOST,
diff --git a/udp_flow.c b/udp_flow.c
index fea1cf3..8af0c1f 100644
--- a/udp_flow.c
+++ b/udp_flow.c
@@ -73,6 +73,8 @@ static int udp_flow_sock(const struct ctx *c,
 {
 	const struct flowside *side = &uflow->f.side[sidei];
 	uint8_t pif = uflow->f.pif[sidei];
+	int enable = 1;
+
 	union {
 		flow_sidx_t sidx;
 		uint32_t data;
@@ -91,6 +93,22 @@ static int udp_flow_sock(const struct ctx *c,
 		return rc;
 	}
 
+	if (pif == PIF_HOST) {
+		if (inany_v4(&side->oaddr)) {
+			if (setsockopt(s, IPPROTO_IP, IP_RECVTTL,
+				       &enable, sizeof(enable)) < 0) {
+				perror("setsockopt IP_RECVTTL");
+				exit(1);
+			}
+		} else {
+			if (setsockopt(s, SOL_IPV6, IPV6_RECVHOPLIMIT,
+				       &enable, sizeof(enable)) < 0) {
+				perror("setsockopt IPV6_RECVHOPLIMIT");
+				exit(1);
+			}
+		}
+	}
+
 	/* It's possible, if unlikely, that we could receive some packets in
 	 * between the bind() and connect() which may or may not be for this
 	 * flow.  Being UDP we could just discard them, but it's not ideal.
diff --git a/udp_vu.c b/udp_vu.c
index ef2257c..9871024 100644
--- a/udp_vu.c
+++ b/udp_vu.c
@@ -152,8 +152,7 @@ static size_t udp_vu_prepare(const struct ctx *c,
 
 		*iph = (struct iphdr)L2_BUF_IP4_INIT(IPPROTO_UDP);
 
-		l4len = udp_update_hdr4(iph, bp, toside, dlen,
-					DEFAULT_TTL, true);
+		l4len = udp_update_hdr4(iph, bp, toside, dlen, 255, true);
 	} else {
 		struct ipv6hdr *ip6h = vu_ip(iov_vu[0].iov_base);
 		struct udp_payload_t *bp = vu_payloadv6(iov_vu[0].iov_base);
diff --git a/util.c b/util.c
index 62a6003..7c06029 100644
--- a/util.c
+++ b/util.c
@@ -111,13 +111,18 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type,
 	if (proto == IPPROTO_UDP) {
 		int pktinfo = af == AF_INET ? IP_PKTINFO : IPV6_RECVPKTINFO;
 		int recverr = af == AF_INET ? IP_RECVERR : IPV6_RECVERR;
+		int ttlopt = af == AF_INET ? IP_RECVTTL : IPV6_RECVHOPLIMIT;
 		int level = af == AF_INET ? IPPROTO_IP : IPPROTO_IPV6;
+		int ttlevel = af == AF_INET ? IPPROTO_IP : SOL_IPV6;
 
 		if (setsockopt(fd, level, recverr, &y, sizeof(y)))
 			die_perror("Failed to set RECVERR on socket %i", fd);
 
 		if (setsockopt(fd, level, pktinfo, &y, sizeof(y)))
 			die_perror("Failed to set PKTINFO on socket %i", fd);
+
+		if (setsockopt(fd, ttlevel, ttlopt, &y, sizeof(y)))
+			die_perror("Failed to set RECVTTL on socket %i", fd);
 	}
 
 	if (ifname && *ifname) {
-- 
2.48.1


      parent reply	other threads:[~2025-04-24  2:52 UTC|newest]

Thread overview: 3+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-04-24  2:52 [PATCH 0/2] udp: copy ttl or hop limit from socket to tap Jon Maloy
2025-04-24  2:52 ` [PATCH 1/2] make ttl parametrized Jon Maloy
2025-04-24  2:52 ` Jon Maloy [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20250424025227.61697-3-jmaloy@redhat.com \
    --to=jmaloy@redhat.com \
    --cc=dgibson@redhat.com \
    --cc=passt-dev@passt.top \
    --cc=sbrivio@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this public inbox

	https://passt.top/passt

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for IMAP folder(s).