From: Jon Maloy <jmaloy@redhat.com>
To: sbrivio@redhat.com, dgibson@redhat.com, jmaloy@redhat.com,
passt-dev@passt.top
Subject: [PATCH 2/2] udp: copy incoming packet TTL from socket to tap
Date: Wed, 23 Apr 2025 22:52:27 -0400 [thread overview]
Message-ID: <20250424025227.61697-3-jmaloy@redhat.com> (raw)
In-Reply-To: <20250424025227.61697-1-jmaloy@redhat.com>
We read the TTL/hop_limit from UDP packets arriving at outbound sockets
and convey the value to the packets delivered to the internal peers
via the tap interface.
A prerequisite for this to work is that we eliminate the dual-stack
listener socket and create separate IPv4 and IPv6 sockets for each
bound UDP port. The extra memory required for that approach seems to
be a showstopper, so this patch is posted mostly as documentation
of the work done, maybe to be applied some time in the future if
new conditions permit.
Signed-off-by: Jon Maloy <jmaloy@redhat.com>
---
udp.c | 69 ++++++++++++++++++++++++++++++++++++++++--------------
udp_flow.c | 18 ++++++++++++++
udp_vu.c | 3 +--
util.c | 5 ++++
4 files changed, 76 insertions(+), 19 deletions(-)
diff --git a/udp.c b/udp.c
index 7cc050c..31b9203 100644
--- a/udp.c
+++ b/udp.c
@@ -181,7 +181,12 @@ enum udp_iov_idx {
UDP_NUM_IOVS,
};
+struct udp_cmsg {
+ char buf[CMSG_SPACE(sizeof(int)) + CMSG_SPACE(sizeof(struct in6_pktinfo))];
+};
+
/* IOVs and msghdr arrays for receiving datagrams from sockets */
+static struct udp_cmsg udp_cmsg_recv [UDP_MAX_FRAMES];
static struct iovec udp_iov_recv [UDP_MAX_FRAMES];
static struct mmsghdr udp_mh_recv [UDP_MAX_FRAMES];
@@ -194,6 +199,40 @@ static struct mmsghdr udp_mh_splice [UDP_MAX_FRAMES];
/* IOVs for L2 frames */
static struct iovec udp_l2_iov [UDP_MAX_FRAMES][UDP_NUM_IOVS];
+static uint8_t udp4_read_ttl(const struct msghdr *mhdr)
+{
+ struct msghdr *mh = (struct msghdr *)mhdr;
+ struct cmsghdr *cm;
+ int ttl = 0;
+
+ for (cm = CMSG_FIRSTHDR(mh); cm != NULL; cm = CMSG_NXTHDR(mh, cm)) {
+ if (cm->cmsg_level == IPPROTO_IP && cm->cmsg_type == IP_TTL) {
+ memcpy(&ttl, CMSG_DATA(cm), sizeof(ttl));
+ ttl = *(int *) CMSG_DATA(cm);
+ break;
+ }
+ }
+
+ return ttl;
+}
+
+static uint8_t udp6_read_hop_limit(const struct msghdr *mhdr)
+{
+ struct msghdr *mh = (struct msghdr *)mhdr;
+ struct cmsghdr *cm;
+ int hop_limit = 0;
+
+ for (cm = CMSG_FIRSTHDR(mh); cm != NULL; cm = CMSG_NXTHDR(mh, cm)) {
+ if (cm->cmsg_level == SOL_IPV6 &&
+ cm->cmsg_type == IPV6_HOPLIMIT) {
+ memcpy(&hop_limit, CMSG_DATA(cm), sizeof(hop_limit));
+ break;
+ }
+ }
+
+ return hop_limit;
+}
+
/**
* udp_portmap_clear() - Clear UDP port map before configuration
*/
@@ -230,6 +269,7 @@ static void udp_iov_init_one(const struct ctx *c, size_t i)
struct udp_meta_t *meta = &udp_meta[i];
struct iovec *siov = &udp_iov_recv[i];
struct iovec *tiov = udp_l2_iov[i];
+ struct udp_cmsg *ucmsg = &udp_cmsg_recv[i];
*meta = (struct udp_meta_t) {
.ip4h = L2_BUF_IP4_INIT(IPPROTO_UDP),
@@ -243,6 +283,9 @@ static void udp_iov_init_one(const struct ctx *c, size_t i)
mh->msg_iov = siov;
mh->msg_iovlen = 1;
+
+ mh->msg_control = ucmsg;
+ mh->msg_controllen = sizeof(*ucmsg);
}
/**
@@ -271,8 +314,8 @@ static void udp_iov_init(const struct ctx *c)
* Return: size of IPv4 payload (UDP header + data)
*/
size_t udp_update_hdr4(struct iphdr *ip4h, struct udp_payload_t *bp,
- const struct flowside *toside, size_t dlen,
- uint8_t ttl, bool no_udp_csum)
+ const struct flowside *toside,
+ size_t dlen, uint8_t ttl, bool no_udp_csum)
{
const struct in_addr *src = inany_v4(&toside->oaddr);
const struct in_addr *dst = inany_v4(&toside->eaddr);
@@ -285,6 +328,7 @@ size_t udp_update_hdr4(struct iphdr *ip4h, struct udp_payload_t *bp,
ip4h->ttl = ttl;
ip4h->daddr = dst->s_addr;
ip4h->saddr = src->s_addr;
+ ip4h->ttl = ttl;
ip4h->check = csum_ip4_header(l3len, IPPROTO_UDP, ttl, *src, *dst);
bp->uh.source = htons(toside->oport);
@@ -366,17 +410,20 @@ static void udp_tap_prepare(const struct mmsghdr *mmh,
size_t l4len;
if (!inany_v4(&toside->eaddr) || !inany_v4(&toside->oaddr)) {
+ uint8_t hop_limit = udp6_read_hop_limit(&mmh[idx].msg_hdr);
+
l4len = udp_update_hdr6(&bm->ip6h, bp, toside,
mmh[idx].msg_len,
- DEFAULT_TTL, no_udp_csum);
+ hop_limit, no_udp_csum);
tap_hdr_update(&bm->taph, l4len + sizeof(bm->ip6h) +
sizeof(udp6_eth_hdr));
(*tap_iov)[UDP_IOV_ETH] = IOV_OF_LVALUE(udp6_eth_hdr);
(*tap_iov)[UDP_IOV_IP] = IOV_OF_LVALUE(bm->ip6h);
} else {
+ uint8_t ttl = udp4_read_ttl(&mmh[idx].msg_hdr);
+
l4len = udp_update_hdr4(&bm->ip4h, bp, toside,
- mmh[idx].msg_len,
- DEFAULT_TTL, no_udp_csum);
+ mmh[idx].msg_len, ttl, no_udp_csum);
tap_hdr_update(&bm->taph, l4len + sizeof(bm->ip4h) +
sizeof(udp4_eth_hdr));
(*tap_iov)[UDP_IOV_ETH] = IOV_OF_LVALUE(udp4_eth_hdr);
@@ -1094,18 +1141,6 @@ int udp_sock_init(const struct ctx *c, int ns, const union inany_addr *addr,
ASSERT(!c->no_udp);
- if (!addr && c->ifi4 && c->ifi6 && !ns) {
- int s;
-
- /* Attempt to get a dual stack socket */
- s = pif_sock_l4(c, EPOLL_TYPE_UDP_LISTEN, PIF_HOST,
- NULL, ifname, port, uref.u32);
- udp_splice_init[V4][port] = s < 0 ? -1 : s;
- udp_splice_init[V6][port] = s < 0 ? -1 : s;
- if (IN_INTERVAL(0, FD_REF_MAX, s))
- return 0;
- }
-
if ((!addr || inany_v4(addr)) && c->ifi4) {
if (!ns) {
r4 = pif_sock_l4(c, EPOLL_TYPE_UDP_LISTEN, PIF_HOST,
diff --git a/udp_flow.c b/udp_flow.c
index fea1cf3..8af0c1f 100644
--- a/udp_flow.c
+++ b/udp_flow.c
@@ -73,6 +73,8 @@ static int udp_flow_sock(const struct ctx *c,
{
const struct flowside *side = &uflow->f.side[sidei];
uint8_t pif = uflow->f.pif[sidei];
+ int enable = 1;
+
union {
flow_sidx_t sidx;
uint32_t data;
@@ -91,6 +93,22 @@ static int udp_flow_sock(const struct ctx *c,
return rc;
}
+ if (pif == PIF_HOST) {
+ if (inany_v4(&side->oaddr)) {
+ if (setsockopt(s, IPPROTO_IP, IP_RECVTTL,
+ &enable, sizeof(enable)) < 0) {
+ perror("setsockopt IP_RECVTTL");
+ exit(1);
+ }
+ } else {
+ if (setsockopt(s, SOL_IPV6, IPV6_RECVHOPLIMIT,
+ &enable, sizeof(enable)) < 0) {
+ perror("setsockopt IPV6_RECVHOPLIMIT");
+ exit(1);
+ }
+ }
+ }
+
/* It's possible, if unlikely, that we could receive some packets in
* between the bind() and connect() which may or may not be for this
* flow. Being UDP we could just discard them, but it's not ideal.
diff --git a/udp_vu.c b/udp_vu.c
index ef2257c..9871024 100644
--- a/udp_vu.c
+++ b/udp_vu.c
@@ -152,8 +152,7 @@ static size_t udp_vu_prepare(const struct ctx *c,
*iph = (struct iphdr)L2_BUF_IP4_INIT(IPPROTO_UDP);
- l4len = udp_update_hdr4(iph, bp, toside, dlen,
- DEFAULT_TTL, true);
+ l4len = udp_update_hdr4(iph, bp, toside, dlen, 255, true);
} else {
struct ipv6hdr *ip6h = vu_ip(iov_vu[0].iov_base);
struct udp_payload_t *bp = vu_payloadv6(iov_vu[0].iov_base);
diff --git a/util.c b/util.c
index 62a6003..7c06029 100644
--- a/util.c
+++ b/util.c
@@ -111,13 +111,18 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type,
if (proto == IPPROTO_UDP) {
int pktinfo = af == AF_INET ? IP_PKTINFO : IPV6_RECVPKTINFO;
int recverr = af == AF_INET ? IP_RECVERR : IPV6_RECVERR;
+ int ttlopt = af == AF_INET ? IP_RECVTTL : IPV6_RECVHOPLIMIT;
int level = af == AF_INET ? IPPROTO_IP : IPPROTO_IPV6;
+ int ttlevel = af == AF_INET ? IPPROTO_IP : SOL_IPV6;
if (setsockopt(fd, level, recverr, &y, sizeof(y)))
die_perror("Failed to set RECVERR on socket %i", fd);
if (setsockopt(fd, level, pktinfo, &y, sizeof(y)))
die_perror("Failed to set PKTINFO on socket %i", fd);
+
+ if (setsockopt(fd, ttlevel, ttlopt, &y, sizeof(y)))
+ die_perror("Failed to set RECVTTL on socket %i", fd);
}
if (ifname && *ifname) {
--
@@ -111,13 +111,18 @@ int sock_l4_sa(const struct ctx *c, enum epoll_type type,
if (proto == IPPROTO_UDP) {
int pktinfo = af == AF_INET ? IP_PKTINFO : IPV6_RECVPKTINFO;
int recverr = af == AF_INET ? IP_RECVERR : IPV6_RECVERR;
+ int ttlopt = af == AF_INET ? IP_RECVTTL : IPV6_RECVHOPLIMIT;
int level = af == AF_INET ? IPPROTO_IP : IPPROTO_IPV6;
+ int ttlevel = af == AF_INET ? IPPROTO_IP : SOL_IPV6;
if (setsockopt(fd, level, recverr, &y, sizeof(y)))
die_perror("Failed to set RECVERR on socket %i", fd);
if (setsockopt(fd, level, pktinfo, &y, sizeof(y)))
die_perror("Failed to set PKTINFO on socket %i", fd);
+
+ if (setsockopt(fd, ttlevel, ttlopt, &y, sizeof(y)))
+ die_perror("Failed to set RECVTTL on socket %i", fd);
}
if (ifname && *ifname) {
--
2.48.1
prev parent reply other threads:[~2025-04-24 2:52 UTC|newest]
Thread overview: 3+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-04-24 2:52 [PATCH 0/2] udp: copy ttl or hop limit from socket to tap Jon Maloy
2025-04-24 2:52 ` [PATCH 1/2] make ttl parametrized Jon Maloy
2025-04-24 2:52 ` Jon Maloy [this message]
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20250424025227.61697-3-jmaloy@redhat.com \
--to=jmaloy@redhat.com \
--cc=dgibson@redhat.com \
--cc=passt-dev@passt.top \
--cc=sbrivio@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
Code repositories for project(s) associated with this public inbox
https://passt.top/passt
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for IMAP folder(s).