On Thu, Apr 03, 2025 at 06:27:06PM -0400, Jon Maloy wrote: > Now that ICMP pass-through from socket-to-tap is in place, it is > easy to support UDP based traceroute functionality in direction > tap-to-socket. > > We fix that in this commit. > > Link: https://bugs.passt.top/show_bug.cgi?id=64 > Signed-off-by: Jon Maloy Reviewed-by: David Gibson One comment below. > --- > v2: - Using ancillary data instead of setsockopt to transfer outgoing > TTL. > - Support IPv6 > v3: - Storing ttl per packet instead of per flow. This may not be > elegant, but much less intrusive than changing the flow > criteria. This eliminates the need for the extra, flow-changing > patch we introduced in v2. > v4: - Going back to something similar to the original solution, but > storing current ttl in struct udp_flow, plus ensuring that all > packets in a struct tap4_l4_t/tap6_l4_t instance have the same > ttl. After input from David Gibson. > v5: - Some minor fixes after feedback from Stefano Brivio. > --- > packet.h | 2 ++ > tap.c | 17 +++++++++++++---- > udp.c | 19 ++++++++++++++++++- > udp.h | 3 ++- > udp_flow.c | 1 + > udp_flow.h | 4 +++- > 6 files changed, 39 insertions(+), 7 deletions(-) > > diff --git a/packet.h b/packet.h > index c94780a..e84e123 100644 > --- a/packet.h > +++ b/packet.h > @@ -11,6 +11,8 @@ > /* Maximum size of a single packet stored in pool, including headers */ > #define PACKET_MAX_LEN ((size_t)UINT16_MAX) > > +#define DEFAULT_TTL 64 This is still fixed, rather than either probing the sysctl or using getsockopt() to determine the initial value. I don't think we want to delay this further to change that, but it could be a reasonable follow-up improvement. 
> + > /** > * struct pool - Generic pool of packets stored in a buffer > * @buf: Buffer storing packet descriptors, > diff --git a/tap.c b/tap.c > index 3a6fcbe..d630f6d 100644 > --- a/tap.c > +++ b/tap.c > @@ -559,6 +559,7 @@ PACKET_POOL_DECL(pool_l4, UIO_MAXIOV, pkt_buf); > * struct l4_seq4_t - Message sequence for one protocol handler call, IPv4 > * @msgs: Count of messages in sequence > * @protocol: Protocol number > + * @ttl: Time to live > * @source: Source port > * @dest: Destination port > * @saddr: Source address > @@ -567,6 +568,7 @@ PACKET_POOL_DECL(pool_l4, UIO_MAXIOV, pkt_buf); > */ > static struct tap4_l4_t { > uint8_t protocol; > + uint8_t ttl; > > uint16_t source; > uint16_t dest; > @@ -586,6 +588,7 @@ static struct tap4_l4_t { > * @dest: Destination port > * @saddr: Source address > * @daddr: Destination address > + * @hop_limit: Hop limit > * @msg: Array of messages that can be handled in a single call > */ > static struct tap6_l4_t { > @@ -598,6 +601,8 @@ static struct tap6_l4_t { > struct in6_addr saddr; > struct in6_addr daddr; > > + uint8_t hop_limit; > + > struct pool_l4_t p; > } tap6_l4[TAP_SEQS /* Arbitrary: TAP_MSGS in theory, so limit in users */]; > > @@ -786,7 +791,8 @@ resume: > #define L4_MATCH(iph, uh, seq) \ > ((seq)->protocol == (iph)->protocol && \ > (seq)->source == (uh)->source && (seq)->dest == (uh)->dest && \ > - (seq)->saddr.s_addr == (iph)->saddr && (seq)->daddr.s_addr == (iph)->daddr) > + (seq)->saddr.s_addr == (iph)->saddr && \ > + (seq)->daddr.s_addr == (iph)->daddr && (seq)->ttl == (iph)->ttl) > > #define L4_SET(iph, uh, seq) \ > do { \ > @@ -795,6 +801,7 @@ resume: > (seq)->dest = (uh)->dest; \ > (seq)->saddr.s_addr = (iph)->saddr; \ > (seq)->daddr.s_addr = (iph)->daddr; \ > + (seq)->ttl = (iph)->ttl; \ > } while (0) > > if (seq && L4_MATCH(iph, uh, seq) && seq->p.count < UIO_MAXIOV) > @@ -843,7 +850,7 @@ append: > for (k = 0; k < p->count; ) > k += udp_tap_handler(c, PIF_TAP, AF_INET, > &seq->saddr, &seq->daddr, > - p, 
k, now); > + seq->ttl, p, k, now); > } > } > > @@ -966,7 +973,8 @@ resume: > (seq)->dest == (uh)->dest && \ > (seq)->flow_lbl == ip6_get_flow_lbl(ip6h) && \ > IN6_ARE_ADDR_EQUAL(&(seq)->saddr, saddr) && \ > - IN6_ARE_ADDR_EQUAL(&(seq)->daddr, daddr)) > + IN6_ARE_ADDR_EQUAL(&(seq)->daddr, daddr) && \ > + (seq)->hop_limit == (ip6h)->hop_limit) > > #define L4_SET(ip6h, proto, uh, seq) \ > do { \ > @@ -976,6 +984,7 @@ resume: > (seq)->flow_lbl = ip6_get_flow_lbl(ip6h); \ > (seq)->saddr = *saddr; \ > (seq)->daddr = *daddr; \ > + (seq)->hop_limit = (ip6h)->hop_limit; \ > } while (0) > > if (seq && L4_MATCH(ip6h, proto, uh, seq) && > @@ -1026,7 +1035,7 @@ append: > for (k = 0; k < p->count; ) > k += udp_tap_handler(c, PIF_TAP, AF_INET6, > &seq->saddr, &seq->daddr, > - p, k, now); > + seq->hop_limit, p, k, now); > } > } > > diff --git a/udp.c b/udp.c > index 39431d7..618a4e2 100644 > --- a/udp.c > +++ b/udp.c > @@ -849,6 +849,7 @@ fail: > * @af: Address family, AF_INET or AF_INET6 > * @saddr: Source address > * @daddr: Destination address > + * @ttl: TTL or hop limit for packets to be sent in this call > * @p: Pool of UDP packets, with UDP headers > * @idx: Index of first packet to process > * @now: Current timestamp > @@ -859,7 +860,8 @@ fail: > */ > int udp_tap_handler(const struct ctx *c, uint8_t pif, > sa_family_t af, const void *saddr, const void *daddr, > - const struct pool *p, int idx, const struct timespec *now) > + uint8_t ttl, const struct pool *p, int idx, > + const struct timespec *now) > { > const struct flowside *toside; > struct mmsghdr mm[UIO_MAXIOV]; > @@ -938,6 +940,21 @@ int udp_tap_handler(const struct ctx *c, uint8_t pif, > mm[i].msg_hdr.msg_controllen = 0; > mm[i].msg_hdr.msg_flags = 0; > > + if (ttl != uflow->ttl[tosidx.sidei]) { > + uflow->ttl[tosidx.sidei] = ttl; > + if (af == AF_INET) { > + if (setsockopt(s, IPPROTO_IP, IP_TTL, > + &ttl, sizeof(ttl)) < 0) > + flow_perror(uflow, > + "setsockopt IP_TTL"); > + } else { > + if (setsockopt(s, 
IPPROTO_IPV6, IPV6_HOPLIMIT, > + &ttl, sizeof(ttl)) < 0) > + flow_perror(uflow, > + "setsockopt IPV6_HOPLIMIT"); > + } > + } > + > count++; > } > > diff --git a/udp.h b/udp.h > index de2df6d..a811475 100644 > --- a/udp.h > +++ b/udp.h > @@ -15,7 +15,8 @@ void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref, > uint32_t events, const struct timespec *now); > int udp_tap_handler(const struct ctx *c, uint8_t pif, > sa_family_t af, const void *saddr, const void *daddr, > - const struct pool *p, int idx, const struct timespec *now); > + uint8_t ttl, const struct pool *p, int idx, > + const struct timespec *now); > int udp_sock_init(const struct ctx *c, int ns, const union inany_addr *addr, > const char *ifname, in_port_t port); > int udp_init(struct ctx *c); > diff --git a/udp_flow.c b/udp_flow.c > index bf4b896..39372c2 100644 > --- a/udp_flow.c > +++ b/udp_flow.c > @@ -137,6 +137,7 @@ static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow, > uflow = FLOW_SET_TYPE(flow, FLOW_UDP, udp); > uflow->ts = now->tv_sec; > uflow->s[INISIDE] = uflow->s[TGTSIDE] = -1; > + uflow->ttl[INISIDE] = uflow->ttl[TGTSIDE] = DEFAULT_TTL; > > if (s_ini >= 0) { > /* When using auto port-scanning the listening port could go > diff --git a/udp_flow.h b/udp_flow.h > index 9a1b059..520de62 100644 > --- a/udp_flow.h > +++ b/udp_flow.h > @@ -8,11 +8,12 @@ > #define UDP_FLOW_H > > /** > - * struct udp - Descriptor for a flow of UDP packets > + * struct udp_flow - Descriptor for a flow of UDP packets > * @f: Generic flow information > * @closed: Flow is already closed > * @ts: Activity timestamp > * @s: Socket fd (or -1) for each side of the flow > + * @ttl: TTL or hop_limit for both sides > */ > struct udp_flow { > /* Must be first element */ > @@ -21,6 +22,7 @@ struct udp_flow { > bool closed :1; > time_t ts; > int s[SIDES]; > + uint8_t ttl[SIDES]; > }; > > struct udp_flow *udp_at_sidx(flow_sidx_t sidx); -- David Gibson (he or they) | I'll have my music baroque, and 
my code david AT gibson.dropbear.id.au | minimalist, thank you, not the other way | around. http://www.ozlabs.org/~dgibson