* [PATCH v4] udp: support traceroute
@ 2025-04-03 2:22 Jon Maloy
2025-04-03 15:48 ` Stefano Brivio
0 siblings, 1 reply; 4+ messages in thread
From: Jon Maloy @ 2025-04-03 2:22 UTC (permalink / raw)
To: passt-dev, sbrivio, lvivier, dgibson, jmaloy
Now that ICMP pass-through from socket-to-tap is in place, it is
easy to support UDP based traceroute functionality in direction
tap-to-socket.
We fix that in this commit.
Signed-off-by: Jon Maloy <jmaloy@redhat.com>
---
v2: - Using ancillary data instead of setsockopt to transfer outgoing
TTL.
- Support IPv6
v3: - Storing ttl per packet instead of per flow. This may not be
elegant, but much less intrusive than changing the flow
criteria. This eliminates the need for the extra, flow-changing
patch we introduced in v2.
v4: - Going back to something similar to the original solution, but
storing current ttl in struct udp_flow, plus ensuring that all
packets in a struct tap4_l4_t/tap6_l4_t instance, have the same
ttl. After input from David Gibson.
---
packet.h | 2 ++
tap.c | 18 ++++++++++++++----
udp.c | 17 ++++++++++++++++-
udp.h | 3 ++-
udp_flow.c | 1 +
udp_flow.h | 1 +
6 files changed, 36 insertions(+), 6 deletions(-)
diff --git a/packet.h b/packet.h
index c94780a..e84e123 100644
--- a/packet.h
+++ b/packet.h
@@ -11,6 +11,8 @@
/* Maximum size of a single packet stored in pool, including headers */
#define PACKET_MAX_LEN ((size_t)UINT16_MAX)
+#define DEFAULT_TTL 64
+
/**
* struct pool - Generic pool of packets stored in a buffer
* @buf: Buffer storing packet descriptors,
diff --git a/tap.c b/tap.c
index 3a6fcbe..e65d592 100644
--- a/tap.c
+++ b/tap.c
@@ -563,6 +563,7 @@ PACKET_POOL_DECL(pool_l4, UIO_MAXIOV, pkt_buf);
* @dest: Destination port
* @saddr: Source address
* @daddr: Destination address
+ * @ttl: Time to live
* @msg: Array of messages that can be handled in a single call
*/
static struct tap4_l4_t {
@@ -574,6 +575,8 @@ static struct tap4_l4_t {
struct in_addr saddr;
struct in_addr daddr;
+ uint8_t ttl;
+
struct pool_l4_t p;
} tap4_l4[TAP_SEQS /* Arbitrary: TAP_MSGS in theory, so limit in users */];
@@ -586,6 +589,7 @@ static struct tap4_l4_t {
* @dest: Destination port
* @saddr: Source address
* @daddr: Destination address
+ * @hop_limit: Hop limit
* @msg: Array of messages that can be handled in a single call
*/
static struct tap6_l4_t {
@@ -598,6 +602,8 @@ static struct tap6_l4_t {
struct in6_addr saddr;
struct in6_addr daddr;
+ uint8_t hop_limit;
+
struct pool_l4_t p;
} tap6_l4[TAP_SEQS /* Arbitrary: TAP_MSGS in theory, so limit in users */];
@@ -786,7 +792,8 @@ resume:
#define L4_MATCH(iph, uh, seq) \
((seq)->protocol == (iph)->protocol && \
(seq)->source == (uh)->source && (seq)->dest == (uh)->dest && \
- (seq)->saddr.s_addr == (iph)->saddr && (seq)->daddr.s_addr == (iph)->daddr)
+ (seq)->saddr.s_addr == (iph)->saddr && \
+ (seq)->daddr.s_addr == (iph)->daddr && (seq)->ttl == (iph)->ttl)
#define L4_SET(iph, uh, seq) \
do { \
@@ -795,6 +802,7 @@ resume:
(seq)->dest = (uh)->dest; \
(seq)->saddr.s_addr = (iph)->saddr; \
(seq)->daddr.s_addr = (iph)->daddr; \
+ (seq)->ttl = (iph)->ttl; \
} while (0)
if (seq && L4_MATCH(iph, uh, seq) && seq->p.count < UIO_MAXIOV)
@@ -843,7 +851,7 @@ append:
for (k = 0; k < p->count; )
k += udp_tap_handler(c, PIF_TAP, AF_INET,
&seq->saddr, &seq->daddr,
- p, k, now);
+ seq->ttl, p, k, now);
}
}
@@ -966,7 +974,8 @@ resume:
(seq)->dest == (uh)->dest && \
(seq)->flow_lbl == ip6_get_flow_lbl(ip6h) && \
IN6_ARE_ADDR_EQUAL(&(seq)->saddr, saddr) && \
- IN6_ARE_ADDR_EQUAL(&(seq)->daddr, daddr))
+ IN6_ARE_ADDR_EQUAL(&(seq)->daddr, daddr) && \
+ (seq)->hop_limit == (ip6h)->hop_limit)
#define L4_SET(ip6h, proto, uh, seq) \
do { \
@@ -976,6 +985,7 @@ resume:
(seq)->flow_lbl = ip6_get_flow_lbl(ip6h); \
(seq)->saddr = *saddr; \
(seq)->daddr = *daddr; \
+ (seq)->hop_limit = (ip6h)->hop_limit; \
} while (0)
if (seq && L4_MATCH(ip6h, proto, uh, seq) &&
@@ -1026,7 +1036,7 @@ append:
for (k = 0; k < p->count; )
k += udp_tap_handler(c, PIF_TAP, AF_INET6,
&seq->saddr, &seq->daddr,
- p, k, now);
+ seq->hop_limit, p, k, now);
}
}
diff --git a/udp.c b/udp.c
index 39431d7..bc93292 100644
--- a/udp.c
+++ b/udp.c
@@ -849,6 +849,7 @@ fail:
* @af: Address family, AF_INET or AF_INET6
* @saddr: Source address
* @daddr: Destination address
+ * @ttl: TTL or hop limit for packets to be sent in this call
* @p: Pool of UDP packets, with UDP headers
* @idx: Index of first packet to process
* @now: Current timestamp
@@ -859,7 +860,8 @@ fail:
*/
int udp_tap_handler(const struct ctx *c, uint8_t pif,
sa_family_t af, const void *saddr, const void *daddr,
- const struct pool *p, int idx, const struct timespec *now)
+ uint8_t ttl, const struct pool *p, int idx,
+ const struct timespec *now)
{
const struct flowside *toside;
struct mmsghdr mm[UIO_MAXIOV];
@@ -938,6 +940,19 @@ int udp_tap_handler(const struct ctx *c, uint8_t pif,
mm[i].msg_hdr.msg_controllen = 0;
mm[i].msg_hdr.msg_flags = 0;
+ if (ttl != uflow->ttl[tosidx.sidei]) {
+ uflow->ttl[tosidx.sidei] = ttl;
+ if (af == AF_INET) {
+ if (setsockopt(s, IPPROTO_IP, IP_TTL,
+ &ttl, sizeof(ttl)) < 0)
+ perror("setsockopt (IP_TTL)");
+ } else {
+ if (setsockopt(s, IPPROTO_IPV6, IPV6_HOPLIMIT,
+ &ttl, sizeof(ttl)) < 0)
+ perror("setsockopt (IP_TTL)");
+ }
+ }
+
count++;
}
diff --git a/udp.h b/udp.h
index de2df6d..041fad4 100644
--- a/udp.h
+++ b/udp.h
@@ -15,7 +15,8 @@ void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
uint32_t events, const struct timespec *now);
int udp_tap_handler(const struct ctx *c, uint8_t pif,
sa_family_t af, const void *saddr, const void *daddr,
- const struct pool *p, int idx, const struct timespec *now);
+ uint8_t ttl, const struct pool *p, int idx,
+ const struct timespec *now);
int udp_sock_init(const struct ctx *c, int ns, const union inany_addr *addr,
const char *ifname, in_port_t port);
int udp_init(struct ctx *c);
diff --git a/udp_flow.c b/udp_flow.c
index bf4b896..39372c2 100644
--- a/udp_flow.c
+++ b/udp_flow.c
@@ -137,6 +137,7 @@ static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow,
uflow = FLOW_SET_TYPE(flow, FLOW_UDP, udp);
uflow->ts = now->tv_sec;
uflow->s[INISIDE] = uflow->s[TGTSIDE] = -1;
+ uflow->ttl[INISIDE] = uflow->ttl[TGTSIDE] = DEFAULT_TTL;
if (s_ini >= 0) {
/* When using auto port-scanning the listening port could go
diff --git a/udp_flow.h b/udp_flow.h
index 9a1b059..606ac08 100644
--- a/udp_flow.h
+++ b/udp_flow.h
@@ -21,6 +21,7 @@ struct udp_flow {
bool closed :1;
time_t ts;
int s[SIDES];
+ uint8_t ttl[SIDES];
};
struct udp_flow *udp_at_sidx(flow_sidx_t sidx);
--
@@ -21,6 +21,7 @@ struct udp_flow {
bool closed :1;
time_t ts;
int s[SIDES];
+ uint8_t ttl[SIDES];
};
struct udp_flow *udp_at_sidx(flow_sidx_t sidx);
--
2.48.1
^ permalink raw reply related [flat|nested] 4+ messages in thread
* Re: [PATCH v4] udp: support traceroute
2025-04-03 2:22 [PATCH v4] udp: support traceroute Jon Maloy
@ 2025-04-03 15:48 ` Stefano Brivio
2025-04-03 20:27 ` Jon Maloy
0 siblings, 1 reply; 4+ messages in thread
From: Stefano Brivio @ 2025-04-03 15:48 UTC (permalink / raw)
To: Jon Maloy; +Cc: passt-dev, lvivier, dgibson
The implementation looks solid to me, a list of nits (or a bit
more) below.
By the way, I don't think you need to Cc: people who are already on
this list unless you specifically want their attention.
On Wed, 2 Apr 2025 22:22:29 -0400
Jon Maloy <jmaloy@redhat.com> wrote:
> Now that ICMP pass-through from socket-to-tap is in place, it is
> easy to support UDP based traceroute functionality in direction
> tap-to-socket.
>
> We fix that in this commit.
>
> Signed-off-by: Jon Maloy <jmaloy@redhat.com>
This fixes https://bugs.passt.top/show_bug.cgi?id=64 ("Link:" tag) if I
understood correctly.
> ---
> v2: - Using ancillary data instead of setsockopt to transfer outgoing
> TTL.
> - Support IPv6
> v3: - Storing ttl per packet instead of per flow. This may not be
> elegant, but much less intrusive than changing the flow
> criteria. This eliminates the need for the extra, flow-changing
> patch we introduced in v2.
> v4: - Going back to something similar to the original solution, but
> storing current ttl in struct udp_flow, plus ensuring that all
> packets in a struct tap4_l4_t/tap6_l4_t instance, have the same
> ttl. After input from David Gibson.
> ---
> packet.h | 2 ++
> tap.c | 18 ++++++++++++++----
> udp.c | 17 ++++++++++++++++-
> udp.h | 3 ++-
> udp_flow.c | 1 +
> udp_flow.h | 1 +
> 6 files changed, 36 insertions(+), 6 deletions(-)
>
> diff --git a/packet.h b/packet.h
> index c94780a..e84e123 100644
> --- a/packet.h
> +++ b/packet.h
> @@ -11,6 +11,8 @@
> /* Maximum size of a single packet stored in pool, including headers */
> #define PACKET_MAX_LEN ((size_t)UINT16_MAX)
>
> +#define DEFAULT_TTL 64
If I understood correctly, David's comment to this on v3:
https://archives.passt.top/passt-dev/Z-om3Ey-HR1Hj8UH@zatzit/
was meant to imply that, as the default value can be changed via
sysctl, the value set via sysctl could be read at start-up. I'm fine
with 64 as well, by the way, with a slight preference for reading the
value via sysctl.
All this might go away, though, please read the comment to
udp_flow_new() below, first.
> +
> /**
> * struct pool - Generic pool of packets stored in a buffer
> * @buf: Buffer storing packet descriptors,
> diff --git a/tap.c b/tap.c
> index 3a6fcbe..e65d592 100644
> --- a/tap.c
> +++ b/tap.c
> @@ -563,6 +563,7 @@ PACKET_POOL_DECL(pool_l4, UIO_MAXIOV, pkt_buf);
> * @dest: Destination port
> * @saddr: Source address
> * @daddr: Destination address
> + * @ttl: Time to live
> * @msg: Array of messages that can be handled in a single call
> */
> static struct tap4_l4_t {
> @@ -574,6 +575,8 @@ static struct tap4_l4_t {
> struct in_addr saddr;
> struct in_addr daddr;
>
> + uint8_t ttl;
If you move this after 'protocol' you save 4 or 8 bytes depending on
the architecture and, perhaps more importantly, with 64-byte cachelines,
you can fit the set of fields involved in the L4_MATCH() comparison
four times instead of three. If you have a look with pahole(1):
--
struct tap4_l4_t {
uint8_t protocol; /* 0 1 */
/* XXX 1 byte hole, try to pack */
uint16_t source; /* 2 2 */
uint16_t dest; /* 4 2 */
/* XXX 2 bytes hole, try to pack */
struct in_addr saddr; /* 8 4 */
struct in_addr daddr; /* 12 4 */
uint8_t ttl; /* 16 1 */
/* XXX 7 bytes hole, try to pack */
...
}
--
becomes:
--
struct tap4_l4_t {
uint8_t protocol; /* 0 1 */
uint8_t ttl; /* 1 1 */
uint16_t source; /* 2 2 */
uint16_t dest; /* 4 2 */
/* XXX 2 bytes hole, try to pack */
struct in_addr saddr; /* 8 4 */
struct in_addr daddr; /* 12 4 */
...
}
--
...if you move it, please don't forget to update the comment to the
struct.
> +
> struct pool_l4_t p;
> } tap4_l4[TAP_SEQS /* Arbitrary: TAP_MSGS in theory, so limit in users */];
>
> @@ -586,6 +589,7 @@ static struct tap4_l4_t {
> * @dest: Destination port
> * @saddr: Source address
> * @daddr: Destination address
> + * @hop_limit: Hop limit
> * @msg: Array of messages that can be handled in a single call
> */
> static struct tap6_l4_t {
> @@ -598,6 +602,8 @@ static struct tap6_l4_t {
> struct in6_addr saddr;
> struct in6_addr daddr;
>
> + uint8_t hop_limit;
Here, instead, it doesn't matter, because 'p' starts at 48 bytes anyway,
and we compare the flow label too.
> +
> struct pool_l4_t p;
> } tap6_l4[TAP_SEQS /* Arbitrary: TAP_MSGS in theory, so limit in users */];
>
> @@ -786,7 +792,8 @@ resume:
> #define L4_MATCH(iph, uh, seq) \
> ((seq)->protocol == (iph)->protocol && \
> (seq)->source == (uh)->source && (seq)->dest == (uh)->dest && \
> - (seq)->saddr.s_addr == (iph)->saddr && (seq)->daddr.s_addr == (iph)->daddr)
> + (seq)->saddr.s_addr == (iph)->saddr && \
> + (seq)->daddr.s_addr == (iph)->daddr && (seq)->ttl == (iph)->ttl)
>
> #define L4_SET(iph, uh, seq) \
> do { \
> @@ -795,6 +802,7 @@ resume:
> (seq)->dest = (uh)->dest; \
> (seq)->saddr.s_addr = (iph)->saddr; \
> (seq)->daddr.s_addr = (iph)->daddr; \
> + (seq)->ttl = (iph)->ttl; \
> } while (0)
>
> if (seq && L4_MATCH(iph, uh, seq) && seq->p.count < UIO_MAXIOV)
> @@ -843,7 +851,7 @@ append:
> for (k = 0; k < p->count; )
> k += udp_tap_handler(c, PIF_TAP, AF_INET,
> &seq->saddr, &seq->daddr,
> - p, k, now);
> + seq->ttl, p, k, now);
> }
> }
>
> @@ -966,7 +974,8 @@ resume:
> (seq)->dest == (uh)->dest && \
> (seq)->flow_lbl == ip6_get_flow_lbl(ip6h) && \
> IN6_ARE_ADDR_EQUAL(&(seq)->saddr, saddr) && \
> - IN6_ARE_ADDR_EQUAL(&(seq)->daddr, daddr))
> + IN6_ARE_ADDR_EQUAL(&(seq)->daddr, daddr) && \
> + (seq)->hop_limit == (ip6h)->hop_limit)
>
> #define L4_SET(ip6h, proto, uh, seq) \
> do { \
> @@ -976,6 +985,7 @@ resume:
> (seq)->flow_lbl = ip6_get_flow_lbl(ip6h); \
> (seq)->saddr = *saddr; \
> (seq)->daddr = *daddr; \
> + (seq)->hop_limit = (ip6h)->hop_limit; \
> } while (0)
>
> if (seq && L4_MATCH(ip6h, proto, uh, seq) &&
> @@ -1026,7 +1036,7 @@ append:
> for (k = 0; k < p->count; )
> k += udp_tap_handler(c, PIF_TAP, AF_INET6,
> &seq->saddr, &seq->daddr,
> - p, k, now);
> + seq->hop_limit, p, k, now);
> }
> }
>
> diff --git a/udp.c b/udp.c
> index 39431d7..bc93292 100644
> --- a/udp.c
> +++ b/udp.c
> @@ -849,6 +849,7 @@ fail:
> * @af: Address family, AF_INET or AF_INET6
> * @saddr: Source address
> * @daddr: Destination address
> + * @ttl: TTL or hop limit for packets to be sent in this call
> * @p: Pool of UDP packets, with UDP headers
> * @idx: Index of first packet to process
> * @now: Current timestamp
> @@ -859,7 +860,8 @@ fail:
> */
> int udp_tap_handler(const struct ctx *c, uint8_t pif,
> sa_family_t af, const void *saddr, const void *daddr,
> - const struct pool *p, int idx, const struct timespec *now)
> + uint8_t ttl, const struct pool *p, int idx,
> + const struct timespec *now)
> {
> const struct flowside *toside;
> struct mmsghdr mm[UIO_MAXIOV];
> @@ -938,6 +940,19 @@ int udp_tap_handler(const struct ctx *c, uint8_t pif,
> mm[i].msg_hdr.msg_controllen = 0;
> mm[i].msg_hdr.msg_flags = 0;
>
> + if (ttl != uflow->ttl[tosidx.sidei]) {
> + uflow->ttl[tosidx.sidei] = ttl;
> + if (af == AF_INET) {
> + if (setsockopt(s, IPPROTO_IP, IP_TTL,
> + &ttl, sizeof(ttl)) < 0)
> + perror("setsockopt (IP_TTL)");
This would print to file descriptor 2 even if it's a socket. It should
be err_perror() instead, but now we also have flow_perror() which
prints flow index and type, given 'uflow' here, say:
flow_perror(uflow, "IP_TTL setsockopt");
> + } else {
> + if (setsockopt(s, IPPROTO_IPV6, IPV6_HOPLIMIT,
> + &ttl, sizeof(ttl)) < 0)
> + perror("setsockopt (IP_TTL)");
...and this is IPV6_HOPLIMIT, not IP_TTL, so perhaps:
flow_perror(uflow,
"setsockopt IPV6_HOPLIMIT");
> + }
> + }
> +
> count++;
> }
>
> diff --git a/udp.h b/udp.h
> index de2df6d..041fad4 100644
> --- a/udp.h
> +++ b/udp.h
> @@ -15,7 +15,8 @@ void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
> uint32_t events, const struct timespec *now);
> int udp_tap_handler(const struct ctx *c, uint8_t pif,
> sa_family_t af, const void *saddr, const void *daddr,
> - const struct pool *p, int idx, const struct timespec *now);
> + uint8_t ttl, const struct pool *p, int idx,
Excess whitespace beetween 'uint8_t' and 'ttl'.
> + const struct timespec *now);
> int udp_sock_init(const struct ctx *c, int ns, const union inany_addr *addr,
> const char *ifname, in_port_t port);
> int udp_init(struct ctx *c);
> diff --git a/udp_flow.c b/udp_flow.c
> index bf4b896..39372c2 100644
> --- a/udp_flow.c
> +++ b/udp_flow.c
> @@ -137,6 +137,7 @@ static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow,
> uflow = FLOW_SET_TYPE(flow, FLOW_UDP, udp);
> uflow->ts = now->tv_sec;
> uflow->s[INISIDE] = uflow->s[TGTSIDE] = -1;
> + uflow->ttl[INISIDE] = uflow->ttl[TGTSIDE] = DEFAULT_TTL;
By the way, instead of using a default value, what about fetching the
current value with getsockopt()?
One additional system call per UDP flow doesn't feel like a lot of
overhead, and we can be sure it's correct, no matter if the user
configures a different value before or after we start.
>
> if (s_ini >= 0) {
> /* When using auto port-scanning the listening port could go
> diff --git a/udp_flow.h b/udp_flow.h
> index 9a1b059..606ac08 100644
> --- a/udp_flow.h
> +++ b/udp_flow.h
> @@ -21,6 +21,7 @@ struct udp_flow {
> bool closed :1;
> time_t ts;
> int s[SIDES];
> + uint8_t ttl[SIDES];
Ths should be added to the struct comment above, which, by mistake,
seems to refer to 'struct udp' by the way (I would fix that right away
while at it...).
> };
>
> struct udp_flow *udp_at_sidx(flow_sidx_t sidx);
--
Stefano
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [PATCH v4] udp: support traceroute
2025-04-03 15:48 ` Stefano Brivio
@ 2025-04-03 20:27 ` Jon Maloy
2025-04-03 23:31 ` David Gibson
0 siblings, 1 reply; 4+ messages in thread
From: Jon Maloy @ 2025-04-03 20:27 UTC (permalink / raw)
To: passt-dev
On 2025-04-03 11:48, Stefano Brivio wrote:
> The implementation looks solid to me, a list of nits (or a bit
> more) below.
>
> By the way, I don't think you need to Cc: people who are already on
> this list unless you specifically want their attention.
>
> On Wed, 2 Apr 2025 22:22:29 -0400
> Jon Maloy <jmaloy@redhat.com> wrote:
>
>> Now that ICMP pass-through from socket-to-tap is in place, it is
>> easy to support UDP based traceroute functionality in direction
>> tap-to-socket.
>>
>> We fix that in this commit.
>>
>> Signed-off-by: Jon Maloy <jmaloy@redhat.com>
>
> This fixes https://bugs.passt.top/show_bug.cgi?id=64 ("Link:" tag) if I
> understood correctly.
>
>> ---
>> v2: - Using ancillary data instead of setsockopt to transfer outgoing
>> TTL.
>> - Support IPv6
>> v3: - Storing ttl per packet instead of per flow. This may not be
>> elegant, but much less intrusive than changing the flow
[...]
>> @@ -11,6 +11,8 @@
>> /* Maximum size of a single packet stored in pool, including headers */
>> #define PACKET_MAX_LEN ((size_t)UINT16_MAX)
>>
>> +#define DEFAULT_TTL 64
>
> If I understood correctly, David's comment to this on v3:
>
> https://archives.passt.top/passt-dev/Z-om3Ey-HR1Hj8UH@zatzit/
>
> was meant to imply that, as the default value can be changed via
> sysctl, the value set via sysctl could be read at start-up. I'm fine
> with 64 as well, by the way, with a slight preference for reading the
> value via sysctl.
I don't think the local host/container setting will have any effect
if the sending guest is a VM. The benefit is of this is dubious.
>
> All this might go away, though, please read the comment to
> udp_flow_new() below, first.
>
>> +
>> /**
>> * struct pool - Generic pool of packets stored in a buffer
>> * @buf: Buffer storing packet descriptors,
>> diff --git a/tap.c b/tap.c
>> index 3a6fcbe..e65d592 100644
>> --- a/tap.c
>> +++ b/tap.c
>> @@ -563,6 +563,7 @@ PACKET_POOL_DECL(pool_l4, UIO_MAXIOV, pkt_buf);
>> * @dest: Destination port
>> * @saddr: Source address
>> * @daddr: Destination address
>> + * @ttl: Time to live
>> * @msg: Array of messages that can be handled in a single call
>> */
>> static struct tap4_l4_t {
>> @@ -574,6 +575,8 @@ static struct tap4_l4_t {
>> struct in_addr saddr;
>> struct in_addr daddr;
>>
>> + uint8_t ttl;
>
> If you move this after 'protocol' you save 4 or 8 bytes depending on
> the architecture and, perhaps more importantly, with 64-byte cachelines,
> you can fit the set of fields involved in the L4_MATCH() comparison
> four times instead of three. If you have a look with pahole(1):
>
> --
> struct tap4_l4_t {
> uint8_t protocol; /* 0 1 */
>
> /* XXX 1 byte hole, try to pack */
>
> uint16_t source; /* 2 2 */
> uint16_t dest; /* 4 2 */
>
> /* XXX 2 bytes hole, try to pack */
>
> struct in_addr saddr; /* 8 4 */
> struct in_addr daddr; /* 12 4 */
> uint8_t ttl; /* 16 1 */
>
> /* XXX 7 bytes hole, try to pack */
>
> ...
> }
> --
>
> becomes:
>
> --
> struct tap4_l4_t {
> uint8_t protocol; /* 0 1 */
> uint8_t ttl; /* 1 1 */
> uint16_t source; /* 2 2 */
> uint16_t dest; /* 4 2 */
>
> /* XXX 2 bytes hole, try to pack */
>
> struct in_addr saddr; /* 8 4 */
> struct in_addr daddr; /* 12 4 */
> ...
> }
Good point. I didn't notice.
> --
>
> ...if you move it, please don't forget to update the comment to the
> struct.
>
>> +
>> struct pool_l4_t p;
[...]
>> const struct flowside *toside;
>> struct mmsghdr mm[UIO_MAXIOV];
>> @@ -938,6 +940,19 @@ int udp_tap_handler(const struct ctx *c, uint8_t pif,
>> mm[i].msg_hdr.msg_controllen = 0;
>> mm[i].msg_hdr.msg_flags = 0;
>>
>> + if (ttl != uflow->ttl[tosidx.sidei]) {
>> + uflow->ttl[tosidx.sidei] = ttl;
>> + if (af == AF_INET) {
>> + if (setsockopt(s, IPPROTO_IP, IP_TTL,
>> + &ttl, sizeof(ttl)) < 0)
>> + perror("setsockopt (IP_TTL)");
>
> This would print to file descriptor 2 even if it's a socket. It should
> be err_perror() instead, but now we also have flow_perror() which
> prints flow index and type, given 'uflow' here, say:
>
> flow_perror(uflow, "IP_TTL setsockopt");
>
>> + } else {
>> + if (setsockopt(s, IPPROTO_IPV6, IPV6_HOPLIMIT,
>> + &ttl, sizeof(ttl)) < 0)
>> + perror("setsockopt (IP_TTL)");
>
> ...and this is IPV6_HOPLIMIT, not IP_TTL, so perhaps:
>
> flow_perror(uflow,
> "setsockopt IPV6_HOPLIMIT");
>
Ok.
>> + }
>> + }
>> +
>> count++;
>> }
>>
>> diff --git a/udp.h b/udp.h
>> index de2df6d..041fad4 100644
>> --- a/udp.h
>> +++ b/udp.h
>> @@ -15,7 +15,8 @@ void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
>> uint32_t events, const struct timespec *now);
>> int udp_tap_handler(const struct ctx *c, uint8_t pif,
>> sa_family_t af, const void *saddr, const void *daddr,
>> - const struct pool *p, int idx, const struct timespec *now);
>> + uint8_t ttl, const struct pool *p, int idx,
>
> Excess whitespace beetween 'uint8_t' and 'ttl'.
>
>> + const struct timespec *now);
>> int udp_sock_init(const struct ctx *c, int ns, const union inany_addr *addr,
>> const char *ifname, in_port_t port);
>> int udp_init(struct ctx *c);
>> diff --git a/udp_flow.c b/udp_flow.c
>> index bf4b896..39372c2 100644
>> --- a/udp_flow.c
>> +++ b/udp_flow.c
>> @@ -137,6 +137,7 @@ static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow,
>> uflow = FLOW_SET_TYPE(flow, FLOW_UDP, udp);
>> uflow->ts = now->tv_sec;
>> uflow->s[INISIDE] = uflow->s[TGTSIDE] = -1;
>> + uflow->ttl[INISIDE] = uflow->ttl[TGTSIDE] = DEFAULT_TTL;
>
> By the way, instead of using a default value, what about fetching the
> current value with getsockopt()?
>
> One additional system call per UDP flow doesn't feel like a lot of
> overhead, and we can be sure it's correct, no matter if the user
> configures a different value before or after we start.
>
This patch fixes UDP messaging tap->socket, and TTL may have any
value in the first arriving packet. Reading it from the socket here only
makes sense when I add the same support in direction socket->tap.
That is my next project.
>>
>> if (s_ini >= 0) {
>> /* When using auto port-scanning the listening port could go
>> diff --git a/udp_flow.h b/udp_flow.h
>> index 9a1b059..606ac08 100644
>> --- a/udp_flow.h
>> +++ b/udp_flow.h
>> @@ -21,6 +21,7 @@ struct udp_flow {
>> bool closed :1;
>> time_t ts;
>> int s[SIDES];
>> + uint8_t ttl[SIDES];
>
> Ths should be added to the struct comment above, which, by mistake,
> seems to refer to 'struct udp' by the way (I would fix that right away
> while at it...).
ok.
///jon
>
>> };
>>
>> struct udp_flow *udp_at_sidx(flow_sidx_t sidx);
>
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [PATCH v4] udp: support traceroute
2025-04-03 20:27 ` Jon Maloy
@ 2025-04-03 23:31 ` David Gibson
0 siblings, 0 replies; 4+ messages in thread
From: David Gibson @ 2025-04-03 23:31 UTC (permalink / raw)
To: Jon Maloy; +Cc: passt-dev
[-- Attachment #1: Type: text/plain, Size: 7085 bytes --]
On Thu, Apr 03, 2025 at 04:27:12PM -0400, Jon Maloy wrote:
>
>
> On 2025-04-03 11:48, Stefano Brivio wrote:
> > The implementation looks solid to me, a list of nits (or a bit
> > more) below.
> >
> > By the way, I don't think you need to Cc: people who are already on
> > this list unless you specifically want their attention.
> >
> > On Wed, 2 Apr 2025 22:22:29 -0400
> > Jon Maloy <jmaloy@redhat.com> wrote:
> >
> > > Now that ICMP pass-through from socket-to-tap is in place, it is
> > > easy to support UDP based traceroute functionality in direction
> > > tap-to-socket.
> > >
> > > We fix that in this commit.
> > >
> > > Signed-off-by: Jon Maloy <jmaloy@redhat.com>
> >
> > This fixes https://bugs.passt.top/show_bug.cgi?id=64 ("Link:" tag) if I
> > understood correctly.
> >
> > > ---
> > > v2: - Using ancillary data instead of setsockopt to transfer outgoing
> > > TTL.
> > > - Support IPv6
> > > v3: - Storing ttl per packet instead of per flow. This may not be
> > > elegant, but much less intrusive than changing the flow
>
> [...]
>
> > > @@ -11,6 +11,8 @@
> > > /* Maximum size of a single packet stored in pool, including headers */
> > > #define PACKET_MAX_LEN ((size_t)UINT16_MAX)
> > > +#define DEFAULT_TTL 64
> >
> > If I understood correctly, David's comment to this on v3:
> >
> > https://archives.passt.top/passt-dev/Z-om3Ey-HR1Hj8UH@zatzit/
> >
> > was meant to imply that, as the default value can be changed via
> > sysctl, the value set via sysctl could be read at start-up. I'm fine
> > with 64 as well, by the way, with a slight preference for reading the
> > value via sysctl.
>
> I don't think the local host/container setting will have any effect
> if the sending guest is a VM.
That's true, but..
> The benefit is of this is dubious.
.. uflow->ttl[] isn't so much representing what the guest set, as a
cache of what the socket is sending and that *does* depend on the host
value.
>
> >
> > All this might go away, though, please read the comment to
> > udp_flow_new() below, first.
> >
> > > +
> > > /**
> > > * struct pool - Generic pool of packets stored in a buffer
> > > * @buf: Buffer storing packet descriptors,
> > > diff --git a/tap.c b/tap.c
> > > index 3a6fcbe..e65d592 100644
> > > --- a/tap.c
> > > +++ b/tap.c
> > > @@ -563,6 +563,7 @@ PACKET_POOL_DECL(pool_l4, UIO_MAXIOV, pkt_buf);
> > > * @dest: Destination port
> > > * @saddr: Source address
> > > * @daddr: Destination address
> > > + * @ttl: Time to live
> > > * @msg: Array of messages that can be handled in a single call
> > > */
> > > static struct tap4_l4_t {
> > > @@ -574,6 +575,8 @@ static struct tap4_l4_t {
> > > struct in_addr saddr;
> > > struct in_addr daddr;
> > > + uint8_t ttl;
> >
> > If you move this after 'protocol' you save 4 or 8 bytes depending on
> > the architecture and, perhaps more importantly, with 64-byte cachelines,
> > you can fit the set of fields involved in the L4_MATCH() comparison
> > four times instead of three. If you have a look with pahole(1):
> >
> Good point. I didn't notice.
>
>
> [...]
> > > const struct flowside *toside;
> > > struct mmsghdr mm[UIO_MAXIOV];
> > > @@ -938,6 +940,19 @@ int udp_tap_handler(const struct ctx *c, uint8_t pif,
> > > mm[i].msg_hdr.msg_controllen = 0;
> > > mm[i].msg_hdr.msg_flags = 0;
> > > + if (ttl != uflow->ttl[tosidx.sidei]) {
> > > + uflow->ttl[tosidx.sidei] = ttl;
> > > + if (af == AF_INET) {
> > > + if (setsockopt(s, IPPROTO_IP, IP_TTL,
> > > + &ttl, sizeof(ttl)) < 0)
> > > + perror("setsockopt (IP_TTL)");
> >
> > This would print to file descriptor 2 even if it's a socket. It should
> > be err_perror() instead, but now we also have flow_perror() which
> > prints flow index and type, given 'uflow' here, say:
> >
> > flow_perror(uflow, "IP_TTL setsockopt");
> >
> > > + } else {
> > > + if (setsockopt(s, IPPROTO_IPV6, IPV6_HOPLIMIT,
> > > + &ttl, sizeof(ttl)) < 0)
> > > + perror("setsockopt (IP_TTL)");
> >
> > ...and this is IPV6_HOPLIMIT, not IP_TTL, so perhaps:
> >
> > flow_perror(uflow,
> > "setsockopt IPV6_HOPLIMIT");
> >
> Ok.
>
> > > + }
> > > + }
> > > +
> > > count++;
> > > }
> > > diff --git a/udp.h b/udp.h
> > > index de2df6d..041fad4 100644
> > > --- a/udp.h
> > > +++ b/udp.h
> > > @@ -15,7 +15,8 @@ void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
> > > uint32_t events, const struct timespec *now);
> > > int udp_tap_handler(const struct ctx *c, uint8_t pif,
> > > sa_family_t af, const void *saddr, const void *daddr,
> > > - const struct pool *p, int idx, const struct timespec *now);
> > > + uint8_t ttl, const struct pool *p, int idx,
> >
> > Excess whitespace beetween 'uint8_t' and 'ttl'.
> >
> > > + const struct timespec *now);
> > > int udp_sock_init(const struct ctx *c, int ns, const union inany_addr *addr,
> > > const char *ifname, in_port_t port);
> > > int udp_init(struct ctx *c);
> > > diff --git a/udp_flow.c b/udp_flow.c
> > > index bf4b896..39372c2 100644
> > > --- a/udp_flow.c
> > > +++ b/udp_flow.c
> > > @@ -137,6 +137,7 @@ static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow,
> > > uflow = FLOW_SET_TYPE(flow, FLOW_UDP, udp);
> > > uflow->ts = now->tv_sec;
> > > uflow->s[INISIDE] = uflow->s[TGTSIDE] = -1;
> > > + uflow->ttl[INISIDE] = uflow->ttl[TGTSIDE] = DEFAULT_TTL;
> >
> > By the way, instead of using a default value, what about fetching the
> > current value with getsockopt()?
> >
> > One additional system call per UDP flow doesn't feel like a lot of
> > overhead, and we can be sure it's correct, no matter if the user
> > configures a different value before or after we start.
> >
> This patch fixes UDP messaging tap->socket, and TTL may have any
> value in the first arriving packet. Reading it from the socket here only
> makes sense when I add the same support in direction socket->tap.
> That is my next project.
>
> > > if (s_ini >= 0) {
> > > /* When using auto port-scanning the listening port could go
> > > diff --git a/udp_flow.h b/udp_flow.h
> > > index 9a1b059..606ac08 100644
> > > --- a/udp_flow.h
> > > +++ b/udp_flow.h
> > > @@ -21,6 +21,7 @@ struct udp_flow {
> > > bool closed :1;
> > > time_t ts;
> > > int s[SIDES];
> > > + uint8_t ttl[SIDES];
> >
> > Ths should be added to the struct comment above, which, by mistake,
> > seems to refer to 'struct udp' by the way (I would fix that right away
> > while at it...).
>
> ok.
>
> ///jon
>
> >
> > > };
> > > struct udp_flow *udp_at_sidx(flow_sidx_t sidx);
> >
>
--
David Gibson (he or they) | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you, not the other way
| around.
http://www.ozlabs.org/~dgibson
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]
^ permalink raw reply [flat|nested] 4+ messages in thread
end of thread, other threads:[~2025-04-03 23:35 UTC | newest]
Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-04-03 2:22 [PATCH v4] udp: support traceroute Jon Maloy
2025-04-03 15:48 ` Stefano Brivio
2025-04-03 20:27 ` Jon Maloy
2025-04-03 23:31 ` David Gibson
Code repositories for project(s) associated with this public inbox
https://passt.top/passt
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for IMAP folder(s).