public inbox for passt-dev@passt.top
 help / color / mirror / code / Atom feed
* [PATCH v4] udp: support traceroute
@ 2025-04-03  2:22 Jon Maloy
  2025-04-03 15:48 ` Stefano Brivio
  0 siblings, 1 reply; 4+ messages in thread
From: Jon Maloy @ 2025-04-03  2:22 UTC (permalink / raw)
  To: passt-dev, sbrivio, lvivier, dgibson, jmaloy

Now that ICMP pass-through from socket-to-tap is in place, it is
easy to support UDP-based traceroute functionality in the
tap-to-socket direction.

We add that support in this commit.

Signed-off-by: Jon Maloy <jmaloy@redhat.com>
---
v2: - Using ancillary data instead of setsockopt to transfer outgoing
      TTL.
    - Support IPv6
v3: - Storing ttl per packet instead of per flow. This may not be
      elegant, but much less intrusive than changing the flow
      criteria. This eliminates the need for the extra, flow-changing
      patch we introduced in v2.
v4: - Going back to something similar to the original solution, but
      storing current ttl in struct udp_flow, plus ensuring that all
      packets in a struct tap4_l4_t/tap6_l4_t instance, have the same
      ttl. After input from David Gibson.
---
 packet.h   |  2 ++
 tap.c      | 18 ++++++++++++++----
 udp.c      | 17 ++++++++++++++++-
 udp.h      |  3 ++-
 udp_flow.c |  1 +
 udp_flow.h |  1 +
 6 files changed, 36 insertions(+), 6 deletions(-)

diff --git a/packet.h b/packet.h
index c94780a..e84e123 100644
--- a/packet.h
+++ b/packet.h
@@ -11,6 +11,8 @@
 /* Maximum size of a single packet stored in pool, including headers */
 #define PACKET_MAX_LEN	((size_t)UINT16_MAX)
 
+#define DEFAULT_TTL 64
+
 /**
  * struct pool - Generic pool of packets stored in a buffer
  * @buf:	Buffer storing packet descriptors,
diff --git a/tap.c b/tap.c
index 3a6fcbe..e65d592 100644
--- a/tap.c
+++ b/tap.c
@@ -563,6 +563,7 @@ PACKET_POOL_DECL(pool_l4, UIO_MAXIOV, pkt_buf);
  * @dest:	Destination port
  * @saddr:	Source address
  * @daddr:	Destination address
+ * @ttl:	Time to live
  * @msg:	Array of messages that can be handled in a single call
  */
 static struct tap4_l4_t {
@@ -574,6 +575,8 @@ static struct tap4_l4_t {
 	struct in_addr saddr;
 	struct in_addr daddr;
 
+	uint8_t ttl;
+
 	struct pool_l4_t p;
 } tap4_l4[TAP_SEQS /* Arbitrary: TAP_MSGS in theory, so limit in users */];
 
@@ -586,6 +589,7 @@ static struct tap4_l4_t {
  * @dest:	Destination port
  * @saddr:	Source address
  * @daddr:	Destination address
+ * @hop_limit:	Hop limit
  * @msg:	Array of messages that can be handled in a single call
  */
 static struct tap6_l4_t {
@@ -598,6 +602,8 @@ static struct tap6_l4_t {
 	struct in6_addr saddr;
 	struct in6_addr daddr;
 
+	uint8_t hop_limit;
+
 	struct pool_l4_t p;
 } tap6_l4[TAP_SEQS /* Arbitrary: TAP_MSGS in theory, so limit in users */];
 
@@ -786,7 +792,8 @@ resume:
 #define L4_MATCH(iph, uh, seq)							\
 	((seq)->protocol == (iph)->protocol &&					\
 	 (seq)->source   == (uh)->source    && (seq)->dest  == (uh)->dest &&	\
-	 (seq)->saddr.s_addr == (iph)->saddr && (seq)->daddr.s_addr == (iph)->daddr)
+	 (seq)->saddr.s_addr == (iph)->saddr &&				\
+	 (seq)->daddr.s_addr == (iph)->daddr && (seq)->ttl == (iph)->ttl)
 
 #define L4_SET(iph, uh, seq)						\
 	do {								\
@@ -795,6 +802,7 @@ resume:
 		(seq)->dest		= (uh)->dest;			\
 		(seq)->saddr.s_addr	= (iph)->saddr;			\
 		(seq)->daddr.s_addr	= (iph)->daddr;			\
+		(seq)->ttl		= (iph)->ttl;			\
 	} while (0)
 
 		if (seq && L4_MATCH(iph, uh, seq) && seq->p.count < UIO_MAXIOV)
@@ -843,7 +851,7 @@ append:
 			for (k = 0; k < p->count; )
 				k += udp_tap_handler(c, PIF_TAP, AF_INET,
 						     &seq->saddr, &seq->daddr,
-						     p, k, now);
+						     seq->ttl, p, k, now);
 		}
 	}
 
@@ -966,7 +974,8 @@ resume:
 		 (seq)->dest == (uh)->dest                 &&		\
 		 (seq)->flow_lbl == ip6_get_flow_lbl(ip6h) &&		\
 		 IN6_ARE_ADDR_EQUAL(&(seq)->saddr, saddr)  &&		\
-		 IN6_ARE_ADDR_EQUAL(&(seq)->daddr, daddr))
+		 IN6_ARE_ADDR_EQUAL(&(seq)->daddr, daddr)  &&		\
+		 (seq)->hop_limit == (ip6h)->hop_limit)
 
 #define L4_SET(ip6h, proto, uh, seq)					\
 	do {								\
@@ -976,6 +985,7 @@ resume:
 		(seq)->flow_lbl	= ip6_get_flow_lbl(ip6h);		\
 		(seq)->saddr	= *saddr;				\
 		(seq)->daddr	= *daddr;				\
+		(seq)->hop_limit = (ip6h)->hop_limit;			\
 	} while (0)
 
 		if (seq && L4_MATCH(ip6h, proto, uh, seq) &&
@@ -1026,7 +1036,7 @@ append:
 			for (k = 0; k < p->count; )
 				k += udp_tap_handler(c, PIF_TAP, AF_INET6,
 						     &seq->saddr, &seq->daddr,
-						     p, k, now);
+						     seq->hop_limit, p, k, now);
 		}
 	}
 
diff --git a/udp.c b/udp.c
index 39431d7..bc93292 100644
--- a/udp.c
+++ b/udp.c
@@ -849,6 +849,7 @@ fail:
  * @af:		Address family, AF_INET or AF_INET6
  * @saddr:	Source address
  * @daddr:	Destination address
+ * @ttl:	TTL or hop limit for packets to be sent in this call
  * @p:		Pool of UDP packets, with UDP headers
  * @idx:	Index of first packet to process
  * @now:	Current timestamp
@@ -859,7 +860,8 @@ fail:
  */
 int udp_tap_handler(const struct ctx *c, uint8_t pif,
 		    sa_family_t af, const void *saddr, const void *daddr,
-		    const struct pool *p, int idx, const struct timespec *now)
+		    uint8_t ttl, const struct pool *p, int idx,
+		    const struct timespec *now)
 {
 	const struct flowside *toside;
 	struct mmsghdr mm[UIO_MAXIOV];
@@ -938,6 +940,19 @@ int udp_tap_handler(const struct ctx *c, uint8_t pif,
 		mm[i].msg_hdr.msg_controllen = 0;
 		mm[i].msg_hdr.msg_flags = 0;
 
+		if (ttl != uflow->ttl[tosidx.sidei]) {
+			uflow->ttl[tosidx.sidei] = ttl;
+			if (af == AF_INET) {
+				if (setsockopt(s, IPPROTO_IP, IP_TTL,
+					       &ttl, sizeof(ttl)) < 0)
+					perror("setsockopt (IP_TTL)");
+			} else {
+				if (setsockopt(s, IPPROTO_IPV6, IPV6_HOPLIMIT,
+					       &ttl, sizeof(ttl)) < 0)
+					perror("setsockopt (IP_TTL)");
+			}
+		}
+
 		count++;
 	}
 
diff --git a/udp.h b/udp.h
index de2df6d..041fad4 100644
--- a/udp.h
+++ b/udp.h
@@ -15,7 +15,8 @@ void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
 			    uint32_t events, const struct timespec *now);
 int udp_tap_handler(const struct ctx *c, uint8_t pif,
 		    sa_family_t af, const void *saddr, const void *daddr,
-		    const struct pool *p, int idx, const struct timespec *now);
+		    uint8_t  ttl, const struct pool *p, int idx,
+		    const struct timespec *now);
 int udp_sock_init(const struct ctx *c, int ns, const union inany_addr *addr,
 		  const char *ifname, in_port_t port);
 int udp_init(struct ctx *c);
diff --git a/udp_flow.c b/udp_flow.c
index bf4b896..39372c2 100644
--- a/udp_flow.c
+++ b/udp_flow.c
@@ -137,6 +137,7 @@ static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow,
 	uflow = FLOW_SET_TYPE(flow, FLOW_UDP, udp);
 	uflow->ts = now->tv_sec;
 	uflow->s[INISIDE] = uflow->s[TGTSIDE] = -1;
+	uflow->ttl[INISIDE] = uflow->ttl[TGTSIDE] = DEFAULT_TTL;
 
 	if (s_ini >= 0) {
 		/* When using auto port-scanning the listening port could go
diff --git a/udp_flow.h b/udp_flow.h
index 9a1b059..606ac08 100644
--- a/udp_flow.h
+++ b/udp_flow.h
@@ -21,6 +21,7 @@ struct udp_flow {
 	bool closed :1;
 	time_t ts;
 	int s[SIDES];
+	uint8_t ttl[SIDES];
 };
 
 struct udp_flow *udp_at_sidx(flow_sidx_t sidx);
-- 
@@ -21,6 +21,7 @@ struct udp_flow {
 	bool closed :1;
 	time_t ts;
 	int s[SIDES];
+	uint8_t ttl[SIDES];
 };
 
 struct udp_flow *udp_at_sidx(flow_sidx_t sidx);
-- 
2.48.1


^ permalink raw reply related	[flat|nested] 4+ messages in thread

* Re: [PATCH v4] udp: support traceroute
  2025-04-03  2:22 [PATCH v4] udp: support traceroute Jon Maloy
@ 2025-04-03 15:48 ` Stefano Brivio
  2025-04-03 20:27   ` Jon Maloy
  0 siblings, 1 reply; 4+ messages in thread
From: Stefano Brivio @ 2025-04-03 15:48 UTC (permalink / raw)
  To: Jon Maloy; +Cc: passt-dev, lvivier, dgibson

The implementation looks solid to me, a list of nits (or a bit
more) below.

By the way, I don't think you need to Cc: people who are already on
this list unless you specifically want their attention.

On Wed,  2 Apr 2025 22:22:29 -0400
Jon Maloy <jmaloy@redhat.com> wrote:

> Now that ICMP pass-through from socket-to-tap is in place, it is
> easy to support UDP based traceroute functionality in direction
> tap-to-socket.
> 
> We fix that  in this commit.
> 
> Signed-off-by: Jon Maloy <jmaloy@redhat.com>

This fixes https://bugs.passt.top/show_bug.cgi?id=64 ("Link:" tag) if I
understood correctly.

> ---
> v2: - Using ancillary data instead of setsockopt to transfer outgoing
>       TTL.
>     - Support IPv6
> v3: - Storing ttl per packet instead of per flow. This may not be
>       elegant, but much less intrusive than changing the flow
>       criteria. This eliminates the need for the extra, flow-changing
>       patch we introduced in v2.
> v4: - Going back to something similar to the original solution, but
>       storing current ttl in struct udp_flow, plus ensuring that all
>       packets in a struct tap4_l4_t/tap6_l4_t instance, have the same
>       ttl. After input from David Gibson.
> ---
>  packet.h   |  2 ++
>  tap.c      | 18 ++++++++++++++----
>  udp.c      | 17 ++++++++++++++++-
>  udp.h      |  3 ++-
>  udp_flow.c |  1 +
>  udp_flow.h |  1 +
>  6 files changed, 36 insertions(+), 6 deletions(-)
> 
> diff --git a/packet.h b/packet.h
> index c94780a..e84e123 100644
> --- a/packet.h
> +++ b/packet.h
> @@ -11,6 +11,8 @@
>  /* Maximum size of a single packet stored in pool, including headers */
>  #define PACKET_MAX_LEN	((size_t)UINT16_MAX)
>  
> +#define DEFAULT_TTL 64

If I understood correctly, David's comment to this on v3:

  https://archives.passt.top/passt-dev/Z-om3Ey-HR1Hj8UH@zatzit/

was meant to imply that, as the default value can be changed via
sysctl, the value set via sysctl could be read at start-up. I'm fine
with 64 as well, by the way, with a slight preference for reading the
value via sysctl.

All this might go away, though, please read the comment to
udp_flow_new() below, first.

> +
>  /**
>   * struct pool - Generic pool of packets stored in a buffer
>   * @buf:	Buffer storing packet descriptors,
> diff --git a/tap.c b/tap.c
> index 3a6fcbe..e65d592 100644
> --- a/tap.c
> +++ b/tap.c
> @@ -563,6 +563,7 @@ PACKET_POOL_DECL(pool_l4, UIO_MAXIOV, pkt_buf);
>   * @dest:	Destination port
>   * @saddr:	Source address
>   * @daddr:	Destination address
> + * @ttl:	Time to live
>   * @msg:	Array of messages that can be handled in a single call
>   */
>  static struct tap4_l4_t {
> @@ -574,6 +575,8 @@ static struct tap4_l4_t {
>  	struct in_addr saddr;
>  	struct in_addr daddr;
>  
> +	uint8_t ttl;

If you move this after 'protocol' you save 4 or 8 bytes depending on
the architecture and, perhaps more importantly, with 64-byte cachelines,
you can fit the set of fields involved in the L4_MATCH() comparison
four times instead of three. If you have a look with pahole(1):

--
struct tap4_l4_t {
        uint8_t                    protocol;             /*     0     1 */

        /* XXX 1 byte hole, try to pack */

        uint16_t                   source;               /*     2     2 */
        uint16_t                   dest;                 /*     4     2 */

        /* XXX 2 bytes hole, try to pack */

        struct in_addr             saddr;                /*     8     4 */
        struct in_addr             daddr;                /*    12     4 */
        uint8_t                    ttl;                  /*    16     1 */

        /* XXX 7 bytes hole, try to pack */

	...
}
--

becomes:

--
struct tap4_l4_t {
        uint8_t                    protocol;             /*     0     1 */
        uint8_t                    ttl;                  /*     1     1 */
        uint16_t                   source;               /*     2     2 */
        uint16_t                   dest;                 /*     4     2 */

        /* XXX 2 bytes hole, try to pack */

        struct in_addr             saddr;                /*     8     4 */
        struct in_addr             daddr;                /*    12     4 */
	...
}
--

...if you move it, please don't forget to update the comment to the
struct.

> +
>  	struct pool_l4_t p;
>  } tap4_l4[TAP_SEQS /* Arbitrary: TAP_MSGS in theory, so limit in users */];
>  
> @@ -586,6 +589,7 @@ static struct tap4_l4_t {
>   * @dest:	Destination port
>   * @saddr:	Source address
>   * @daddr:	Destination address
> + * @hop_limit:	Hop limit
>   * @msg:	Array of messages that can be handled in a single call
>   */
>  static struct tap6_l4_t {
> @@ -598,6 +602,8 @@ static struct tap6_l4_t {
>  	struct in6_addr saddr;
>  	struct in6_addr daddr;
>  
> +	uint8_t hop_limit;

Here, instead, it doesn't matter, because 'p' starts at 48 bytes anyway,
and we compare the flow label too.

> +
>  	struct pool_l4_t p;
>  } tap6_l4[TAP_SEQS /* Arbitrary: TAP_MSGS in theory, so limit in users */];
>  
> @@ -786,7 +792,8 @@ resume:
>  #define L4_MATCH(iph, uh, seq)							\
>  	((seq)->protocol == (iph)->protocol &&					\
>  	 (seq)->source   == (uh)->source    && (seq)->dest  == (uh)->dest &&	\
> -	 (seq)->saddr.s_addr == (iph)->saddr && (seq)->daddr.s_addr == (iph)->daddr)
> +	 (seq)->saddr.s_addr == (iph)->saddr &&				\
> +	 (seq)->daddr.s_addr == (iph)->daddr && (seq)->ttl == (iph)->ttl)
>  
>  #define L4_SET(iph, uh, seq)						\
>  	do {								\
> @@ -795,6 +802,7 @@ resume:
>  		(seq)->dest		= (uh)->dest;			\
>  		(seq)->saddr.s_addr	= (iph)->saddr;			\
>  		(seq)->daddr.s_addr	= (iph)->daddr;			\
> +		(seq)->ttl		= (iph)->ttl;			\
>  	} while (0)
>  
>  		if (seq && L4_MATCH(iph, uh, seq) && seq->p.count < UIO_MAXIOV)
> @@ -843,7 +851,7 @@ append:
>  			for (k = 0; k < p->count; )
>  				k += udp_tap_handler(c, PIF_TAP, AF_INET,
>  						     &seq->saddr, &seq->daddr,
> -						     p, k, now);
> +						     seq->ttl, p, k, now);
>  		}
>  	}
>  
> @@ -966,7 +974,8 @@ resume:
>  		 (seq)->dest == (uh)->dest                 &&		\
>  		 (seq)->flow_lbl == ip6_get_flow_lbl(ip6h) &&		\
>  		 IN6_ARE_ADDR_EQUAL(&(seq)->saddr, saddr)  &&		\
> -		 IN6_ARE_ADDR_EQUAL(&(seq)->daddr, daddr))
> +		 IN6_ARE_ADDR_EQUAL(&(seq)->daddr, daddr)  &&		\
> +		 (seq)->hop_limit == (ip6h)->hop_limit)
>  
>  #define L4_SET(ip6h, proto, uh, seq)					\
>  	do {								\
> @@ -976,6 +985,7 @@ resume:
>  		(seq)->flow_lbl	= ip6_get_flow_lbl(ip6h);		\
>  		(seq)->saddr	= *saddr;				\
>  		(seq)->daddr	= *daddr;				\
> +		(seq)->hop_limit = (ip6h)->hop_limit;			\
>  	} while (0)
>  
>  		if (seq && L4_MATCH(ip6h, proto, uh, seq) &&
> @@ -1026,7 +1036,7 @@ append:
>  			for (k = 0; k < p->count; )
>  				k += udp_tap_handler(c, PIF_TAP, AF_INET6,
>  						     &seq->saddr, &seq->daddr,
> -						     p, k, now);
> +						     seq->hop_limit, p, k, now);
>  		}
>  	}
>  
> diff --git a/udp.c b/udp.c
> index 39431d7..bc93292 100644
> --- a/udp.c
> +++ b/udp.c
> @@ -849,6 +849,7 @@ fail:
>   * @af:		Address family, AF_INET or AF_INET6
>   * @saddr:	Source address
>   * @daddr:	Destination address
> + * @ttl:	TTL or hop limit for packets to be sent in this call
>   * @p:		Pool of UDP packets, with UDP headers
>   * @idx:	Index of first packet to process
>   * @now:	Current timestamp
> @@ -859,7 +860,8 @@ fail:
>   */
>  int udp_tap_handler(const struct ctx *c, uint8_t pif,
>  		    sa_family_t af, const void *saddr, const void *daddr,
> -		    const struct pool *p, int idx, const struct timespec *now)
> +		    uint8_t ttl, const struct pool *p, int idx,
> +		    const struct timespec *now)
>  {
>  	const struct flowside *toside;
>  	struct mmsghdr mm[UIO_MAXIOV];
> @@ -938,6 +940,19 @@ int udp_tap_handler(const struct ctx *c, uint8_t pif,
>  		mm[i].msg_hdr.msg_controllen = 0;
>  		mm[i].msg_hdr.msg_flags = 0;
>  
> +		if (ttl != uflow->ttl[tosidx.sidei]) {
> +			uflow->ttl[tosidx.sidei] = ttl;
> +			if (af == AF_INET) {
> +				if (setsockopt(s, IPPROTO_IP, IP_TTL,
> +					       &ttl, sizeof(ttl)) < 0)
> +					perror("setsockopt (IP_TTL)");

This would print to file descriptor 2 even if it's a socket. It should
be err_perror() instead, but now we also have flow_perror() which
prints flow index and type, given 'uflow' here, say:

					flow_perror(uflow, "IP_TTL setsockopt");

> +			} else {
> +				if (setsockopt(s, IPPROTO_IPV6, IPV6_HOPLIMIT,
> +					       &ttl, sizeof(ttl)) < 0)
> +					perror("setsockopt (IP_TTL)");

...and this is IPV6_HOPLIMIT, not IP_TTL, so perhaps:

					flow_perror(uflow,
						    "setsockopt IPV6_HOPLIMIT");

> +			}
> +		}
> +
>  		count++;
>  	}
>  
> diff --git a/udp.h b/udp.h
> index de2df6d..041fad4 100644
> --- a/udp.h
> +++ b/udp.h
> @@ -15,7 +15,8 @@ void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
>  			    uint32_t events, const struct timespec *now);
>  int udp_tap_handler(const struct ctx *c, uint8_t pif,
>  		    sa_family_t af, const void *saddr, const void *daddr,
> -		    const struct pool *p, int idx, const struct timespec *now);
> +		    uint8_t  ttl, const struct pool *p, int idx,

Excess whitespace between 'uint8_t' and 'ttl'.

> +		    const struct timespec *now);
>  int udp_sock_init(const struct ctx *c, int ns, const union inany_addr *addr,
>  		  const char *ifname, in_port_t port);
>  int udp_init(struct ctx *c);
> diff --git a/udp_flow.c b/udp_flow.c
> index bf4b896..39372c2 100644
> --- a/udp_flow.c
> +++ b/udp_flow.c
> @@ -137,6 +137,7 @@ static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow,
>  	uflow = FLOW_SET_TYPE(flow, FLOW_UDP, udp);
>  	uflow->ts = now->tv_sec;
>  	uflow->s[INISIDE] = uflow->s[TGTSIDE] = -1;
> +	uflow->ttl[INISIDE] = uflow->ttl[TGTSIDE] = DEFAULT_TTL;

By the way, instead of using a default value, what about fetching the
current value with getsockopt()?

One additional system call per UDP flow doesn't feel like a lot of
overhead, and we can be sure it's correct, no matter if the user
configures a different value before or after we start.

>  
>  	if (s_ini >= 0) {
>  		/* When using auto port-scanning the listening port could go
> diff --git a/udp_flow.h b/udp_flow.h
> index 9a1b059..606ac08 100644
> --- a/udp_flow.h
> +++ b/udp_flow.h
> @@ -21,6 +21,7 @@ struct udp_flow {
>  	bool closed :1;
>  	time_t ts;
>  	int s[SIDES];
> +	uint8_t ttl[SIDES];

This should be added to the struct comment above, which, by mistake,
seems to refer to 'struct udp' by the way (I would fix that right away
while at it...).

>  };
>  
>  struct udp_flow *udp_at_sidx(flow_sidx_t sidx);

-- 
Stefano


^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH v4] udp: support traceroute
  2025-04-03 15:48 ` Stefano Brivio
@ 2025-04-03 20:27   ` Jon Maloy
  2025-04-03 23:31     ` David Gibson
  0 siblings, 1 reply; 4+ messages in thread
From: Jon Maloy @ 2025-04-03 20:27 UTC (permalink / raw)
  To: passt-dev



On 2025-04-03 11:48, Stefano Brivio wrote:
> The implementation looks solid to me, a list of nits (or a bit
> more) below.
> 
> By the way, I don't think you need to Cc: people who are already on
> this list unless you specifically want their attention.
> 
> On Wed,  2 Apr 2025 22:22:29 -0400
> Jon Maloy <jmaloy@redhat.com> wrote:
> 
>> Now that ICMP pass-through from socket-to-tap is in place, it is
>> easy to support UDP based traceroute functionality in direction
>> tap-to-socket.
>>
>> We fix that  in this commit.
>>
>> Signed-off-by: Jon Maloy <jmaloy@redhat.com>
> 
> This fixes https://bugs.passt.top/show_bug.cgi?id=64 ("Link:" tag) if I
> understood correctly.
> 
>> ---
>> v2: - Using ancillary data instead of setsockopt to transfer outgoing
>>        TTL.
>>      - Support IPv6
>> v3: - Storing ttl per packet instead of per flow. This may not be
>>        elegant, but much less intrusive than changing the flow

[...]

>> @@ -11,6 +11,8 @@
>>   /* Maximum size of a single packet stored in pool, including headers */
>>   #define PACKET_MAX_LEN	((size_t)UINT16_MAX)
>>   
>> +#define DEFAULT_TTL 64
> 
> If I understood correctly, David's comment to this on v3:
> 
>    https://archives.passt.top/passt-dev/Z-om3Ey-HR1Hj8UH@zatzit/
> 
> was meant to imply that, as the default value can be changed via
> sysctl, the value set via sysctl could be read at start-up. I'm fine
> with 64 as well, by the way, with a slight preference for reading the
> value via sysctl.

I don't think the local host/container setting will have any effect
if the sending guest is a VM. The benefit of this is dubious.

> 
> All this might go away, though, please read the comment to
> udp_flow_new() below, first.
> 
>> +
>>   /**
>>    * struct pool - Generic pool of packets stored in a buffer
>>    * @buf:	Buffer storing packet descriptors,
>> diff --git a/tap.c b/tap.c
>> index 3a6fcbe..e65d592 100644
>> --- a/tap.c
>> +++ b/tap.c
>> @@ -563,6 +563,7 @@ PACKET_POOL_DECL(pool_l4, UIO_MAXIOV, pkt_buf);
>>    * @dest:	Destination port
>>    * @saddr:	Source address
>>    * @daddr:	Destination address
>> + * @ttl:	Time to live
>>    * @msg:	Array of messages that can be handled in a single call
>>    */
>>   static struct tap4_l4_t {
>> @@ -574,6 +575,8 @@ static struct tap4_l4_t {
>>   	struct in_addr saddr;
>>   	struct in_addr daddr;
>>   
>> +	uint8_t ttl;
> 
> If you move this after 'protocol' you save 4 or 8 bytes depending on
> the architecture and, perhaps more importantly, with 64-byte cachelines,
> you can fit the set of fields involved in the L4_MATCH() comparison
> four times instead of three. If you have a look with pahole(1):
> 
> --
> struct tap4_l4_t {
>          uint8_t                    protocol;             /*     0     1 */
> 
>          /* XXX 1 byte hole, try to pack */
> 
>          uint16_t                   source;               /*     2     2 */
>          uint16_t                   dest;                 /*     4     2 */
> 
>          /* XXX 2 bytes hole, try to pack */
> 
>          struct in_addr             saddr;                /*     8     4 */
>          struct in_addr             daddr;                /*    12     4 */
>          uint8_t                    ttl;                  /*    16     1 */
> 
>          /* XXX 7 bytes hole, try to pack */
> 
> 	...
> }
> --
> 
> becomes:
> 
> --
> struct tap4_l4_t {
>          uint8_t                    protocol;             /*     0     1 */
>          uint8_t                    ttl;                  /*     1     1 */
>          uint16_t                   source;               /*     2     2 */
>          uint16_t                   dest;                 /*     4     2 */
> 
>          /* XXX 2 bytes hole, try to pack */
> 
>          struct in_addr             saddr;                /*     8     4 */
>          struct in_addr             daddr;                /*    12     4 */
> 	...
> }
Good point. I  didn't notice.

> --
> 
> ...if you move it, please don't forget to update the comment to the
> struct.
> 
>> +
>>   	struct pool_l4_t p;

[...]
>>   	const struct flowside *toside;
>>   	struct mmsghdr mm[UIO_MAXIOV];
>> @@ -938,6 +940,19 @@ int udp_tap_handler(const struct ctx *c, uint8_t pif,
>>   		mm[i].msg_hdr.msg_controllen = 0;
>>   		mm[i].msg_hdr.msg_flags = 0;
>>   
>> +		if (ttl != uflow->ttl[tosidx.sidei]) {
>> +			uflow->ttl[tosidx.sidei] = ttl;
>> +			if (af == AF_INET) {
>> +				if (setsockopt(s, IPPROTO_IP, IP_TTL,
>> +					       &ttl, sizeof(ttl)) < 0)
>> +					perror("setsockopt (IP_TTL)");
> 
> This would print to file descriptor 2 even if it's a socket. It should
> be err_perror() instead, but now we also have flow_perror() which
> prints flow index and type, given 'uflow' here, say:
> 
> 					flow_perror(uflow, "IP_TTL setsockopt");
> 
>> +			} else {
>> +				if (setsockopt(s, IPPROTO_IPV6, IPV6_HOPLIMIT,
>> +					       &ttl, sizeof(ttl)) < 0)
>> +					perror("setsockopt (IP_TTL)");
> 
> ...and this is IPV6_HOPLIMIT, not IP_TTL, so perhaps:
> 
> 					flow_perror(uflow,
> 						    "setsockopt IPV6_HOPLIMIT");
>
Ok.

>> +			}
>> +		}
>> +
>>   		count++;
>>   	}
>>   
>> diff --git a/udp.h b/udp.h
>> index de2df6d..041fad4 100644
>> --- a/udp.h
>> +++ b/udp.h
>> @@ -15,7 +15,8 @@ void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
>>   			    uint32_t events, const struct timespec *now);
>>   int udp_tap_handler(const struct ctx *c, uint8_t pif,
>>   		    sa_family_t af, const void *saddr, const void *daddr,
>> -		    const struct pool *p, int idx, const struct timespec *now);
>> +		    uint8_t  ttl, const struct pool *p, int idx,
> 
> > Excess whitespace between 'uint8_t' and 'ttl'.
> 
>> +		    const struct timespec *now);
>>   int udp_sock_init(const struct ctx *c, int ns, const union inany_addr *addr,
>>   		  const char *ifname, in_port_t port);
>>   int udp_init(struct ctx *c);
>> diff --git a/udp_flow.c b/udp_flow.c
>> index bf4b896..39372c2 100644
>> --- a/udp_flow.c
>> +++ b/udp_flow.c
>> @@ -137,6 +137,7 @@ static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow,
>>   	uflow = FLOW_SET_TYPE(flow, FLOW_UDP, udp);
>>   	uflow->ts = now->tv_sec;
>>   	uflow->s[INISIDE] = uflow->s[TGTSIDE] = -1;
>> +	uflow->ttl[INISIDE] = uflow->ttl[TGTSIDE] = DEFAULT_TTL;
> 
> By the way, instead of using a default value, what about fetching the
> current value with getsockopt()?
> 
> One additional system call per UDP flow doesn't feel like a lot of
> overhead, and we can be sure it's correct, no matter if the user
> configures a different value before or after we start.
> 
This patch fixes UDP messaging tap->socket, and TTL may have any
value in the first arriving packet. Reading it from the socket here only 
makes sense when I add the same support in direction socket->tap.
That is my next project.

>>   
>>   	if (s_ini >= 0) {
>>   		/* When using auto port-scanning the listening port could go
>> diff --git a/udp_flow.h b/udp_flow.h
>> index 9a1b059..606ac08 100644
>> --- a/udp_flow.h
>> +++ b/udp_flow.h
>> @@ -21,6 +21,7 @@ struct udp_flow {
>>   	bool closed :1;
>>   	time_t ts;
>>   	int s[SIDES];
>> +	uint8_t ttl[SIDES];
> 
> > This should be added to the struct comment above, which, by mistake,
> seems to refer to 'struct udp' by the way (I would fix that right away
> while at it...).

ok.

///jon

> 
>>   };
>>   
>>   struct udp_flow *udp_at_sidx(flow_sidx_t sidx);
> 


^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH v4] udp: support traceroute
  2025-04-03 20:27   ` Jon Maloy
@ 2025-04-03 23:31     ` David Gibson
  0 siblings, 0 replies; 4+ messages in thread
From: David Gibson @ 2025-04-03 23:31 UTC (permalink / raw)
  To: Jon Maloy; +Cc: passt-dev

[-- Attachment #1: Type: text/plain, Size: 7085 bytes --]

On Thu, Apr 03, 2025 at 04:27:12PM -0400, Jon Maloy wrote:
> 
> 
> On 2025-04-03 11:48, Stefano Brivio wrote:
> > The implementation looks solid to me, a list of nits (or a bit
> > more) below.
> > 
> > By the way, I don't think you need to Cc: people who are already on
> > this list unless you specifically want their attention.
> > 
> > On Wed,  2 Apr 2025 22:22:29 -0400
> > Jon Maloy <jmaloy@redhat.com> wrote:
> > 
> > > Now that ICMP pass-through from socket-to-tap is in place, it is
> > > easy to support UDP based traceroute functionality in direction
> > > tap-to-socket.
> > > 
> > > We fix that  in this commit.
> > > 
> > > Signed-off-by: Jon Maloy <jmaloy@redhat.com>
> > 
> > This fixes https://bugs.passt.top/show_bug.cgi?id=64 ("Link:" tag) if I
> > understood correctly.
> > 
> > > ---
> > > v2: - Using ancillary data instead of setsockopt to transfer outgoing
> > >        TTL.
> > >      - Support IPv6
> > > v3: - Storing ttl per packet instead of per flow. This may not be
> > >        elegant, but much less intrusive than changing the flow
> 
> [...]
> 
> > > @@ -11,6 +11,8 @@
> > >   /* Maximum size of a single packet stored in pool, including headers */
> > >   #define PACKET_MAX_LEN	((size_t)UINT16_MAX)
> > > +#define DEFAULT_TTL 64
> > 
> > If I understood correctly, David's comment to this on v3:
> > 
> >    https://archives.passt.top/passt-dev/Z-om3Ey-HR1Hj8UH@zatzit/
> > 
> > was meant to imply that, as the default value can be changed via
> > sysctl, the value set via sysctl could be read at start-up. I'm fine
> > with 64 as well, by the way, with a slight preference for reading the
> > value via sysctl.
> 
> I don't think the local host/container setting will have any effect
> if the sending guest is a VM.

That's true, but..

> The benefit of this is dubious.

.. uflow->ttl[] isn't so much representing what the guest set, as a
cache of what the socket is sending and that *does* depend on the host
value.

> 
> > 
> > All this might go away, though, please read the comment to
> > udp_flow_new() below, first.
> > 
> > > +
> > >   /**
> > >    * struct pool - Generic pool of packets stored in a buffer
> > >    * @buf:	Buffer storing packet descriptors,
> > > diff --git a/tap.c b/tap.c
> > > index 3a6fcbe..e65d592 100644
> > > --- a/tap.c
> > > +++ b/tap.c
> > > @@ -563,6 +563,7 @@ PACKET_POOL_DECL(pool_l4, UIO_MAXIOV, pkt_buf);
> > >    * @dest:	Destination port
> > >    * @saddr:	Source address
> > >    * @daddr:	Destination address
> > > + * @ttl:	Time to live
> > >    * @msg:	Array of messages that can be handled in a single call
> > >    */
> > >   static struct tap4_l4_t {
> > > @@ -574,6 +575,8 @@ static struct tap4_l4_t {
> > >   	struct in_addr saddr;
> > >   	struct in_addr daddr;
> > > +	uint8_t ttl;
> > 
> > If you move this after 'protocol' you save 4 or 8 bytes depending on
> > the architecture and, perhaps more importantly, with 64-byte cachelines,
> > you can fit the set of fields involved in the L4_MATCH() comparison
> > four times instead of three. If you have a look with pahole(1):
> > 
> Good point. I  didn't notice.
> 
> 
> [...]
> > >   	const struct flowside *toside;
> > >   	struct mmsghdr mm[UIO_MAXIOV];
> > > @@ -938,6 +940,19 @@ int udp_tap_handler(const struct ctx *c, uint8_t pif,
> > >   		mm[i].msg_hdr.msg_controllen = 0;
> > >   		mm[i].msg_hdr.msg_flags = 0;
> > > +		if (ttl != uflow->ttl[tosidx.sidei]) {
> > > +			uflow->ttl[tosidx.sidei] = ttl;
> > > +			if (af == AF_INET) {
> > > +				if (setsockopt(s, IPPROTO_IP, IP_TTL,
> > > +					       &ttl, sizeof(ttl)) < 0)
> > > +					perror("setsockopt (IP_TTL)");
> > 
> > This would print to file descriptor 2 even if it's a socket. It should
> > be err_perror() instead, but now we also have flow_perror() which
> > prints flow index and type, given 'uflow' here, say:
> > 
> > 					flow_perror(uflow, "IP_TTL setsockopt");
> > 
> > > +			} else {
> > > +				if (setsockopt(s, IPPROTO_IPV6, IPV6_HOPLIMIT,
> > > +					       &ttl, sizeof(ttl)) < 0)
> > > +					perror("setsockopt (IP_TTL)");
> > 
> > ...and this is IPV6_HOPLIMIT, not IP_TTL, so perhaps:
> > 
> > 					flow_perror(uflow,
> > 						    "setsockopt IPV6_HOPLIMIT");
> > 
> Ok.
> 
> > > +			}
> > > +		}
> > > +
> > >   		count++;
> > >   	}
> > > diff --git a/udp.h b/udp.h
> > > index de2df6d..041fad4 100644
> > > --- a/udp.h
> > > +++ b/udp.h
> > > @@ -15,7 +15,8 @@ void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
> > >   			    uint32_t events, const struct timespec *now);
> > >   int udp_tap_handler(const struct ctx *c, uint8_t pif,
> > >   		    sa_family_t af, const void *saddr, const void *daddr,
> > > -		    const struct pool *p, int idx, const struct timespec *now);
> > > +		    uint8_t  ttl, const struct pool *p, int idx,
> > 
> > > Excess whitespace between 'uint8_t' and 'ttl'.
> > 
> > > +		    const struct timespec *now);
> > >   int udp_sock_init(const struct ctx *c, int ns, const union inany_addr *addr,
> > >   		  const char *ifname, in_port_t port);
> > >   int udp_init(struct ctx *c);
> > > diff --git a/udp_flow.c b/udp_flow.c
> > > index bf4b896..39372c2 100644
> > > --- a/udp_flow.c
> > > +++ b/udp_flow.c
> > > @@ -137,6 +137,7 @@ static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow,
> > >   	uflow = FLOW_SET_TYPE(flow, FLOW_UDP, udp);
> > >   	uflow->ts = now->tv_sec;
> > >   	uflow->s[INISIDE] = uflow->s[TGTSIDE] = -1;
> > > +	uflow->ttl[INISIDE] = uflow->ttl[TGTSIDE] = DEFAULT_TTL;
> > 
> > By the way, instead of using a default value, what about fetching the
> > current value with getsockopt()?
> > 
> > One additional system call per UDP flow doesn't feel like a lot of
> > overhead, and we can be sure it's correct, no matter if the user
> > configures a different value before or after we start.
> > 
> This patch fixes UDP messaging tap->socket, and TTL may have any
> value in the first arriving packet. Reading it from the socket here only
> makes sense when I add the same support in direction socket->tap.
> That is my next project.
> 
> > >   	if (s_ini >= 0) {
> > >   		/* When using auto port-scanning the listening port could go
> > > diff --git a/udp_flow.h b/udp_flow.h
> > > index 9a1b059..606ac08 100644
> > > --- a/udp_flow.h
> > > +++ b/udp_flow.h
> > > @@ -21,6 +21,7 @@ struct udp_flow {
> > >   	bool closed :1;
> > >   	time_t ts;
> > >   	int s[SIDES];
> > > +	uint8_t ttl[SIDES];
> > 
> > > This should be added to the struct comment above, which, by mistake,
> > seems to refer to 'struct udp' by the way (I would fix that right away
> > while at it...).
> 
> ok.
> 
> ///jon
> 
> > 
> > >   };
> > >   struct udp_flow *udp_at_sidx(flow_sidx_t sidx);
> > 
> 

-- 
David Gibson (he or they)	| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you, not the other way
				| around.
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2025-04-03 23:35 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-04-03  2:22 [PATCH v4] udp: support traceroute Jon Maloy
2025-04-03 15:48 ` Stefano Brivio
2025-04-03 20:27   ` Jon Maloy
2025-04-03 23:31     ` David Gibson

Code repositories for project(s) associated with this public inbox

	https://passt.top/passt

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for IMAP folder(s).