public inbox for passt-dev@passt.top
 help / color / mirror / code / Atom feed
* [PATCH v5] udp: support traceroute
@ 2025-04-03 22:27 Jon Maloy
  2025-04-03 23:40 ` David Gibson
  0 siblings, 1 reply; 5+ messages in thread
From: Jon Maloy @ 2025-04-03 22:27 UTC (permalink / raw)
  To: passt-dev

Now that ICMP pass-through from socket-to-tap is in place, it is
easy to support UDP based traceroute functionality in direction
tap-to-socket.

We fix that  in this commit.

Link: https://bugs.passt.top/show_bug.cgi?id=64
Signed-off-by: Jon Maloy <jmaloy@redhat.com>
---
v2: - Using ancillary data instead of setsockopt to transfer outgoing
      TTL.
    - Support IPv6
v3: - Storing ttl per packet instead of per flow. This may not be
      elegant, but much less intrusive than changing the flow
      criteria. This eliminates the need for the extra, flow-changing
      patch we introduced in v2.
v4: - Going back to something similar to the original solution, but
      storing current ttl in struct udp_flow, plus ensuring that all
      packets in a struct tap4_l4_t/tap6_l4_t instance have the same
      ttl. After input from David Gibson.
v5: - Some minor fixes after feedback from Stefano Brivio.
---
 packet.h   |  2 ++
 tap.c      | 17 +++++++++++++----
 udp.c      | 19 ++++++++++++++++++-
 udp.h      |  3 ++-
 udp_flow.c |  1 +
 udp_flow.h |  4 +++-
 6 files changed, 39 insertions(+), 7 deletions(-)

diff --git a/packet.h b/packet.h
index c94780a..e84e123 100644
--- a/packet.h
+++ b/packet.h
@@ -11,6 +11,8 @@
 /* Maximum size of a single packet stored in pool, including headers */
 #define PACKET_MAX_LEN	((size_t)UINT16_MAX)
 
+#define DEFAULT_TTL 64
+
 /**
  * struct pool - Generic pool of packets stored in a buffer
  * @buf:	Buffer storing packet descriptors,
diff --git a/tap.c b/tap.c
index 3a6fcbe..d630f6d 100644
--- a/tap.c
+++ b/tap.c
@@ -559,6 +559,7 @@ PACKET_POOL_DECL(pool_l4, UIO_MAXIOV, pkt_buf);
  * struct l4_seq4_t - Message sequence for one protocol handler call, IPv4
  * @msgs:	Count of messages in sequence
  * @protocol:	Protocol number
+ * @ttl:	Time to live
  * @source:	Source port
  * @dest:	Destination port
  * @saddr:	Source address
@@ -567,6 +568,7 @@ PACKET_POOL_DECL(pool_l4, UIO_MAXIOV, pkt_buf);
  */
 static struct tap4_l4_t {
 	uint8_t protocol;
+	uint8_t ttl;
 
 	uint16_t source;
 	uint16_t dest;
@@ -586,6 +588,7 @@ static struct tap4_l4_t {
  * @dest:	Destination port
  * @saddr:	Source address
  * @daddr:	Destination address
+ * @hop_limit:	Hop limit
  * @msg:	Array of messages that can be handled in a single call
  */
 static struct tap6_l4_t {
@@ -598,6 +601,8 @@ static struct tap6_l4_t {
 	struct in6_addr saddr;
 	struct in6_addr daddr;
 
+	uint8_t hop_limit;
+
 	struct pool_l4_t p;
 } tap6_l4[TAP_SEQS /* Arbitrary: TAP_MSGS in theory, so limit in users */];
 
@@ -786,7 +791,8 @@ resume:
 #define L4_MATCH(iph, uh, seq)							\
 	((seq)->protocol == (iph)->protocol &&					\
 	 (seq)->source   == (uh)->source    && (seq)->dest  == (uh)->dest &&	\
-	 (seq)->saddr.s_addr == (iph)->saddr && (seq)->daddr.s_addr == (iph)->daddr)
+	 (seq)->saddr.s_addr == (iph)->saddr &&				\
+	 (seq)->daddr.s_addr == (iph)->daddr && (seq)->ttl == (iph)->ttl)
 
 #define L4_SET(iph, uh, seq)						\
 	do {								\
@@ -795,6 +801,7 @@ resume:
 		(seq)->dest		= (uh)->dest;			\
 		(seq)->saddr.s_addr	= (iph)->saddr;			\
 		(seq)->daddr.s_addr	= (iph)->daddr;			\
+		(seq)->ttl		= (iph)->ttl;			\
 	} while (0)
 
 		if (seq && L4_MATCH(iph, uh, seq) && seq->p.count < UIO_MAXIOV)
@@ -843,7 +850,7 @@ append:
 			for (k = 0; k < p->count; )
 				k += udp_tap_handler(c, PIF_TAP, AF_INET,
 						     &seq->saddr, &seq->daddr,
-						     p, k, now);
+						     seq->ttl, p, k, now);
 		}
 	}
 
@@ -966,7 +973,8 @@ resume:
 		 (seq)->dest == (uh)->dest                 &&		\
 		 (seq)->flow_lbl == ip6_get_flow_lbl(ip6h) &&		\
 		 IN6_ARE_ADDR_EQUAL(&(seq)->saddr, saddr)  &&		\
-		 IN6_ARE_ADDR_EQUAL(&(seq)->daddr, daddr))
+		 IN6_ARE_ADDR_EQUAL(&(seq)->daddr, daddr)  &&		\
+		 (seq)->hop_limit == (ip6h)->hop_limit)
 
 #define L4_SET(ip6h, proto, uh, seq)					\
 	do {								\
@@ -976,6 +984,7 @@ resume:
 		(seq)->flow_lbl	= ip6_get_flow_lbl(ip6h);		\
 		(seq)->saddr	= *saddr;				\
 		(seq)->daddr	= *daddr;				\
+		(seq)->hop_limit = (ip6h)->hop_limit;			\
 	} while (0)
 
 		if (seq && L4_MATCH(ip6h, proto, uh, seq) &&
@@ -1026,7 +1035,7 @@ append:
 			for (k = 0; k < p->count; )
 				k += udp_tap_handler(c, PIF_TAP, AF_INET6,
 						     &seq->saddr, &seq->daddr,
-						     p, k, now);
+						     seq->hop_limit, p, k, now);
 		}
 	}
 
diff --git a/udp.c b/udp.c
index 39431d7..618a4e2 100644
--- a/udp.c
+++ b/udp.c
@@ -849,6 +849,7 @@ fail:
  * @af:		Address family, AF_INET or AF_INET6
  * @saddr:	Source address
  * @daddr:	Destination address
+ * @ttl:	TTL or hop limit for packets to be sent in this call
  * @p:		Pool of UDP packets, with UDP headers
  * @idx:	Index of first packet to process
  * @now:	Current timestamp
@@ -859,7 +860,8 @@ fail:
  */
 int udp_tap_handler(const struct ctx *c, uint8_t pif,
 		    sa_family_t af, const void *saddr, const void *daddr,
-		    const struct pool *p, int idx, const struct timespec *now)
+		    uint8_t ttl, const struct pool *p, int idx,
+		    const struct timespec *now)
 {
 	const struct flowside *toside;
 	struct mmsghdr mm[UIO_MAXIOV];
@@ -938,6 +940,21 @@ int udp_tap_handler(const struct ctx *c, uint8_t pif,
 		mm[i].msg_hdr.msg_controllen = 0;
 		mm[i].msg_hdr.msg_flags = 0;
 
+		if (ttl != uflow->ttl[tosidx.sidei]) {
+			uflow->ttl[tosidx.sidei] = ttl;
+			if (af == AF_INET) {
+				if (setsockopt(s, IPPROTO_IP, IP_TTL,
+					       &ttl, sizeof(ttl)) < 0)
+					flow_perror(uflow,
+						    "setsockopt IP_TTL");
+			} else {
+				if (setsockopt(s, IPPROTO_IPV6, IPV6_HOPLIMIT,
+					       &ttl, sizeof(ttl)) < 0)
+					flow_perror(uflow,
+						    "setsockopt IPV6_HOPLIMIT");
+			}
+		}
+
 		count++;
 	}
 
diff --git a/udp.h b/udp.h
index de2df6d..a811475 100644
--- a/udp.h
+++ b/udp.h
@@ -15,7 +15,8 @@ void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
 			    uint32_t events, const struct timespec *now);
 int udp_tap_handler(const struct ctx *c, uint8_t pif,
 		    sa_family_t af, const void *saddr, const void *daddr,
-		    const struct pool *p, int idx, const struct timespec *now);
+		    uint8_t ttl, const struct pool *p, int idx,
+		    const struct timespec *now);
 int udp_sock_init(const struct ctx *c, int ns, const union inany_addr *addr,
 		  const char *ifname, in_port_t port);
 int udp_init(struct ctx *c);
diff --git a/udp_flow.c b/udp_flow.c
index bf4b896..39372c2 100644
--- a/udp_flow.c
+++ b/udp_flow.c
@@ -137,6 +137,7 @@ static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow,
 	uflow = FLOW_SET_TYPE(flow, FLOW_UDP, udp);
 	uflow->ts = now->tv_sec;
 	uflow->s[INISIDE] = uflow->s[TGTSIDE] = -1;
+	uflow->ttl[INISIDE] = uflow->ttl[TGTSIDE] = DEFAULT_TTL;
 
 	if (s_ini >= 0) {
 		/* When using auto port-scanning the listening port could go
diff --git a/udp_flow.h b/udp_flow.h
index 9a1b059..520de62 100644
--- a/udp_flow.h
+++ b/udp_flow.h
@@ -8,11 +8,12 @@
 #define UDP_FLOW_H
 
 /**
- * struct udp - Descriptor for a flow of UDP packets
+ * struct udp_flow - Descriptor for a flow of UDP packets
  * @f:		Generic flow information
  * @closed:	Flow is already closed
  * @ts:		Activity timestamp
  * @s:		Socket fd (or -1) for each side of the flow
+ * @ttl:	TTL or hop_limit for both sides
  */
 struct udp_flow {
 	/* Must be first element */
@@ -21,6 +22,7 @@ struct udp_flow {
 	bool closed :1;
 	time_t ts;
 	int s[SIDES];
+	uint8_t ttl[SIDES];
 };
 
 struct udp_flow *udp_at_sidx(flow_sidx_t sidx);
-- 
@@ -8,11 +8,12 @@
 #define UDP_FLOW_H
 
 /**
- * struct udp - Descriptor for a flow of UDP packets
+ * struct udp_flow - Descriptor for a flow of UDP packets
  * @f:		Generic flow information
  * @closed:	Flow is already closed
  * @ts:		Activity timestamp
  * @s:		Socket fd (or -1) for each side of the flow
+ * @ttl:	TTL or hop_limit for both sides
  */
 struct udp_flow {
 	/* Must be first element */
@@ -21,6 +22,7 @@ struct udp_flow {
 	bool closed :1;
 	time_t ts;
 	int s[SIDES];
+	uint8_t ttl[SIDES];
 };
 
 struct udp_flow *udp_at_sidx(flow_sidx_t sidx);
-- 
2.48.1


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH v5] udp: support traceroute
  2025-04-03 22:27 [PATCH v5] udp: support traceroute Jon Maloy
@ 2025-04-03 23:40 ` David Gibson
  2025-04-04  0:57   ` David Gibson
  0 siblings, 1 reply; 5+ messages in thread
From: David Gibson @ 2025-04-03 23:40 UTC (permalink / raw)
  To: Jon Maloy; +Cc: passt-dev

[-- Attachment #1: Type: text/plain, Size: 8688 bytes --]

On Thu, Apr 03, 2025 at 06:27:06PM -0400, Jon Maloy wrote:
> Now that ICMP pass-through from socket-to-tap is in place, it is
> easy to support UDP based traceroute functionality in direction
> tap-to-socket.
> 
> We fix that  in this commit.
> 
> Link: https://bugs.passt.top/show_bug.cgi?id=64
> Signed-off-by: Jon Maloy <jmaloy@redhat.com>

Reviewed-by: David Gibson <david@gibson.dropbear.id.au>

One commont below.

> ---
> v2: - Using ancillary data instead of setsockopt to transfer outgoing
>       TTL.
>     - Support IPv6
> v3: - Storing ttl per packet instead of per flow. This may not be
>       elegant, but much less intrusive than changing the flow
>       criteria. This eliminates the need for the extra, flow-changing
>       patch we introduced in v2.
> v4: - Going back to something similar to the original solution, but
>       storing current ttl in struct udp_flow, plus ensuring that all
>       packets in a struct tap4_l4_t/tap6_l4_t instance have the same
>       ttl. After input from David Gibson.
> v5: - Some minor fixes after feedback from Stefano Brivio.
> ---
>  packet.h   |  2 ++
>  tap.c      | 17 +++++++++++++----
>  udp.c      | 19 ++++++++++++++++++-
>  udp.h      |  3 ++-
>  udp_flow.c |  1 +
>  udp_flow.h |  4 +++-
>  6 files changed, 39 insertions(+), 7 deletions(-)
> 
> diff --git a/packet.h b/packet.h
> index c94780a..e84e123 100644
> --- a/packet.h
> +++ b/packet.h
> @@ -11,6 +11,8 @@
>  /* Maximum size of a single packet stored in pool, including headers */
>  #define PACKET_MAX_LEN	((size_t)UINT16_MAX)
>  
> +#define DEFAULT_TTL 64

This is still fixed, rather than either probing the sysctl or using
getsockopt() to determine the initial value.  I don't think we want to
delay this further to change that, but it could be a reasonable
follow up improvement.

> +
>  /**
>   * struct pool - Generic pool of packets stored in a buffer
>   * @buf:	Buffer storing packet descriptors,
> diff --git a/tap.c b/tap.c
> index 3a6fcbe..d630f6d 100644
> --- a/tap.c
> +++ b/tap.c
> @@ -559,6 +559,7 @@ PACKET_POOL_DECL(pool_l4, UIO_MAXIOV, pkt_buf);
>   * struct l4_seq4_t - Message sequence for one protocol handler call, IPv4
>   * @msgs:	Count of messages in sequence
>   * @protocol:	Protocol number
> + * @ttl:	Time to live
>   * @source:	Source port
>   * @dest:	Destination port
>   * @saddr:	Source address
> @@ -567,6 +568,7 @@ PACKET_POOL_DECL(pool_l4, UIO_MAXIOV, pkt_buf);
>   */
>  static struct tap4_l4_t {
>  	uint8_t protocol;
> +	uint8_t ttl;
>  
>  	uint16_t source;
>  	uint16_t dest;
> @@ -586,6 +588,7 @@ static struct tap4_l4_t {
>   * @dest:	Destination port
>   * @saddr:	Source address
>   * @daddr:	Destination address
> + * @hop_limit:	Hop limit
>   * @msg:	Array of messages that can be handled in a single call
>   */
>  static struct tap6_l4_t {
> @@ -598,6 +601,8 @@ static struct tap6_l4_t {
>  	struct in6_addr saddr;
>  	struct in6_addr daddr;
>  
> +	uint8_t hop_limit;
> +
>  	struct pool_l4_t p;
>  } tap6_l4[TAP_SEQS /* Arbitrary: TAP_MSGS in theory, so limit in users */];
>  
> @@ -786,7 +791,8 @@ resume:
>  #define L4_MATCH(iph, uh, seq)							\
>  	((seq)->protocol == (iph)->protocol &&					\
>  	 (seq)->source   == (uh)->source    && (seq)->dest  == (uh)->dest &&	\
> -	 (seq)->saddr.s_addr == (iph)->saddr && (seq)->daddr.s_addr == (iph)->daddr)
> +	 (seq)->saddr.s_addr == (iph)->saddr &&				\
> +	 (seq)->daddr.s_addr == (iph)->daddr && (seq)->ttl == (iph)->ttl)
>  
>  #define L4_SET(iph, uh, seq)						\
>  	do {								\
> @@ -795,6 +801,7 @@ resume:
>  		(seq)->dest		= (uh)->dest;			\
>  		(seq)->saddr.s_addr	= (iph)->saddr;			\
>  		(seq)->daddr.s_addr	= (iph)->daddr;			\
> +		(seq)->ttl		= (iph)->ttl;			\
>  	} while (0)
>  
>  		if (seq && L4_MATCH(iph, uh, seq) && seq->p.count < UIO_MAXIOV)
> @@ -843,7 +850,7 @@ append:
>  			for (k = 0; k < p->count; )
>  				k += udp_tap_handler(c, PIF_TAP, AF_INET,
>  						     &seq->saddr, &seq->daddr,
> -						     p, k, now);
> +						     seq->ttl, p, k, now);
>  		}
>  	}
>  
> @@ -966,7 +973,8 @@ resume:
>  		 (seq)->dest == (uh)->dest                 &&		\
>  		 (seq)->flow_lbl == ip6_get_flow_lbl(ip6h) &&		\
>  		 IN6_ARE_ADDR_EQUAL(&(seq)->saddr, saddr)  &&		\
> -		 IN6_ARE_ADDR_EQUAL(&(seq)->daddr, daddr))
> +		 IN6_ARE_ADDR_EQUAL(&(seq)->daddr, daddr)  &&		\
> +		 (seq)->hop_limit == (ip6h)->hop_limit)
>  
>  #define L4_SET(ip6h, proto, uh, seq)					\
>  	do {								\
> @@ -976,6 +984,7 @@ resume:
>  		(seq)->flow_lbl	= ip6_get_flow_lbl(ip6h);		\
>  		(seq)->saddr	= *saddr;				\
>  		(seq)->daddr	= *daddr;				\
> +		(seq)->hop_limit = (ip6h)->hop_limit;			\
>  	} while (0)
>  
>  		if (seq && L4_MATCH(ip6h, proto, uh, seq) &&
> @@ -1026,7 +1035,7 @@ append:
>  			for (k = 0; k < p->count; )
>  				k += udp_tap_handler(c, PIF_TAP, AF_INET6,
>  						     &seq->saddr, &seq->daddr,
> -						     p, k, now);
> +						     seq->hop_limit, p, k, now);
>  		}
>  	}
>  
> diff --git a/udp.c b/udp.c
> index 39431d7..618a4e2 100644
> --- a/udp.c
> +++ b/udp.c
> @@ -849,6 +849,7 @@ fail:
>   * @af:		Address family, AF_INET or AF_INET6
>   * @saddr:	Source address
>   * @daddr:	Destination address
> + * @ttl:	TTL or hop limit for packets to be sent in this call
>   * @p:		Pool of UDP packets, with UDP headers
>   * @idx:	Index of first packet to process
>   * @now:	Current timestamp
> @@ -859,7 +860,8 @@ fail:
>   */
>  int udp_tap_handler(const struct ctx *c, uint8_t pif,
>  		    sa_family_t af, const void *saddr, const void *daddr,
> -		    const struct pool *p, int idx, const struct timespec *now)
> +		    uint8_t ttl, const struct pool *p, int idx,
> +		    const struct timespec *now)
>  {
>  	const struct flowside *toside;
>  	struct mmsghdr mm[UIO_MAXIOV];
> @@ -938,6 +940,21 @@ int udp_tap_handler(const struct ctx *c, uint8_t pif,
>  		mm[i].msg_hdr.msg_controllen = 0;
>  		mm[i].msg_hdr.msg_flags = 0;
>  
> +		if (ttl != uflow->ttl[tosidx.sidei]) {
> +			uflow->ttl[tosidx.sidei] = ttl;
> +			if (af == AF_INET) {
> +				if (setsockopt(s, IPPROTO_IP, IP_TTL,
> +					       &ttl, sizeof(ttl)) < 0)
> +					flow_perror(uflow,
> +						    "setsockopt IP_TTL");
> +			} else {
> +				if (setsockopt(s, IPPROTO_IPV6, IPV6_HOPLIMIT,
> +					       &ttl, sizeof(ttl)) < 0)
> +					flow_perror(uflow,
> +						    "setsockopt IPV6_HOPLIMIT");
> +			}
> +		}
> +
>  		count++;
>  	}
>  
> diff --git a/udp.h b/udp.h
> index de2df6d..a811475 100644
> --- a/udp.h
> +++ b/udp.h
> @@ -15,7 +15,8 @@ void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
>  			    uint32_t events, const struct timespec *now);
>  int udp_tap_handler(const struct ctx *c, uint8_t pif,
>  		    sa_family_t af, const void *saddr, const void *daddr,
> -		    const struct pool *p, int idx, const struct timespec *now);
> +		    uint8_t ttl, const struct pool *p, int idx,
> +		    const struct timespec *now);
>  int udp_sock_init(const struct ctx *c, int ns, const union inany_addr *addr,
>  		  const char *ifname, in_port_t port);
>  int udp_init(struct ctx *c);
> diff --git a/udp_flow.c b/udp_flow.c
> index bf4b896..39372c2 100644
> --- a/udp_flow.c
> +++ b/udp_flow.c
> @@ -137,6 +137,7 @@ static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow,
>  	uflow = FLOW_SET_TYPE(flow, FLOW_UDP, udp);
>  	uflow->ts = now->tv_sec;
>  	uflow->s[INISIDE] = uflow->s[TGTSIDE] = -1;
> +	uflow->ttl[INISIDE] = uflow->ttl[TGTSIDE] = DEFAULT_TTL;
>  
>  	if (s_ini >= 0) {
>  		/* When using auto port-scanning the listening port could go
> diff --git a/udp_flow.h b/udp_flow.h
> index 9a1b059..520de62 100644
> --- a/udp_flow.h
> +++ b/udp_flow.h
> @@ -8,11 +8,12 @@
>  #define UDP_FLOW_H
>  
>  /**
> - * struct udp - Descriptor for a flow of UDP packets
> + * struct udp_flow - Descriptor for a flow of UDP packets
>   * @f:		Generic flow information
>   * @closed:	Flow is already closed
>   * @ts:		Activity timestamp
>   * @s:		Socket fd (or -1) for each side of the flow
> + * @ttl:	TTL or hop_limit for both sides
>   */
>  struct udp_flow {
>  	/* Must be first element */
> @@ -21,6 +22,7 @@ struct udp_flow {
>  	bool closed :1;
>  	time_t ts;
>  	int s[SIDES];
> +	uint8_t ttl[SIDES];
>  };
>  
>  struct udp_flow *udp_at_sidx(flow_sidx_t sidx);

-- 
David Gibson (he or they)	| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you, not the other way
				| around.
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH v5] udp: support traceroute
  2025-04-03 23:40 ` David Gibson
@ 2025-04-04  0:57   ` David Gibson
  2025-04-04 11:01     ` Stefano Brivio
  0 siblings, 1 reply; 5+ messages in thread
From: David Gibson @ 2025-04-04  0:57 UTC (permalink / raw)
  To: Jon Maloy; +Cc: passt-dev

[-- Attachment #1: Type: text/plain, Size: 10682 bytes --]

On Fri, Apr 04, 2025 at 10:40:27AM +1100, David Gibson wrote:
> On Thu, Apr 03, 2025 at 06:27:06PM -0400, Jon Maloy wrote:
> > Now that ICMP pass-through from socket-to-tap is in place, it is
> > easy to support UDP based traceroute functionality in direction
> > tap-to-socket.
> > 
> > We fix that  in this commit.
> > 
> > Link: https://bugs.passt.top/show_bug.cgi?id=64
> > Signed-off-by: Jon Maloy <jmaloy@redhat.com>
> 
> Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
> 
> One commont below.

Oh.. wait.  I just realised this patch has a weird side effect, for
flows which are initiated from outside, rather than from the guest.

If the flow is initiated from outside, it's maybe a bit unlikely, but
we could still get a non-default TTL from the guest on reply
datagrams.  That will trigger this code and alter the socket's TTL.
But in this case the socket is not a flow specific socket, but the
listening socket which initiated the flow, which could be handling
packets on many flows.

The "cached" TTL is stored per-flow, not per-socket, so we won't look
at the right value when we process datagrams from other flows, so they
may go out with the wrong TTL.

I think the obvious way to address this is to stop using the
"listening" socket to send datagrams for a flow, using a connect()ed
socket instead.  We have other reasons to do that too, and I'm working
on implementing that right now.

The question is whether this is a serious enough problem to delay this
until the "both sides connect()ed sockets" change is merged.
Mitigating the problem we have:

 * Actually having multiple flows on a socket isn't super common (we
   handled this completely wrong before UDP flow table support, and
   no-one much noticed)
 * Non-default TTL packets on replies from the guest are probably
   uncommon
 * Wrong TTL packets sent out on different flows probably aren't
   particularly damaging in most cases

> > ---
> > v2: - Using ancillary data instead of setsockopt to transfer outgoing
> >       TTL.
> >     - Support IPv6
> > v3: - Storing ttl per packet instead of per flow. This may not be
> >       elegant, but much less intrusive than changing the flow
> >       criteria. This eliminates the need for the extra, flow-changing
> >       patch we introduced in v2.
> > v4: - Going back to something similar to the original solution, but
> >       storing current ttl in struct udp_flow, plus ensuring that all
> >       packets in a struct tap4_l4_t/tap6_l4_t instance have the same
> >       ttl. After input from David Gibson.
> > v5: - Some minor fixes after feedback from Stefano Brivio.
> > ---
> >  packet.h   |  2 ++
> >  tap.c      | 17 +++++++++++++----
> >  udp.c      | 19 ++++++++++++++++++-
> >  udp.h      |  3 ++-
> >  udp_flow.c |  1 +
> >  udp_flow.h |  4 +++-
> >  6 files changed, 39 insertions(+), 7 deletions(-)
> > 
> > diff --git a/packet.h b/packet.h
> > index c94780a..e84e123 100644
> > --- a/packet.h
> > +++ b/packet.h
> > @@ -11,6 +11,8 @@
> >  /* Maximum size of a single packet stored in pool, including headers */
> >  #define PACKET_MAX_LEN	((size_t)UINT16_MAX)
> >  
> > +#define DEFAULT_TTL 64
> 
> This is still fixed, rather than either probing the sysctl or using
> getsockopt() to determine the initial value.  I don't think we want to
> delay this further to change that, but it could be a reasonable
> follow up improvement.
> 
> > +
> >  /**
> >   * struct pool - Generic pool of packets stored in a buffer
> >   * @buf:	Buffer storing packet descriptors,
> > diff --git a/tap.c b/tap.c
> > index 3a6fcbe..d630f6d 100644
> > --- a/tap.c
> > +++ b/tap.c
> > @@ -559,6 +559,7 @@ PACKET_POOL_DECL(pool_l4, UIO_MAXIOV, pkt_buf);
> >   * struct l4_seq4_t - Message sequence for one protocol handler call, IPv4
> >   * @msgs:	Count of messages in sequence
> >   * @protocol:	Protocol number
> > + * @ttl:	Time to live
> >   * @source:	Source port
> >   * @dest:	Destination port
> >   * @saddr:	Source address
> > @@ -567,6 +568,7 @@ PACKET_POOL_DECL(pool_l4, UIO_MAXIOV, pkt_buf);
> >   */
> >  static struct tap4_l4_t {
> >  	uint8_t protocol;
> > +	uint8_t ttl;
> >  
> >  	uint16_t source;
> >  	uint16_t dest;
> > @@ -586,6 +588,7 @@ static struct tap4_l4_t {
> >   * @dest:	Destination port
> >   * @saddr:	Source address
> >   * @daddr:	Destination address
> > + * @hop_limit:	Hop limit
> >   * @msg:	Array of messages that can be handled in a single call
> >   */
> >  static struct tap6_l4_t {
> > @@ -598,6 +601,8 @@ static struct tap6_l4_t {
> >  	struct in6_addr saddr;
> >  	struct in6_addr daddr;
> >  
> > +	uint8_t hop_limit;
> > +
> >  	struct pool_l4_t p;
> >  } tap6_l4[TAP_SEQS /* Arbitrary: TAP_MSGS in theory, so limit in users */];
> >  
> > @@ -786,7 +791,8 @@ resume:
> >  #define L4_MATCH(iph, uh, seq)							\
> >  	((seq)->protocol == (iph)->protocol &&					\
> >  	 (seq)->source   == (uh)->source    && (seq)->dest  == (uh)->dest &&	\
> > -	 (seq)->saddr.s_addr == (iph)->saddr && (seq)->daddr.s_addr == (iph)->daddr)
> > +	 (seq)->saddr.s_addr == (iph)->saddr &&				\
> > +	 (seq)->daddr.s_addr == (iph)->daddr && (seq)->ttl == (iph)->ttl)
> >  
> >  #define L4_SET(iph, uh, seq)						\
> >  	do {								\
> > @@ -795,6 +801,7 @@ resume:
> >  		(seq)->dest		= (uh)->dest;			\
> >  		(seq)->saddr.s_addr	= (iph)->saddr;			\
> >  		(seq)->daddr.s_addr	= (iph)->daddr;			\
> > +		(seq)->ttl		= (iph)->ttl;			\
> >  	} while (0)
> >  
> >  		if (seq && L4_MATCH(iph, uh, seq) && seq->p.count < UIO_MAXIOV)
> > @@ -843,7 +850,7 @@ append:
> >  			for (k = 0; k < p->count; )
> >  				k += udp_tap_handler(c, PIF_TAP, AF_INET,
> >  						     &seq->saddr, &seq->daddr,
> > -						     p, k, now);
> > +						     seq->ttl, p, k, now);
> >  		}
> >  	}
> >  
> > @@ -966,7 +973,8 @@ resume:
> >  		 (seq)->dest == (uh)->dest                 &&		\
> >  		 (seq)->flow_lbl == ip6_get_flow_lbl(ip6h) &&		\
> >  		 IN6_ARE_ADDR_EQUAL(&(seq)->saddr, saddr)  &&		\
> > -		 IN6_ARE_ADDR_EQUAL(&(seq)->daddr, daddr))
> > +		 IN6_ARE_ADDR_EQUAL(&(seq)->daddr, daddr)  &&		\
> > +		 (seq)->hop_limit == (ip6h)->hop_limit)
> >  
> >  #define L4_SET(ip6h, proto, uh, seq)					\
> >  	do {								\
> > @@ -976,6 +984,7 @@ resume:
> >  		(seq)->flow_lbl	= ip6_get_flow_lbl(ip6h);		\
> >  		(seq)->saddr	= *saddr;				\
> >  		(seq)->daddr	= *daddr;				\
> > +		(seq)->hop_limit = (ip6h)->hop_limit;			\
> >  	} while (0)
> >  
> >  		if (seq && L4_MATCH(ip6h, proto, uh, seq) &&
> > @@ -1026,7 +1035,7 @@ append:
> >  			for (k = 0; k < p->count; )
> >  				k += udp_tap_handler(c, PIF_TAP, AF_INET6,
> >  						     &seq->saddr, &seq->daddr,
> > -						     p, k, now);
> > +						     seq->hop_limit, p, k, now);
> >  		}
> >  	}
> >  
> > diff --git a/udp.c b/udp.c
> > index 39431d7..618a4e2 100644
> > --- a/udp.c
> > +++ b/udp.c
> > @@ -849,6 +849,7 @@ fail:
> >   * @af:		Address family, AF_INET or AF_INET6
> >   * @saddr:	Source address
> >   * @daddr:	Destination address
> > + * @ttl:	TTL or hop limit for packets to be sent in this call
> >   * @p:		Pool of UDP packets, with UDP headers
> >   * @idx:	Index of first packet to process
> >   * @now:	Current timestamp
> > @@ -859,7 +860,8 @@ fail:
> >   */
> >  int udp_tap_handler(const struct ctx *c, uint8_t pif,
> >  		    sa_family_t af, const void *saddr, const void *daddr,
> > -		    const struct pool *p, int idx, const struct timespec *now)
> > +		    uint8_t ttl, const struct pool *p, int idx,
> > +		    const struct timespec *now)
> >  {
> >  	const struct flowside *toside;
> >  	struct mmsghdr mm[UIO_MAXIOV];
> > @@ -938,6 +940,21 @@ int udp_tap_handler(const struct ctx *c, uint8_t pif,
> >  		mm[i].msg_hdr.msg_controllen = 0;
> >  		mm[i].msg_hdr.msg_flags = 0;
> >  
> > +		if (ttl != uflow->ttl[tosidx.sidei]) {
> > +			uflow->ttl[tosidx.sidei] = ttl;
> > +			if (af == AF_INET) {
> > +				if (setsockopt(s, IPPROTO_IP, IP_TTL,
> > +					       &ttl, sizeof(ttl)) < 0)
> > +					flow_perror(uflow,
> > +						    "setsockopt IP_TTL");
> > +			} else {
> > +				if (setsockopt(s, IPPROTO_IPV6, IPV6_HOPLIMIT,
> > +					       &ttl, sizeof(ttl)) < 0)
> > +					flow_perror(uflow,
> > +						    "setsockopt IPV6_HOPLIMIT");
> > +			}
> > +		}
> > +
> >  		count++;
> >  	}
> >  
> > diff --git a/udp.h b/udp.h
> > index de2df6d..a811475 100644
> > --- a/udp.h
> > +++ b/udp.h
> > @@ -15,7 +15,8 @@ void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
> >  			    uint32_t events, const struct timespec *now);
> >  int udp_tap_handler(const struct ctx *c, uint8_t pif,
> >  		    sa_family_t af, const void *saddr, const void *daddr,
> > -		    const struct pool *p, int idx, const struct timespec *now);
> > +		    uint8_t ttl, const struct pool *p, int idx,
> > +		    const struct timespec *now);
> >  int udp_sock_init(const struct ctx *c, int ns, const union inany_addr *addr,
> >  		  const char *ifname, in_port_t port);
> >  int udp_init(struct ctx *c);
> > diff --git a/udp_flow.c b/udp_flow.c
> > index bf4b896..39372c2 100644
> > --- a/udp_flow.c
> > +++ b/udp_flow.c
> > @@ -137,6 +137,7 @@ static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow,
> >  	uflow = FLOW_SET_TYPE(flow, FLOW_UDP, udp);
> >  	uflow->ts = now->tv_sec;
> >  	uflow->s[INISIDE] = uflow->s[TGTSIDE] = -1;
> > +	uflow->ttl[INISIDE] = uflow->ttl[TGTSIDE] = DEFAULT_TTL;
> >  
> >  	if (s_ini >= 0) {
> >  		/* When using auto port-scanning the listening port could go
> > diff --git a/udp_flow.h b/udp_flow.h
> > index 9a1b059..520de62 100644
> > --- a/udp_flow.h
> > +++ b/udp_flow.h
> > @@ -8,11 +8,12 @@
> >  #define UDP_FLOW_H
> >  
> >  /**
> > - * struct udp - Descriptor for a flow of UDP packets
> > + * struct udp_flow - Descriptor for a flow of UDP packets
> >   * @f:		Generic flow information
> >   * @closed:	Flow is already closed
> >   * @ts:		Activity timestamp
> >   * @s:		Socket fd (or -1) for each side of the flow
> > + * @ttl:	TTL or hop_limit for both sides
> >   */
> >  struct udp_flow {
> >  	/* Must be first element */
> > @@ -21,6 +22,7 @@ struct udp_flow {
> >  	bool closed :1;
> >  	time_t ts;
> >  	int s[SIDES];
> > +	uint8_t ttl[SIDES];
> >  };
> >  
> >  struct udp_flow *udp_at_sidx(flow_sidx_t sidx);
> 



-- 
David Gibson (he or they)	| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you, not the other way
				| around.
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH v5] udp: support traceroute
  2025-04-04  0:57   ` David Gibson
@ 2025-04-04 11:01     ` Stefano Brivio
  2025-04-04 11:11       ` David Gibson
  0 siblings, 1 reply; 5+ messages in thread
From: Stefano Brivio @ 2025-04-04 11:01 UTC (permalink / raw)
  To: David Gibson; +Cc: Jon Maloy, passt-dev

On Fri, 4 Apr 2025 11:57:58 +1100
David Gibson <david@gibson.dropbear.id.au> wrote:

> On Fri, Apr 04, 2025 at 10:40:27AM +1100, David Gibson wrote:
> > On Thu, Apr 03, 2025 at 06:27:06PM -0400, Jon Maloy wrote:  
> > > Now that ICMP pass-through from socket-to-tap is in place, it is
> > > easy to support UDP based traceroute functionality in direction
> > > tap-to-socket.
> > > 
> > > We fix that  in this commit.
> > > 
> > > Link: https://bugs.passt.top/show_bug.cgi?id=64
> > > Signed-off-by: Jon Maloy <jmaloy@redhat.com>  
> > 
> > Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
> > 
> > One commont below.  
> 
> Oh.. wait.  I just realised this patch has a weird side effect, for
> flows which are initiated from outside, rather than from the guest.
> 
> If the flow is initiated from outside, it's maybe a bit unlikely, but
> we could still get a non-default TTL from the guest on reply
> datagrams.  That will trigger this code and alter the socket's TTL.
> But in this case the socket is not a flow specific socket, but the
> listening socket which initiated the flow, which could be handling
> packets on many flows.
> 
> The "cached" TTL is stored per-flow, not per-socket, so we won't look
> at the right value when we process datagrams from other flows, so they
> may go out with the wrong TTL.
> 
> I think the obvious way to address this is to stop using the
> "listening" socket to send datagrams for a flow, using a connect()ed
> socket instead.  We have other reasons to do that too, and I'm working
> on implementing that right now.
> 
> The question is whether this is a serious enough problem to delay this
> until the "both sides connect()ed sockets" change is merged.

On the other hand, this patch applies cleanly on top of your "Use
connect()ed sockets for both sides of UDP flows" series, and also
semantically I couldn't spot any particular change or integration
that we would need as a result in this patch.

I haven't reviewed the series yet, but I don't think that an eventual
re-spin would change this, so... problem solved I guess? I can just
wait and apply them together.

-- 
Stefano


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH v5] udp: support traceroute
  2025-04-04 11:01     ` Stefano Brivio
@ 2025-04-04 11:11       ` David Gibson
  0 siblings, 0 replies; 5+ messages in thread
From: David Gibson @ 2025-04-04 11:11 UTC (permalink / raw)
  To: Stefano Brivio; +Cc: Jon Maloy, passt-dev

[-- Attachment #1: Type: text/plain, Size: 2641 bytes --]

On Fri, Apr 04, 2025 at 01:01:45PM +0200, Stefano Brivio wrote:
> On Fri, 4 Apr 2025 11:57:58 +1100
> David Gibson <david@gibson.dropbear.id.au> wrote:
> 
> > On Fri, Apr 04, 2025 at 10:40:27AM +1100, David Gibson wrote:
> > > On Thu, Apr 03, 2025 at 06:27:06PM -0400, Jon Maloy wrote:  
> > > > Now that ICMP pass-through from socket-to-tap is in place, it is
> > > > easy to support UDP based traceroute functionality in direction
> > > > tap-to-socket.
> > > > 
> > > > We fix that  in this commit.
> > > > 
> > > > Link: https://bugs.passt.top/show_bug.cgi?id=64
> > > > Signed-off-by: Jon Maloy <jmaloy@redhat.com>  
> > > 
> > > Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
> > > 
> > > One commont below.  
> > 
> > Oh.. wait.  I just realised this patch has a weird side effect, for
> > flows which are initiated from outside, rather than from the guest.
> > 
> > If the flow is initiated from outside, it's maybe a bit unlikely, but
> > we could still get a non-default TTL from the guest on reply
> > datagrams.  That will trigger this code and alter the socket's TTL.
> > But in this case the socket is not a flow specific socket, but the
> > listening socket which initiated the flow, which could be handling
> > packets on many flows.
> > 
> > The "cached" TTL is stored per-flow, not per-socket, so we won't look
> > at the right value when we process datagrams from other flows, so they
> > may go out with the wrong TTL.
> > 
> > I think the obvious way to address this is to stop using the
> > "listening" socket to send datagrams for a flow, using a connect()ed
> > socket instead.  We have other reasons to do that too, and I'm working
> > on implementing that right now.
> > 
> > The question is whether this is a serious enough problem to delay this
> > until the "both sides connect()ed sockets" change is merged.
> 
> On the other hand, this patch applies cleanly on top of your "Use
> connect()ed sockets for both sides of UDP flows" series, and also
> semantically I couldn't spot any particular change or integration
> that we would need as a result in this patch.

Ah.. yeah.  When I wrote the above I didn't think I'd finish that
series today :).

> I haven't reviewed the series yet, but I don't think that an eventual
> re-spin would change this, so... problem solved I guess? I can just
> wait and apply them together.

Yeah, I guess so.  Hooray!

-- 
David Gibson (he or they)	| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you, not the other way
				| around.
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2025-04-04 11:11 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-04-03 22:27 [PATCH v5] udp: support traceroute Jon Maloy
2025-04-03 23:40 ` David Gibson
2025-04-04  0:57   ` David Gibson
2025-04-04 11:01     ` Stefano Brivio
2025-04-04 11:11       ` David Gibson

Code repositories for project(s) associated with this public inbox

	https://passt.top/passt

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for IMAP folder(s).