[PATCH v3] udp: support traceroute

public inbox for passt-dev@passt.top
 help / color / mirror / code / Atom feed

* [PATCH v3] udp: support traceroute
@ 2025-03-30 21:06 Jon Maloy
  2025-03-31  5:23 ` David Gibson
  0 siblings, 1 reply; 2+ messages in thread
From: Jon Maloy @ 2025-03-30 21:06 UTC (permalink / raw)
  To: passt-dev, sbrivio, lvivier, dgibson, jmaloy

Now that ICMP pass-through from socket-to-tap is in place, it is
easy to support UDP based traceroute functionality in direction
tap-to-socket.

We fix that in this commit.

Signed-off-by: Jon Maloy <jmaloy@redhat.com>
---
v2: - Using ancillary data instead of setsockopt to transfer outgoing
      TTL.
    - Support IPv6
v3: - Storing ttl per packet instead of per flow. This may not be
      elegant, but much less intrusive than changing the flow
      criteria. This eliminates the need for the extra, flow-changing
      patch we introduced in v2.
---
 packet.c | 28 +++++++++++++++++-----------
 packet.h | 30 ++++++++++++++++++++++--------
 tap.c    |  3 ++-
 udp.c    | 28 ++++++++++++++++++++++++----
 udp.h    |  3 ++-
 5 files changed, 67 insertions(+), 25 deletions(-)

diff --git a/packet.c b/packet.c
index 72c6158..36a32fe 100644
--- a/packet.c
+++ b/packet.c
@@ -89,11 +89,12 @@ bool pool_full(const struct pool *p)
  * @p:		Existing pool
  * @len:	Length of new descriptor
  * @start:	Start of data
+ * @ttl:	TTL/hop_limit for this packet
  * @func:	For tracing: name of calling function
  * @line:	For tracing: caller line of function call
  */
 void packet_add_do(struct pool *p, size_t len, const char *start,
-		   const char *func, int line)
+		   const uint8_t ttl, const char *func, int line)
 {
 	size_t idx = p->count;
 
@@ -106,8 +107,9 @@ void packet_add_do(struct pool *p, size_t len, const char *start,
 	if (packet_check_range(p, start, len, func, line))
 		return;
 
-	p->pkt[idx].iov_base = (void *)start;
-	p->pkt[idx].iov_len = len;
+	p->pkt[idx].iov.iov_base = (void *)start;
+	p->pkt[idx].iov.iov_len = len;
+	p->pkt[idx].ttl = ttl;
 
 	p->count++;
 }
@@ -125,7 +127,8 @@ void packet_add_do(struct pool *p, size_t len, const char *start,
  * Return: pointer to start of data range, NULL on invalid range or descriptor
  */
 void *packet_get_try_do(const struct pool *p, size_t idx, size_t offset,
-			size_t len, size_t *left, const char *func, int line)
+			size_t len, size_t *left, uint8_t *ttl,
+			const char *func, int line)
 {
 	char *ptr;
 
@@ -139,18 +142,21 @@ void *packet_get_try_do(const struct pool *p, size_t idx, size_t offset,
 		return NULL;
 	}
 
-	if (offset > p->pkt[idx].iov_len ||
-	    len > (p->pkt[idx].iov_len - offset))
+	if (offset > p->pkt[idx].iov.iov_len ||
+	    len > (p->pkt[idx].iov.iov_len - offset))
 		return NULL;
 
-	ptr = (char *)p->pkt[idx].iov_base + offset;
+	ptr = (char *)p->pkt[idx].iov.iov_base + offset;
 
 	ASSERT_WITH_MSG(!packet_check_range(p, ptr, len, func, line),
 			"Corrupt packet pool, %s:%i", func, line);
 
 	if (left)
-		*left = p->pkt[idx].iov_len - offset - len;
+		*left = p->pkt[idx].iov.iov_len - offset - len;
 
+	if (ttl)
+		*ttl =  p->pkt[idx].ttl;
+;
 	return ptr;
 }
 
@@ -168,14 +174,14 @@ void *packet_get_try_do(const struct pool *p, size_t idx, size_t offset,
  */
 void *packet_get_do(const struct pool *p, const size_t idx,
 		    size_t offset, size_t len, size_t *left,
-		    const char *func, int line)
+		    uint8_t *ttl, const char *func, int line)
 {
-	void *r = packet_get_try_do(p, idx, offset, len, left, func, line);
+	void *r = packet_get_try_do(p, idx, offset, len, left, ttl, func, line);
 
 	if (!r) {
 		trace("missing packet data length %zu, offset %zu from "
 		      "length %zu, %s:%i",
-		      len, offset, p->pkt[idx].iov_len, func, line);
+		      len, offset, p->pkt[idx].iov.iov_len, func, line);
 	}
 
 	return r;
diff --git a/packet.h b/packet.h
index c94780a..1f5142c 100644
--- a/packet.h
+++ b/packet.h
@@ -11,6 +11,8 @@
 /* Maximum size of a single packet stored in pool, including headers */
 #define PACKET_MAX_LEN	((size_t)UINT16_MAX)
 
+#define DEFAULT_TTL 64
+
 /**
  * struct pool - Generic pool of packets stored in a buffer
  * @buf:	Buffer storing packet descriptors,
@@ -26,28 +28,36 @@ struct pool {
 	size_t buf_size;
 	size_t size;
 	size_t count;
-	struct iovec pkt[];
+	struct {
+		struct iovec iov;
+		uint8_t ttl;
+		uint8_t pad[3];
+	} pkt[];
 };
 
 int vu_packet_check_range(void *buf, const char *ptr, size_t len);
 void packet_add_do(struct pool *p, size_t len, const char *start,
-		   const char *func, int line);
+		   const uint8_t ttl, const char *func, int line);
 void *packet_get_try_do(const struct pool *p, const size_t idx,
 			size_t offset, size_t len, size_t *left,
-			const char *func, int line);
+			uint8_t *ttl, const char *func, int line);
 void *packet_get_do(const struct pool *p, const size_t idx,
 		    size_t offset, size_t len, size_t *left,
-		    const char *func, int line);
+		    uint8_t *ttl, const char *func, int line);
 bool pool_full(const struct pool *p);
 void pool_flush(struct pool *p);
 
 #define packet_add(p, len, start)					\
-	packet_add_do(p, len, start, __func__, __LINE__)
+	packet_add_do(p, len, start, DEFAULT_TTL, __func__, __LINE__)
+#define packet_add_ttl(p, len, start, ttl)					\
+	packet_add_do(p, len, start, ttl, __func__, __LINE__)
 
 #define packet_get_try(p, idx, offset, len, left)			\
-	packet_get_try_do(p, idx, offset, len, left, __func__, __LINE__)
+	packet_get_try_do(p, idx, offset, len, left, NULL, __func__, __LINE__)
 #define packet_get(p, idx, offset, len, left)				\
-	packet_get_do(p, idx, offset, len, left, __func__, __LINE__)
+	packet_get_do(p, idx, offset, len, left, NULL, __func__, __LINE__)
+#define packet_get_ttl(p, idx, offset, len, left, ttl)				\
+	packet_get_do(p, idx, offset, len, left, ttl, __func__, __LINE__)
 
 #define PACKET_POOL_DECL(_name, _size, _buf)				\
 struct _name ## _t {							\
@@ -55,7 +65,11 @@ struct _name ## _t {							\
 	size_t buf_size;						\
 	size_t size;							\
 	size_t count;							\
-	struct iovec pkt[_size];					\
+	struct {							\
+	struct iovec iov;						\
+		uint8_t ttl;						\
+		uint8_t pad[3];						\
+	} pkt[_size];							\
 }
 
 #define PACKET_POOL_INIT_NOCAST(_size, _buf, _buf_size)			\
diff --git a/tap.c b/tap.c
index 3a6fcbe..ac9b3df 100644
--- a/tap.c
+++ b/tap.c
@@ -563,6 +563,7 @@ PACKET_POOL_DECL(pool_l4, UIO_MAXIOV, pkt_buf);
  * @dest:	Destination port
  * @saddr:	Source address
  * @daddr:	Destination address
+ * @ttl:	TTL/hop_limit for packet
  * @msg:	Array of messages that can be handled in a single call
  */
 static struct tap4_l4_t {
@@ -821,7 +822,7 @@ resume:
 #undef L4_SET
 
 append:
-		packet_add((struct pool *)&seq->p, l4len, l4h);
+		packet_add_ttl((struct pool *)&seq->p, l4len, l4h, iph->ttl);
 	}
 
 	for (j = 0, seq = tap4_l4; j < seq_count; j++, seq++) {
diff --git a/udp.c b/udp.c
index 39431d7..5fbba49 100644
--- a/udp.c
+++ b/udp.c
@@ -859,8 +859,10 @@ fail:
  */
 int udp_tap_handler(const struct ctx *c, uint8_t pif,
 		    sa_family_t af, const void *saddr, const void *daddr,
-		    const struct pool *p, int idx, const struct timespec *now)
+		    const struct pool *p, int idx,
+		    const struct timespec *now)
 {
+	char ancillary[CMSG_SPACE(sizeof(int))];
 	const struct flowside *toside;
 	struct mmsghdr mm[UIO_MAXIOV];
 	union sockaddr_inany to_sa;
@@ -885,7 +887,9 @@ int udp_tap_handler(const struct ctx *c, uint8_t pif,
 	src = ntohs(uh->source);
 	dst = ntohs(uh->dest);
 
-	tosidx = udp_flow_from_tap(c, pif, af, saddr, daddr, src, dst, now);
+	tosidx = udp_flow_from_tap(c, pif, af, saddr, daddr,
+				   src, dst, now);
+
 	if (!(uflow = udp_at_sidx(tosidx))) {
 		char sstr[INET6_ADDRSTRLEN], dstr[INET6_ADDRSTRLEN];
 
@@ -915,8 +919,9 @@ int udp_tap_handler(const struct ctx *c, uint8_t pif,
 	for (i = 0; i < (int)p->count - idx; i++) {
 		struct udphdr *uh_send;
 		size_t len;
+		uint8_t ttl;
 
-		uh_send = packet_get(p, idx + i, 0, sizeof(*uh), &len);
+		uh_send = packet_get_ttl(p, idx + i, 0, sizeof(*uh), &len, &ttl);
 		if (!uh_send)
 			return p->count - idx;
 
@@ -926,7 +931,6 @@ int udp_tap_handler(const struct ctx *c, uint8_t pif,
 		if (len) {
 			m[i].iov_base = (char *)(uh_send + 1);
 			m[i].iov_len = len;
-
 			mm[i].msg_hdr.msg_iov = m + i;
 			mm[i].msg_hdr.msg_iovlen = 1;
 		} else {
@@ -938,6 +942,22 @@ int udp_tap_handler(const struct ctx *c, uint8_t pif,
 		mm[i].msg_hdr.msg_controllen = 0;
 		mm[i].msg_hdr.msg_flags = 0;
 
+		if (ttl != DEFAULT_TTL) {
+			struct cmsghdr *cmsg = (void *) ancillary;
+
+			if (af == AF_INET) {
+				cmsg->cmsg_level = IPPROTO_IP;
+				cmsg->cmsg_type = IP_TTL;
+			} else {
+				cmsg->cmsg_level = IPPROTO_IPV6;
+				cmsg->cmsg_type = IPV6_HOPLIMIT;
+			}
+			cmsg->cmsg_len = CMSG_LEN(sizeof(int));
+			*((int *) CMSG_DATA(cmsg)) = ttl;
+			mm[i].msg_hdr.msg_control = ancillary;
+			mm[i].msg_hdr.msg_controllen = sizeof(ancillary);
+		}
+
 		count++;
 	}
 
diff --git a/udp.h b/udp.h
index de2df6d..6adbfcd 100644
--- a/udp.h
+++ b/udp.h
@@ -15,7 +15,8 @@ void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
 			    uint32_t events, const struct timespec *now);
 int udp_tap_handler(const struct ctx *c, uint8_t pif,
 		    sa_family_t af, const void *saddr, const void *daddr,
-		    const struct pool *p, int idx, const struct timespec *now);
+		    const struct pool *p, int idx,
+		    const struct timespec *now);
 int udp_sock_init(const struct ctx *c, int ns, const union inany_addr *addr,
 		  const char *ifname, in_port_t port);
 int udp_init(struct ctx *c);
-- 
2.48.1


^ permalink raw reply	[flat|nested] 2+ messages in thread

* Re: [PATCH v3] udp: support traceroute
  2025-03-30 21:06 [PATCH v3] udp: support traceroute Jon Maloy
@ 2025-03-31  5:23 ` David Gibson
  0 siblings, 0 replies; 2+ messages in thread
From: David Gibson @ 2025-03-31  5:23 UTC (permalink / raw)
  To: Jon Maloy; +Cc: passt-dev, sbrivio, lvivier, dgibson

[-- Attachment #1: Type: text/plain, Size: 12971 bytes --]

On Sun, Mar 30, 2025 at 05:06:28PM -0400, Jon Maloy wrote:
> Now that ICMP pass-through from socket-to-tap is in place, it is
> easy to support UDP based traceroute functionality in direction
> tap-to-socket.
> 
> We fix that in this commit.
> 
> Signed-off-by: Jon Maloy <jmaloy@redhat.com>
> ---
> v2: - Using ancillary data instead of setsockopt to transfer outgoing
>       TTL.
>     - Support IPv6
> v3: - Storing ttl per packet instead of per flow. This may not be
>       elegant, but much less intrusive than changing the flow
>       criteria. This eliminates the need for the extra, flow-changing
>       patch we introduced in v2.

Hm, I think this will work, but I'm not entirely convinced about the
approach.

> ---
>  packet.c | 28 +++++++++++++++++-----------
>  packet.h | 30 ++++++++++++++++++++++--------
>  tap.c    |  3 ++-
>  udp.c    | 28 ++++++++++++++++++++++++----
>  udp.h    |  3 ++-
>  5 files changed, 67 insertions(+), 25 deletions(-)
> 
> diff --git a/packet.c b/packet.c
> index 72c6158..36a32fe 100644
> --- a/packet.c
> +++ b/packet.c
> @@ -89,11 +89,12 @@ bool pool_full(const struct pool *p)
>   * @p:		Existing pool
>   * @len:	Length of new descriptor
>   * @start:	Start of data
> + * @ttl:	TTL/hop_limit for this packet
>   * @func:	For tracing: name of calling function
>   * @line:	For tracing: caller line of function call
>   */
>  void packet_add_do(struct pool *p, size_t len, const char *start,
> -		   const char *func, int line)
> +		   const uint8_t ttl, const char *func, int line)
>  {
>  	size_t idx = p->count;
>  
> @@ -106,8 +107,9 @@ void packet_add_do(struct pool *p, size_t len, const char *start,
>  	if (packet_check_range(p, start, len, func, line))
>  		return;
>  
> -	p->pkt[idx].iov_base = (void *)start;
> -	p->pkt[idx].iov_len = len;
> +	p->pkt[idx].iov.iov_base = (void *)start;
> +	p->pkt[idx].iov.iov_len = len;
> +	p->pkt[idx].ttl = ttl;
>  
>  	p->count++;
>  }
> @@ -125,7 +127,8 @@ void packet_add_do(struct pool *p, size_t len, const char *start,
>   * Return: pointer to start of data range, NULL on invalid range or descriptor
>   */
>  void *packet_get_try_do(const struct pool *p, size_t idx, size_t offset,
> -			size_t len, size_t *left, const char *func, int line)
> +			size_t len, size_t *left, uint8_t *ttl,
> +			const char *func, int line)
>  {
>  	char *ptr;
>  
> @@ -139,18 +142,21 @@ void *packet_get_try_do(const struct pool *p, size_t idx, size_t offset,
>  		return NULL;
>  	}
>  
> -	if (offset > p->pkt[idx].iov_len ||
> -	    len > (p->pkt[idx].iov_len - offset))
> +	if (offset > p->pkt[idx].iov.iov_len ||
> +	    len > (p->pkt[idx].iov.iov_len - offset))
>  		return NULL;
>  
> -	ptr = (char *)p->pkt[idx].iov_base + offset;
> +	ptr = (char *)p->pkt[idx].iov.iov_base + offset;
>  
>  	ASSERT_WITH_MSG(!packet_check_range(p, ptr, len, func, line),
>  			"Corrupt packet pool, %s:%i", func, line);
>  
>  	if (left)
> -		*left = p->pkt[idx].iov_len - offset - len;
> +		*left = p->pkt[idx].iov.iov_len - offset - len;
>  
> +	if (ttl)
> +		*ttl =  p->pkt[idx].ttl;
> +;
>  	return ptr;
>  }
>  
> @@ -168,14 +174,14 @@ void *packet_get_try_do(const struct pool *p, size_t idx, size_t offset,
>   */
>  void *packet_get_do(const struct pool *p, const size_t idx,
>  		    size_t offset, size_t len, size_t *left,
> -		    const char *func, int line)
> +		    uint8_t *ttl, const char *func, int line)
>  {
> -	void *r = packet_get_try_do(p, idx, offset, len, left, func, line);
> +	void *r = packet_get_try_do(p, idx, offset, len, left, ttl, func, line);
>  
>  	if (!r) {
>  		trace("missing packet data length %zu, offset %zu from "
>  		      "length %zu, %s:%i",
> -		      len, offset, p->pkt[idx].iov_len, func, line);
> +		      len, offset, p->pkt[idx].iov.iov_len, func, line);
>  	}
>  
>  	return r;
> diff --git a/packet.h b/packet.h
> index c94780a..1f5142c 100644
> --- a/packet.h
> +++ b/packet.h
> @@ -11,6 +11,8 @@
>  /* Maximum size of a single packet stored in pool, including headers */
>  #define PACKET_MAX_LEN	((size_t)UINT16_MAX)
>  
> +#define DEFAULT_TTL 64

This appears to be the default default on Linux, but looks like it can
be changed by sysctl.

>  /**
>   * struct pool - Generic pool of packets stored in a buffer
>   * @buf:	Buffer storing packet descriptors,
> @@ -26,28 +28,36 @@ struct pool {
>  	size_t buf_size;
>  	size_t size;
>  	size_t count;
> -	struct iovec pkt[];
> +	struct {
> +		struct iovec iov;
> +		uint8_t ttl;

Hm.  I'm not entirely sure I like the idea of putting the TTL
explicitly here, given that it is usually contained implicitly in the
IP headers of the packets in the pool.  Then again, packet pool entries
are roughly like skbs, which do include a bunch of fields giving
"summary" information.

The alternative approach to this would be to add TTL to the criteria
for making "seqs" in tap.c.  That is update the logic for L4_MATCH and
L4_SET as in the earlier versions.  Then you can pass a single ttl for
a group of packets.  The "seqs" tend to roughly line up with flows,
but they don't have to.

> +		uint8_t pad[3];

I don't think you should need to explicitly pad, the compiler should
do that automatically.  On 64-bit systems (i.e. most of them), I think
you'd need 7 bytes of padding to keep things properly aligned.

> +	} pkt[];
>  };
>  
>  int vu_packet_check_range(void *buf, const char *ptr, size_t len);
>  void packet_add_do(struct pool *p, size_t len, const char *start,
> -		   const char *func, int line);
> +		   const uint8_t ttl, const char *func, int line);
>  void *packet_get_try_do(const struct pool *p, const size_t idx,
>  			size_t offset, size_t len, size_t *left,
> -			const char *func, int line);
> +			uint8_t *ttl, const char *func, int line);
>  void *packet_get_do(const struct pool *p, const size_t idx,
>  		    size_t offset, size_t len, size_t *left,
> -		    const char *func, int line);
> +		    uint8_t *ttl, const char *func, int line);
>  bool pool_full(const struct pool *p);
>  void pool_flush(struct pool *p);
>  
>  #define packet_add(p, len, start)					\
> -	packet_add_do(p, len, start, __func__, __LINE__)
> +	packet_add_do(p, len, start, DEFAULT_TTL, __func__, __LINE__)
> +#define packet_add_ttl(p, len, start, ttl)					\
> +	packet_add_do(p, len, start, ttl, __func__, __LINE__)

I also don't love the fact that the ttl value in the pool will only
sometimes be correct: only in the paths where you actually bother to
parse it from the packet.

>  #define packet_get_try(p, idx, offset, len, left)			\
> -	packet_get_try_do(p, idx, offset, len, left, __func__, __LINE__)
> +	packet_get_try_do(p, idx, offset, len, left, NULL, __func__, __LINE__)
>  #define packet_get(p, idx, offset, len, left)				\
> -	packet_get_do(p, idx, offset, len, left, __func__, __LINE__)
> +	packet_get_do(p, idx, offset, len, left, NULL, __func__, __LINE__)
> +#define packet_get_ttl(p, idx, offset, len, left, ttl)				\
> +	packet_get_do(p, idx, offset, len, left, ttl, __func__, __LINE__)
>  
>  #define PACKET_POOL_DECL(_name, _size, _buf)				\
>  struct _name ## _t {							\
> @@ -55,7 +65,11 @@ struct _name ## _t {							\
>  	size_t buf_size;						\
>  	size_t size;							\
>  	size_t count;							\
> -	struct iovec pkt[_size];					\
> +	struct {							\
> +	struct iovec iov;						\
> +		uint8_t ttl;						\
> +		uint8_t pad[3];						\
> +	} pkt[_size];							\
>  }
>  
>  #define PACKET_POOL_INIT_NOCAST(_size, _buf, _buf_size)			\
> diff --git a/tap.c b/tap.c
> index 3a6fcbe..ac9b3df 100644
> --- a/tap.c
> +++ b/tap.c
> @@ -563,6 +563,7 @@ PACKET_POOL_DECL(pool_l4, UIO_MAXIOV, pkt_buf);
>   * @dest:	Destination port
>   * @saddr:	Source address
>   * @daddr:	Destination address
> + * @ttl:	TTL/hop_limit for packet

This seems to be a leftover from an earlier version; you're no longer
adding a ttl field to tap_l4_t.

>   * @msg:	Array of messages that can be handled in a single call
>   */
>  static struct tap4_l4_t {
> @@ -821,7 +822,7 @@ resume:
>  #undef L4_SET
>  
>  append:
> -		packet_add((struct pool *)&seq->p, l4len, l4h);
> +		packet_add_ttl((struct pool *)&seq->p, l4len, l4h, iph->ttl);
>  	}
>  
>  	for (j = 0, seq = tap4_l4; j < seq_count; j++, seq++) {
> diff --git a/udp.c b/udp.c
> index 39431d7..5fbba49 100644
> --- a/udp.c
> +++ b/udp.c
> @@ -859,8 +859,10 @@ fail:
>   */
>  int udp_tap_handler(const struct ctx *c, uint8_t pif,
>  		    sa_family_t af, const void *saddr, const void *daddr,
> -		    const struct pool *p, int idx, const struct timespec *now)
> +		    const struct pool *p, int idx,
> +		    const struct timespec *now)

Unrelated whitespace change.

>  {
> +	char ancillary[CMSG_SPACE(sizeof(int))];
>  	const struct flowside *toside;
>  	struct mmsghdr mm[UIO_MAXIOV];
>  	union sockaddr_inany to_sa;
> @@ -885,7 +887,9 @@ int udp_tap_handler(const struct ctx *c, uint8_t pif,
>  	src = ntohs(uh->source);
>  	dst = ntohs(uh->dest);
>  
> -	tosidx = udp_flow_from_tap(c, pif, af, saddr, daddr, src, dst, now);
> +	tosidx = udp_flow_from_tap(c, pif, af, saddr, daddr,
> +				   src, dst, now);
> +

Unrelated whitespace change.

>  	if (!(uflow = udp_at_sidx(tosidx))) {
>  		char sstr[INET6_ADDRSTRLEN], dstr[INET6_ADDRSTRLEN];
>  
> @@ -915,8 +919,9 @@ int udp_tap_handler(const struct ctx *c, uint8_t pif,
>  	for (i = 0; i < (int)p->count - idx; i++) {
>  		struct udphdr *uh_send;
>  		size_t len;
> +		uint8_t ttl;
>  
> -		uh_send = packet_get(p, idx + i, 0, sizeof(*uh), &len);
> +		uh_send = packet_get_ttl(p, idx + i, 0, sizeof(*uh), &len, &ttl);
>  		if (!uh_send)
>  			return p->count - idx;
>  
> @@ -926,7 +931,6 @@ int udp_tap_handler(const struct ctx *c, uint8_t pif,
>  		if (len) {
>  			m[i].iov_base = (char *)(uh_send + 1);
>  			m[i].iov_len = len;
> -

Unrelated whitespace change.

>  			mm[i].msg_hdr.msg_iov = m + i;
>  			mm[i].msg_hdr.msg_iovlen = 1;
>  		} else {
> @@ -938,6 +942,22 @@ int udp_tap_handler(const struct ctx *c, uint8_t pif,
>  		mm[i].msg_hdr.msg_controllen = 0;
>  		mm[i].msg_hdr.msg_flags = 0;
>  
> +		if (ttl != DEFAULT_TTL) {

I'm also not entirely convinced of the wisdom of making things
conditional on a check against a fixed DEFAULT_TTL.  If the guest has
its default TTL configured to something other than 64, this will
trigger on every packet.  It will also trigger every time (probably)
on any traffic that isn't directly from the guest, but forwarded by
the guest from some other interface (e.g. a VPN).  The TTL might have
started at 64 on the endpoint, but it will have decreased by some hops
before it reaches us.

Handling this sort of case gracefully is why I had a different
approach in mind.  In struct udp_flow keep the current TTL for the
flow's socket.  Whenever we get a packet from tap with a TTL that's
different, setsockopt() and update the current socket TTL.  For cases
like the above where the TTL is not 64, but is uniform, we'll get a
single setsockopt() then not have to do anything more.  For traceroute
we'll get a setsockopt() per "distance" that traceroute probes.

Then again, it's possible that adding the cmsg to the existing syscall
is of negligible cost, whereas an extra syscall will have some cost,
even if it's only sometimes.  Not sure.

> +			struct cmsghdr *cmsg = (void *) ancillary;
> +
> +			if (af == AF_INET) {
> +				cmsg->cmsg_level = IPPROTO_IP;
> +				cmsg->cmsg_type = IP_TTL;
> +			} else {
> +				cmsg->cmsg_level = IPPROTO_IPV6;
> +				cmsg->cmsg_type = IPV6_HOPLIMIT;
> +			}
> +			cmsg->cmsg_len = CMSG_LEN(sizeof(int));
> +			*((int *) CMSG_DATA(cmsg)) = ttl;

I also wonder if we should be setting (ttl - 1) here: should we count
ourselves as a "hop"?

> +			mm[i].msg_hdr.msg_control = ancillary;
> +			mm[i].msg_hdr.msg_controllen = sizeof(ancillary);
> +		}
> +
>  		count++;
>  	}
>  
> diff --git a/udp.h b/udp.h
> index de2df6d..6adbfcd 100644
> --- a/udp.h
> +++ b/udp.h
> @@ -15,7 +15,8 @@ void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
>  			    uint32_t events, const struct timespec *now);
>  int udp_tap_handler(const struct ctx *c, uint8_t pif,
>  		    sa_family_t af, const void *saddr, const void *daddr,
> -		    const struct pool *p, int idx, const struct timespec *now);
> +		    const struct pool *p, int idx,
> +		    const struct timespec *now);
>  int udp_sock_init(const struct ctx *c, int ns, const union inany_addr *addr,
>  		  const char *ifname, in_port_t port);
>  int udp_init(struct ctx *c);

-- 
David Gibson (he or they)	| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you, not the other way
				| around.
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2025-03-31  5:25 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-03-30 21:06 [PATCH v3] udp: support traceroute Jon Maloy
2025-03-31  5:23 ` David Gibson

Code repositories for project(s) associated with this public inbox

	https://passt.top/passt

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for IMAP folder(s).