* [PATCH] udp: create and send ICMPv4 to local peer when applicable
@ 2025-02-09 17:00 Jon Maloy
2025-02-10 10:27 ` Stefano Brivio
2025-02-11 0:55 ` David Gibson
0 siblings, 2 replies; 8+ messages in thread
From: Jon Maloy @ 2025-02-09 17:00 UTC (permalink / raw)
To: passt-dev, sbrivio, lvivier, dgibson, jmaloy
When a local peer sends a UDP message to a non-existing port on an
existing remote host, that host will return an ICMP message containing
the error code ICMP_PORT_UNREACH, plus the header and the first eight
bytes of the original message. If the sender socket has been connected,
it uses this message to issue a "Connection Refused" event to the user.
Until now, we have only read such events from the externally facing
socket, but we don't forward them back to the local sender because
we cannot read the ICMP message directly to user space. Because of
this, the local peer will hang and wait for a response that never
arrives.
We now fix this for IPv4 by recreating and forwarding a correct ICMP
message back to the internal sender. We synthesize the message based
on the information in the extended error structure, plus the returned
part of the original message body.
Note that for the sake of completeness, we even produce ICMP messages
for other error codes. We have noticed that at least ICMP_PROT_UNREACH
is propagated as an error event back to the user.
Signed-off-by: Jon Maloy <jmaloy@redhat.com>
---
tap.c | 4 +--
tap.h | 2 ++
udp.c | 71 ++++++++++++++++++++++++++++++++++++++++++--------
udp_internal.h | 2 +-
udp_vu.c | 4 +--
5 files changed, 67 insertions(+), 16 deletions(-)
diff --git a/tap.c b/tap.c
index cd32a90..f83aac5 100644
--- a/tap.c
+++ b/tap.c
@@ -142,8 +142,8 @@ static void *tap_push_l2h(const struct ctx *c, void *buf, uint16_t proto)
*
* Return: pointer at which to write the packet's payload
*/
-static void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src,
- struct in_addr dst, size_t l4len, uint8_t proto)
+void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src,
+ struct in_addr dst, size_t l4len, uint8_t proto)
{
uint16_t l3len = l4len + sizeof(*ip4h);
diff --git a/tap.h b/tap.h
index dfbd8b9..3ba00c1 100644
--- a/tap.h
+++ b/tap.h
@@ -44,6 +44,8 @@ static inline void tap_hdr_update(struct tap_hdr *thdr, size_t l2len)
thdr->vnet_len = htonl(l2len);
}
+void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src,
+ struct in_addr dst, size_t l4len, uint8_t proto);
void tap_udp4_send(const struct ctx *c, struct in_addr src, in_port_t sport,
struct in_addr dst, in_port_t dport,
const void *in, size_t dlen);
diff --git a/udp.c b/udp.c
index 923cc38..4b800b6 100644
--- a/udp.c
+++ b/udp.c
@@ -87,6 +87,7 @@
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/udp.h>
+#include <netinet/ip_icmp.h>
#include <stdint.h>
#include <stddef.h>
#include <string.h>
@@ -402,25 +403,70 @@ static void udp_tap_prepare(const struct mmsghdr *mmh,
(*tap_iov)[UDP_IOV_PAYLOAD].iov_len = l4len;
}
+/**
+ * udp_send_conn_fail_icmp4() - Construct and send ICMP to local peer
+ * @c: Execution context
+ * @ee: Extended error descriptor
+ * @ref: epoll reference
+ * @iov: First bytes (max 8) of original UDP message body
+ */
+static void udp_send_conn_fail_icmp4(const struct ctx *c,
+ const struct sock_extended_err *ee,
+ union epoll_ref ref,
+ struct iovec *iov)
+{
+ flow_sidx_t tosidx = flow_sidx_opposite(ref.flowside);
+ const struct flowside *toside = flowside_at_sidx(tosidx);
+ struct in_addr oaddr = toside->oaddr.v4mapped.a4;
+ struct in_addr eaddr = toside->eaddr.v4mapped.a4;
+ struct iov_tail data = IOV_TAIL(iov, 1, 0);
+ struct {
+ struct icmphdr icmp4h;
+ struct iphdr ip4h;
+ struct udphdr uh;
+ char data[8];
+ } msg;
+ size_t udplen = sizeof(msg.uh) + iov->iov_len;
+ size_t msglen = sizeof(msg.icmp4h) + sizeof(msg.ip4h) + udplen;
+
+ memset(&msg, 0, sizeof(msg));
+ msg.icmp4h.type = ee->ee_type;
+ msg.icmp4h.code = ee->ee_code;
+ tap_push_ip4h(&msg.ip4h, eaddr, oaddr, udplen, IPPROTO_UDP);
+ msg.uh.source = htons(toside->eport);
+ msg.uh.dest = htons(toside->oport);
+ msg.uh.len = htons(udplen);
+ csum_udp4(&msg.uh, oaddr, eaddr, &data);
+ memcpy(&msg.data, iov->iov_base, iov->iov_len);
+ tap_icmp4_send(c, oaddr, eaddr, &msg, msglen);
+}
+
/**
* udp_sock_recverr() - Receive and clear an error from a socket
- * @s: Socket to receive from
+ * @c: Execution context
+ * @ref: epoll reference
*
* Return: 1 if error received and processed, 0 if no more errors in queue, < 0
* if there was an error reading the queue
*
* #syscalls recvmsg
*/
-static int udp_sock_recverr(int s)
+static int udp_sock_recverr(const struct ctx *c, union epoll_ref ref)
{
const struct sock_extended_err *ee;
const struct cmsghdr *hdr;
char buf[CMSG_SPACE(sizeof(*ee))];
+ char udp_data[8];
+ int s = ref.fd;
+ struct iovec iov = {
+ .iov_base = udp_data,
+ .iov_len = sizeof(udp_data)
+ };
struct msghdr mh = {
.msg_name = NULL,
.msg_namelen = 0,
- .msg_iov = NULL,
- .msg_iovlen = 0,
+ .msg_iov = &iov,
+ .msg_iovlen = 1,
.msg_control = buf,
.msg_controllen = sizeof(buf),
};
@@ -450,8 +496,10 @@ static int udp_sock_recverr(int s)
}
ee = (const struct sock_extended_err *)CMSG_DATA(hdr);
-
- /* TODO: When possible propagate and otherwise handle errors */
+ if (ee->ee_type == ICMP_DEST_UNREACH) {
+ iov.iov_len = rc;
+ udp_send_conn_fail_icmp4(c, ee, ref, &iov);
+ }
debug("%s error on UDP socket %i: %s",
str_ee_origin(ee), s, strerror_(ee->ee_errno));
@@ -461,15 +509,16 @@ static int udp_sock_recverr(int s)
/**
* udp_sock_errs() - Process errors on a socket
* @c: Execution context
- * @s: Socket to receive from
+ * @ref: epoll reference
* @events: epoll events bitmap
*
* Return: Number of errors handled, or < 0 if we have an unrecoverable error
*/
-int udp_sock_errs(const struct ctx *c, int s, uint32_t events)
+int udp_sock_errs(const struct ctx *c, union epoll_ref ref, uint32_t events)
{
unsigned n_err = 0;
socklen_t errlen;
+ int s = ref.fd;
int rc, err;
ASSERT(!c->no_udp);
@@ -478,7 +527,7 @@ int udp_sock_errs(const struct ctx *c, int s, uint32_t events)
return 0; /* Nothing to do */
/* Empty the error queue */
- while ((rc = udp_sock_recverr(s)) > 0)
+ while ((rc = udp_sock_recverr(c, ref)) > 0)
n_err += rc;
if (rc < 0)
@@ -558,7 +607,7 @@ static void udp_buf_listen_sock_handler(const struct ctx *c,
const socklen_t sasize = sizeof(udp_meta[0].s_in);
int n, i;
- if (udp_sock_errs(c, ref.fd, events) < 0) {
+ if (udp_sock_errs(c, ref, events) < 0) {
err("UDP: Unrecoverable error on listening socket:"
" (%s port %hu)", pif_name(ref.udp.pif), ref.udp.port);
/* FIXME: what now? close/re-open socket? */
@@ -661,7 +710,7 @@ static void udp_buf_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
from_s = uflow->s[ref.flowside.sidei];
- if (udp_sock_errs(c, from_s, events) < 0) {
+ if (udp_sock_errs(c, ref, events) < 0) {
flow_err(uflow, "Unrecoverable error on reply socket");
flow_err_details(uflow);
udp_flow_close(c, uflow);
diff --git a/udp_internal.h b/udp_internal.h
index cc80e30..3b081f5 100644
--- a/udp_internal.h
+++ b/udp_internal.h
@@ -30,5 +30,5 @@ size_t udp_update_hdr4(struct iphdr *ip4h, struct udp_payload_t *bp,
size_t udp_update_hdr6(struct ipv6hdr *ip6h, struct udp_payload_t *bp,
const struct flowside *toside, size_t dlen,
bool no_udp_csum);
-int udp_sock_errs(const struct ctx *c, int s, uint32_t events);
+int udp_sock_errs(const struct ctx *c, union epoll_ref ref, uint32_t events);
#endif /* UDP_INTERNAL_H */
diff --git a/udp_vu.c b/udp_vu.c
index 4123510..c26a223 100644
--- a/udp_vu.c
+++ b/udp_vu.c
@@ -227,7 +227,7 @@ void udp_vu_listen_sock_handler(const struct ctx *c, union epoll_ref ref,
struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
int i;
- if (udp_sock_errs(c, ref.fd, events) < 0) {
+ if (udp_sock_errs(c, ref, events) < 0) {
err("UDP: Unrecoverable error on listening socket:"
" (%s port %hu)", pif_name(ref.udp.pif), ref.udp.port);
return;
@@ -302,7 +302,7 @@ void udp_vu_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
ASSERT(!c->no_udp);
- if (udp_sock_errs(c, from_s, events) < 0) {
+ if (udp_sock_errs(c, ref, events) < 0) {
flow_err(uflow, "Unrecoverable error on reply socket");
flow_err_details(uflow);
udp_flow_close(c, uflow);
--
@@ -227,7 +227,7 @@ void udp_vu_listen_sock_handler(const struct ctx *c, union epoll_ref ref,
struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
int i;
- if (udp_sock_errs(c, ref.fd, events) < 0) {
+ if (udp_sock_errs(c, ref, events) < 0) {
err("UDP: Unrecoverable error on listening socket:"
" (%s port %hu)", pif_name(ref.udp.pif), ref.udp.port);
return;
@@ -302,7 +302,7 @@ void udp_vu_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
ASSERT(!c->no_udp);
- if (udp_sock_errs(c, from_s, events) < 0) {
+ if (udp_sock_errs(c, ref, events) < 0) {
flow_err(uflow, "Unrecoverable error on reply socket");
flow_err_details(uflow);
udp_flow_close(c, uflow);
--
2.48.0
^ permalink raw reply related [flat|nested] 8+ messages in thread
* Re: [PATCH] udp: create and send ICMPv4 to local peer when applicable
2025-02-09 17:00 [PATCH] udp: create and send ICMPv4 to local peer when applicable Jon Maloy
@ 2025-02-10 10:27 ` Stefano Brivio
2025-02-11 0:55 ` David Gibson
1 sibling, 0 replies; 8+ messages in thread
From: Stefano Brivio @ 2025-02-10 10:27 UTC (permalink / raw)
To: Jon Maloy; +Cc: passt-dev, lvivier, dgibson
On Sun, 9 Feb 2025 12:00:56 -0500
Jon Maloy <jmaloy@redhat.com> wrote:
> When a local peer sends a UDP message to a non-existing port on an
> existing remote host, that host will return an ICMP message containing
> the error code ICMP_PORT_UNREACH, plus the header and the first eight
> bytes of the original message. If the sender socket has been connected,
> it uses this message to issue a "Connection Refused" event to the user.
>
> Until now, we have only read such events from the externally facing
> socket, but we don't forward them back to the local sender because
> we cannot read the ICMP message directly to user space. Because of
> this, the local peer will hang and wait for a response that never
> arrives.
I haven't had a chance to really review this yet, in general it looks
great to me (I was afraid it would be more complicated).
I have a couple of preliminary questions though:
- referring to the paragraph above: what about TCP (which is the case
where a peer might actually hang)? Do you plan to support errors for
TCP's connect() in a separate patch?
> We now fix this for IPv4 by recreating and forwarding a correct ICMP
> message back to the internal sender. We synthesize the message based
> on the information in the extended error structure, plus the returned
> part of the original message body.
- ...and what about IPv6 and NDP? Also separate patch?
In that case, would it perhaps make sense to implement and submit that
as a series so that we have a consistent behaviour to begin with?
--
Stefano
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH] udp: create and send ICMPv4 to local peer when applicable
2025-02-09 17:00 [PATCH] udp: create and send ICMPv4 to local peer when applicable Jon Maloy
2025-02-10 10:27 ` Stefano Brivio
@ 2025-02-11 0:55 ` David Gibson
2025-02-12 21:39 ` Jon Maloy
1 sibling, 1 reply; 8+ messages in thread
From: David Gibson @ 2025-02-11 0:55 UTC (permalink / raw)
To: Jon Maloy; +Cc: passt-dev, sbrivio, lvivier, dgibson
[-- Attachment #1: Type: text/plain, Size: 11048 bytes --]
On Sun, Feb 09, 2025 at 12:00:56PM -0500, Jon Maloy wrote:
> When a local peer sends a UDP message to a non-existing port on an
> existing remote host, that host will return an ICMP message containing
> the error code ICMP_PORT_UNREACH, plus the header and the first eight
> bytes of the original message. If the sender socket has been connected,
> it uses this message to issue a "Connection Refused" event to the user.
>
> Until now, we have only read such events from the externally facing
> socket, but we don't forward them back to the local sender because
> we cannot read the ICMP message directly to user space. Because of
> this, the local peer will hang and wait for a response that never
> arrives.
>
> We now fix this for IPv4 by recreating and forwarding a correct ICMP
> message back to the internal sender. We synthesize the message based
> on the information in the extended error structure, plus the returned
> part of the original message body.
>
> Note that for the sake of completeness, we even produce ICMP messages
> for other error codes. We have noticed that at least ICMP_PROT_UNREACH
> is propagated as an error event back to the user.
>
> Signed-off-by: Jon Maloy <jmaloy@redhat.com>
> ---
> tap.c | 4 +--
> tap.h | 2 ++
> udp.c | 71 ++++++++++++++++++++++++++++++++++++++++++--------
> udp_internal.h | 2 +-
> udp_vu.c | 4 +--
> 5 files changed, 67 insertions(+), 16 deletions(-)
>
> diff --git a/tap.c b/tap.c
> index cd32a90..f83aac5 100644
> --- a/tap.c
> +++ b/tap.c
> @@ -142,8 +142,8 @@ static void *tap_push_l2h(const struct ctx *c, void *buf, uint16_t proto)
> *
> * Return: pointer at which to write the packet's payload
> */
> -static void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src,
> - struct in_addr dst, size_t l4len, uint8_t proto)
> +void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src,
> + struct in_addr dst, size_t l4len, uint8_t proto)
As discussed on our call, it probably makes sense to alter this to set
the Don't Fragment bit always.
> {
> uint16_t l3len = l4len + sizeof(*ip4h);
>
> diff --git a/tap.h b/tap.h
> index dfbd8b9..3ba00c1 100644
> --- a/tap.h
> +++ b/tap.h
> @@ -44,6 +44,8 @@ static inline void tap_hdr_update(struct tap_hdr *thdr, size_t l2len)
> thdr->vnet_len = htonl(l2len);
> }
>
> +void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src,
> + struct in_addr dst, size_t l4len, uint8_t proto);
> void tap_udp4_send(const struct ctx *c, struct in_addr src, in_port_t sport,
> struct in_addr dst, in_port_t dport,
> const void *in, size_t dlen);
> diff --git a/udp.c b/udp.c
> index 923cc38..4b800b6 100644
> --- a/udp.c
> +++ b/udp.c
> @@ -87,6 +87,7 @@
> #include <netinet/in.h>
> #include <netinet/ip.h>
> #include <netinet/udp.h>
> +#include <netinet/ip_icmp.h>
> #include <stdint.h>
> #include <stddef.h>
> #include <string.h>
> @@ -402,25 +403,70 @@ static void udp_tap_prepare(const struct mmsghdr *mmh,
> (*tap_iov)[UDP_IOV_PAYLOAD].iov_len = l4len;
> }
>
> +/**
> + * udp_send_conn_fail_icmp4() - Construct and send ICMP to local peer
> + * @c: Execution context
> + * @ee: Extended error descriptor
> + * @ref: epoll reference
> + * @iov: First bytes (max 8) of original UDP message body
I don't love passing this as an iov, because that tends to imply there
might be multiple buffers, which is not the case here (both the caller
and callee require a single buffer).
> + */
> +static void udp_send_conn_fail_icmp4(const struct ctx *c,
> + const struct sock_extended_err *ee,
> + union epoll_ref ref,
> + struct iovec *iov)
> +{
> + flow_sidx_t tosidx = flow_sidx_opposite(ref.flowside);
This is subtly, but badly wrong. ref.flowside is only valid if the
ref is of a type which uses the flowside field. That's true for UDP
reply sockets, but *not* for UDP listening sockets, and
udp_sock_errs() is called on both. I think this function needs to
take an explicit flowside.
> + const struct flowside *toside = flowside_at_sidx(tosidx);
> + struct in_addr oaddr = toside->oaddr.v4mapped.a4;
> + struct in_addr eaddr = toside->eaddr.v4mapped.a4;
> + struct iov_tail data = IOV_TAIL(iov, 1, 0);
> + struct {
> + struct icmphdr icmp4h;
> + struct iphdr ip4h;
> + struct udphdr uh;
> + char data[8];
> + } msg;
Needs a ((packed)) attribute to ensure we don't get compiler padding.
It took me a minute to work out what was going on here. Specifically
that ip4h and uh here aren't the headers of the packet we're sending
now, but the reconstructed headers of the packet that prompted the
ICMP error.
> + size_t udplen = sizeof(msg.uh) + iov->iov_len;
> + size_t msglen = sizeof(msg.icmp4h) + sizeof(msg.ip4h) + udplen;
> +
> + memset(&msg, 0, sizeof(msg));
> + msg.icmp4h.type = ee->ee_type;
> + msg.icmp4h.code = ee->ee_code;
> + tap_push_ip4h(&msg.ip4h, eaddr, oaddr, udplen, IPPROTO_UDP);
This isn't quite a natural fit. It does work, but the tap_push_()
functions are kind of designed to work with a byte buffer where we
only work out the positions of headers as we construct them. Probably
a clean up for some other day.
> + msg.uh.source = htons(toside->eport);
> + msg.uh.dest = htons(toside->oport);
> + msg.uh.len = htons(udplen);
> + csum_udp4(&msg.uh, oaddr, eaddr, &data);
It might make sense to split a tap_push_uh4() function out from
tap_udp4_send() which you could re-use here.
> + memcpy(&msg.data, iov->iov_base, iov->iov_len);
> + tap_icmp4_send(c, oaddr, eaddr, &msg, msglen);
> +}
> +
> /**
> * udp_sock_recverr() - Receive and clear an error from a socket
> - * @s: Socket to receive from
> + * @c: Execution context
> + * @ref: epoll reference
> *
> * Return: 1 if error received and processed, 0 if no more errors in queue, < 0
> * if there was an error reading the queue
> *
> * #syscalls recvmsg
> */
> -static int udp_sock_recverr(int s)
> +static int udp_sock_recverr(const struct ctx *c, union epoll_ref ref)
Similarly udp_sock_recverr() is used both on "listening" and reply
sockets, so it's not really safe to use the ref here. You can
explicitly pass the fd easily enough. The flow is trickier...
> {
> const struct sock_extended_err *ee;
> const struct cmsghdr *hdr;
> char buf[CMSG_SPACE(sizeof(*ee))];
> + char udp_data[8];
> + int s = ref.fd;
> + struct iovec iov = {
> + .iov_base = udp_data,
> + .iov_len = sizeof(udp_data)
> + };
> struct msghdr mh = {
> .msg_name = NULL,
> .msg_namelen = 0,
> - .msg_iov = NULL,
> - .msg_iovlen = 0,
> + .msg_iov = &iov,
> + .msg_iovlen = 1,
> .msg_control = buf,
> .msg_controllen = sizeof(buf),
> };
> @@ -450,8 +496,10 @@ static int udp_sock_recverr(int s)
> }
>
> ee = (const struct sock_extended_err *)CMSG_DATA(hdr);
> -
> - /* TODO: When possible propagate and otherwise handle errors */
> + if (ee->ee_type == ICMP_DEST_UNREACH) {
> + iov.iov_len = rc;
> + udp_send_conn_fail_icmp4(c, ee, ref, &iov);
> + }
> debug("%s error on UDP socket %i: %s",
> str_ee_origin(ee), s, strerror_(ee->ee_errno));
>
> @@ -461,15 +509,16 @@ static int udp_sock_recverr(int s)
> /**
> * udp_sock_errs() - Process errors on a socket
> * @c: Execution context
> - * @s: Socket to receive from
> + * @ref: epoll reference
> * @events: epoll events bitmap
> *
> * Return: Number of errors handled, or < 0 if we have an unrecoverable error
> */
> -int udp_sock_errs(const struct ctx *c, int s, uint32_t events)
> +int udp_sock_errs(const struct ctx *c, union epoll_ref ref, uint32_t events)
> {
> unsigned n_err = 0;
> socklen_t errlen;
> + int s = ref.fd;
> int rc, err;
>
> ASSERT(!c->no_udp);
> @@ -478,7 +527,7 @@ int udp_sock_errs(const struct ctx *c, int s, uint32_t events)
> return 0; /* Nothing to do */
>
> /* Empty the error queue */
> - while ((rc = udp_sock_recverr(s)) > 0)
> + while ((rc = udp_sock_recverr(c, ref)) > 0)
> n_err += rc;
>
> if (rc < 0)
> @@ -558,7 +607,7 @@ static void udp_buf_listen_sock_handler(const struct ctx *c,
> const socklen_t sasize = sizeof(udp_meta[0].s_in);
> int n, i;
>
> - if (udp_sock_errs(c, ref.fd, events) < 0) {
> + if (udp_sock_errs(c, ref, events) < 0) {
... because in the listening socket case we don't know the flow until
we've looked at the packet. I'd be tempted as a first cut to only do
the ICMP thing for errors on reply sockets.
Listening sockets will be trickier. I think we'll have to actually
check msg_name in the received error in order to work out which flow
it belongs to.
> err("UDP: Unrecoverable error on listening socket:"
> " (%s port %hu)", pif_name(ref.udp.pif), ref.udp.port);
> /* FIXME: what now? close/re-open socket? */
> @@ -661,7 +710,7 @@ static void udp_buf_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
>
> from_s = uflow->s[ref.flowside.sidei];
>
> - if (udp_sock_errs(c, from_s, events) < 0) {
> + if (udp_sock_errs(c, ref, events) < 0) {
> flow_err(uflow, "Unrecoverable error on reply socket");
> flow_err_details(uflow);
> udp_flow_close(c, uflow);
> diff --git a/udp_internal.h b/udp_internal.h
> index cc80e30..3b081f5 100644
> --- a/udp_internal.h
> +++ b/udp_internal.h
> @@ -30,5 +30,5 @@ size_t udp_update_hdr4(struct iphdr *ip4h, struct udp_payload_t *bp,
> size_t udp_update_hdr6(struct ipv6hdr *ip6h, struct udp_payload_t *bp,
> const struct flowside *toside, size_t dlen,
> bool no_udp_csum);
> -int udp_sock_errs(const struct ctx *c, int s, uint32_t events);
> +int udp_sock_errs(const struct ctx *c, union epoll_ref ref, uint32_t events);
> #endif /* UDP_INTERNAL_H */
> diff --git a/udp_vu.c b/udp_vu.c
> index 4123510..c26a223 100644
> --- a/udp_vu.c
> +++ b/udp_vu.c
> @@ -227,7 +227,7 @@ void udp_vu_listen_sock_handler(const struct ctx *c, union epoll_ref ref,
> struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
> int i;
>
> - if (udp_sock_errs(c, ref.fd, events) < 0) {
> + if (udp_sock_errs(c, ref, events) < 0) {
> err("UDP: Unrecoverable error on listening socket:"
> " (%s port %hu)", pif_name(ref.udp.pif), ref.udp.port);
> return;
> @@ -302,7 +302,7 @@ void udp_vu_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
>
> ASSERT(!c->no_udp);
>
> - if (udp_sock_errs(c, from_s, events) < 0) {
> + if (udp_sock_errs(c, ref, events) < 0) {
> flow_err(uflow, "Unrecoverable error on reply socket");
> flow_err_details(uflow);
> udp_flow_close(c, uflow);
--
David Gibson (he or they) | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you, not the other way
| around.
http://www.ozlabs.org/~dgibson
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH] udp: create and send ICMPv4 to local peer when applicable
2025-02-11 0:55 ` David Gibson
@ 2025-02-12 21:39 ` Jon Maloy
2025-02-12 23:21 ` David Gibson
2025-02-13 19:07 ` Jon Maloy
0 siblings, 2 replies; 8+ messages in thread
From: Jon Maloy @ 2025-02-12 21:39 UTC (permalink / raw)
To: David Gibson; +Cc: passt-dev, sbrivio, lvivier, dgibson
On 2025-02-10 19:55, David Gibson wrote:
> On Sun, Feb 09, 2025 at 12:00:56PM -0500, Jon Maloy wrote:
>> When a local peer sends a UDP message to a non-existing port on an
>> existing remote host, that host will return an ICMP message containing
>> the error code ICMP_PORT_UNREACH, plus the header and the first eight
>> bytes of the original message. If the sender socket has been connected,
>> it uses this message to issue a "Connection Refused" event to the user.
[...]
>> +void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src,
>> + struct in_addr dst, size_t l4len, uint8_t proto)
>
> As discussed on our call, it probably makes sense to alter this to set
> the Don't Fragment bit always.
I have been studying the Linux code, but this is handled in so many
places that it it hard to get a full grip on it. Wireshark logs seem
to indicate that it is always set, and googling a bit on the subject
indicates the same. I think it is safe to do it, so if you agree I
can post a prepending patch setting this.
>
>> {
>> uint16_t l3len = l4len + sizeof(*ip4h);
>>
>> diff --git a/tap.h b/tap.h
>> index dfbd8b9..3ba00c1 100644
>> --- a/tap.h
>> +++ b/tap.h
>> @@ -44,6 +44,8 @@ static inline void tap_hdr_update(struct tap_hdr *thdr, size_t l2len)
>> thdr->vnet_len = htonl(l2len);
>> }
>>
[...]
>> + * udp_send_conn_fail_icmp4() - Construct and send ICMP to local peer
>> + * @c: Execution context
>> + * @ee: Extended error descriptor
>> + * @ref: epoll reference
>> + * @iov: First bytes (max 8) of original UDP message body
>
> I don't love passing this as an iov, because that tends to imply there
> might be multiple buffers, which is not the case here (both the caller
> and callee require a single buffer).
The real reason is that csum_udp4() requires an iov, so if we don't
pass it here we will have to recreate it inside the new function.
>
>> + */
>> +static void udp_send_conn_fail_icmp4(const struct ctx *c,
>> + const struct sock_extended_err *ee,
>> + union epoll_ref ref,
>> + struct iovec *iov)
>
>
>
>> +{
>> + flow_sidx_t tosidx = flow_sidx_opposite(ref.flowside);
>
> This is subtly, but badly wrong. ref.flowside is only valid if the
> ref is of a type which uses the flowside field. That's true for UDP
> reply sockets, but *not* for UDP listening sockets, and
> udp_sock_errs() is called on both. I think this function needs to
> take an explicit flowside.
Yes, I was a little uncertain about this. Can you give me small code
snippet of how this should be done?
>
>> + const struct flowside *toside = flowside_at_sidx(tosidx);
>> + struct in_addr oaddr = toside->oaddr.v4mapped.a4;
>> + struct in_addr eaddr = toside->eaddr.v4mapped.a4;
>> + struct iov_tail data = IOV_TAIL(iov, 1, 0);
>> + struct {
>> + struct icmphdr icmp4h;
>> + struct iphdr ip4h;
>> + struct udphdr uh;
>> + char data[8];
>> + } msg;
>
> Needs a ((packed)) attribute to ensure we don't get compiler padding.
>
ok.
> It took me a minute to work out what was going on here. Specifically
> that ip4h and uh here aren't the headers of the packet we're sending
> now, but the reconstructed headers of the packet that prompted the
> ICMP error.
Yes, I try to re-create the original message as closely as possible.
>
>> + size_t udplen = sizeof(msg.uh) + iov->iov_len;
>> + size_t msglen = sizeof(msg.icmp4h) + sizeof(msg.ip4h) + udplen;
>> +
>> + memset(&msg, 0, sizeof(msg));
>> + msg.icmp4h.type = ee->ee_type;
>> + msg.icmp4h.code = ee->ee_code;
>> + tap_push_ip4h(&msg.ip4h, eaddr, oaddr, udplen, IPPROTO_UDP);
>
> This isn't quite a natural fit. It does work, but the tap_push_()
> functions are kind of designed to work with a byte buffer where we
> only work out the positions of headers as we construct them. Probably
> a clean up for some other day.
>
>> + msg.uh.source = htons(toside->eport);
>> + msg.uh.dest = htons(toside->oport);
>> + msg.uh.len = htons(udplen);
>> + csum_udp4(&msg.uh, oaddr, eaddr, &data);
>
> It might make sense to split a tap_push_uh4() function out from
> tap_udp4_send() which you could re-use here.
Good point.
>
>> + memcpy(&msg.data, iov->iov_base, iov->iov_len);
>> + tap_icmp4_send(c, oaddr, eaddr, &msg, msglen);
>> +}
>> +
>> /**
>> * udp_sock_recverr() - Receive and clear an error from a socket
>> - * @s: Socket to receive from
>> + * @c: Execution context
>> + * @ref: epoll reference
>> *
>> * Return: 1 if error received and processed, 0 if no more errors in queue, < 0
>> * if there was an error reading the queue
>> *
>> * #syscalls recvmsg
>> */
>> -static int udp_sock_recverr(int s)
>> +static int udp_sock_recverr(const struct ctx *c, union epoll_ref ref)
>
> Similarly udp_sock_recverr() is used both on "listening" and reply
> sockets, so it's not really safe to use the ref here. You can
> explicitly pass the fd easily enough. The flow is trickier...
I can pass *both* the fd and ref, alternativedly the flowside, but
it looks redundant and adds extra code.
If I add a flowside struct instead of ref I will need to declare
that at the all the multiple locations where udp_sock_errs().
Do you have a good suggestion?
>
>> {
>> const struct sock_extended_err *ee;
>> const struct cmsghdr *hdr;
>> char buf[CMSG_SPACE(sizeof(*ee))];
>> + char udp_data[8];
>> + int s = ref.fd;
>> + struct iovec iov = {
>> + .iov_base = udp_data,
>> + .iov_len = sizeof(udp_data)
[...]
>> if (rc < 0)
>> @@ -558,7 +607,7 @@ static void udp_buf_listen_sock_handler(const struct ctx *c,
>> const socklen_t sasize = sizeof(udp_meta[0].s_in);
>> int n, i;
>>
>> - if (udp_sock_errs(c, ref.fd, events) < 0) {
>> + if (udp_sock_errs(c, ref, events) < 0) {
>
> ... because in the listening socket case we don't know the flow until
> we've looked at the packet. I'd be tempted as a first cut to only do
> the ICMP thing for errors on reply sockets.
I can try that, but it would feel a little reduced.
>
> Listening sockets will be trickier. I think we'll have to actually
> check msg_name in the received error in order to work out which flow
> it belongs to.
Actually, I did that first, until I realized I could extract the source
IP address from the flow object. But how does that help?
///jon
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH] udp: create and send ICMPv4 to local peer when applicable
2025-02-12 21:39 ` Jon Maloy
@ 2025-02-12 23:21 ` David Gibson
2025-02-13 19:07 ` Jon Maloy
1 sibling, 0 replies; 8+ messages in thread
From: David Gibson @ 2025-02-12 23:21 UTC (permalink / raw)
To: Jon Maloy; +Cc: passt-dev, sbrivio, lvivier, dgibson
[-- Attachment #1: Type: text/plain, Size: 8526 bytes --]
On Wed, Feb 12, 2025 at 04:39:07PM -0500, Jon Maloy wrote:
> On 2025-02-10 19:55, David Gibson wrote:
> > On Sun, Feb 09, 2025 at 12:00:56PM -0500, Jon Maloy wrote:
> > > When a local peer sends a UDP message to a non-existing port on an
> > > existing remote host, that host will return an ICMP message containing
> > > the error code ICMP_PORT_UNREACH, plus the header and the first eight
> > > bytes of the original message. If the sender socket has been connected,
> > > it uses this message to issue a "Connection Refused" event to the user.
>
> [...]
>
> > > +void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src,
> > > + struct in_addr dst, size_t l4len, uint8_t proto)
> >
> > As discussed on our call, it probably makes sense to alter this to set
> > the Don't Fragment bit always.
>
> I have been studying the Linux code, but this is handled in so many places
> that it it hard to get a full grip on it. Wireshark logs seem
> to indicate that it is always set, and googling a bit on the subject
> indicates the same. I think it is safe to do it, so if you agree I
> can post a prepending patch setting this.
I'm not 100% confident 100% pedantically correct. I'm pretty
confident it won't cause problems though.
>
> >
> > > {
> > > uint16_t l3len = l4len + sizeof(*ip4h);
> > > diff --git a/tap.h b/tap.h
> > > index dfbd8b9..3ba00c1 100644
> > > --- a/tap.h
> > > +++ b/tap.h
> > > @@ -44,6 +44,8 @@ static inline void tap_hdr_update(struct tap_hdr *thdr, size_t l2len)
> > > thdr->vnet_len = htonl(l2len);
> > > }
>
> [...]
> > > + * udp_send_conn_fail_icmp4() - Construct and send ICMP to local peer
> > > + * @c: Execution context
> > > + * @ee: Extended error descriptor
> > > + * @ref: epoll reference
> > > + * @iov: First bytes (max 8) of original UDP message body
> >
> > I don't love passing this as an iov, because that tends to imply there
> > might be multiple buffers, which is not the case here (both the caller
> > and callee require a single buffer).
>
> The real reason is that csum_udp4() requires an iov, so if we don't
> pass it here we will have to recreate it inside the new function.
Yeah, I guess so. Ok, I suppose it makes more sense than not here.
> > > + */
> > > +static void udp_send_conn_fail_icmp4(const struct ctx *c,
> > > + const struct sock_extended_err *ee,
> > > + union epoll_ref ref,
> > > + struct iovec *iov)
> >
> >
> >
> > > +{
> > > + flow_sidx_t tosidx = flow_sidx_opposite(ref.flowside);
> >
> > This is subtly, but badly wrong. ref.flowside is only valid if the
> > ref is of a type which uses the flowside field. That's true for UDP
> > reply sockets, but *not* for UDP listening sockets, and
> > udp_sock_errs() is called on both. I think this function needs to
> > take an explicit flowside.
>
> Yes, I was a little uncertain about this. Can you give me small code
> snippet of how this should be done?
Small code for what example? For finding the flow associated with a
listening socket, look at udp_flow_from_sock(), specifically the part
before flow_alloc(). I think we'll need to either duplicate or split
that out into a helper to use for the error path.
I'm pretty sure the flow lookup needs to be done in the caller, not
here, though, so you'll want to pass it.
> > > + const struct flowside *toside = flowside_at_sidx(tosidx);
> > > + struct in_addr oaddr = toside->oaddr.v4mapped.a4;
> > > + struct in_addr eaddr = toside->eaddr.v4mapped.a4;
> > > + struct iov_tail data = IOV_TAIL(iov, 1, 0);
> > > + struct {
> > > + struct icmphdr icmp4h;
> > > + struct iphdr ip4h;
> > > + struct udphdr uh;
> > > + char data[8];
> > > + } msg;
> >
> > Needs a ((packed)) attribute to ensure we don't get compiler padding.
> >
> ok.
> > It took me a minute to work out what was going on here. Specifically
> > that ip4h and uh here aren't the headers of the packet we're sending
> > now, but the reconstructed headers of the packet that prompted the
> > ICMP error.
>
> Yes, I try to re-create the original message as closely as possible.
Right, and correctly so, but a comment might make it easier to make
that leap for someone new looking at it.
> > > + size_t udplen = sizeof(msg.uh) + iov->iov_len;
> > > + size_t msglen = sizeof(msg.icmp4h) + sizeof(msg.ip4h) + udplen;
> > > +
> > > + memset(&msg, 0, sizeof(msg));
> > > + msg.icmp4h.type = ee->ee_type;
> > > + msg.icmp4h.code = ee->ee_code;
> > > + tap_push_ip4h(&msg.ip4h, eaddr, oaddr, udplen, IPPROTO_UDP);
> >
> > This isn't quite a natural fit. It does work, but the tap_push_()
> > functions are kind of designed to work with a byte buffer where we
> > only work out the positions of headers as we construct them. Probably
> > a clean up for some other day.
> >
> > > + msg.uh.source = htons(toside->eport);
> > > + msg.uh.dest = htons(toside->oport);
> > > + msg.uh.len = htons(udplen);
> > > + csum_udp4(&msg.uh, oaddr, eaddr, &data);
> >
> > It might make sense to split a tap_push_uh4() function out from
> > tap_udp4_send() which you could re-use here.
>
> Good point.
> >
> > > + memcpy(&msg.data, iov->iov_base, iov->iov_len);
> > > + tap_icmp4_send(c, oaddr, eaddr, &msg, msglen);
> > > +}
> > > +
> > > /**
> > > * udp_sock_recverr() - Receive and clear an error from a socket
> > > - * @s: Socket to receive from
> > > + * @c: Execution context
> > > + * @ref: epoll reference
> > > *
> > > * Return: 1 if error received and processed, 0 if no more errors in queue, < 0
> > > * if there was an error reading the queue
> > > *
> > > * #syscalls recvmsg
> > > */
> > > -static int udp_sock_recverr(int s)
> > > +static int udp_sock_recverr(const struct ctx *c, union epoll_ref ref)
> >
> > Similarly udp_sock_recverr() is used both on "listening" and reply
> > sockets, so it's not really safe to use the ref here. You can
> > explicitly pass the fd easily enough. The flow is trickier...
>
> I can pass *both* the fd and ref, alternativedly the flowside, but
> it looks redundant and adds extra code.
> If I add a flowside struct instead of ref I will need to declare
> that at the all the multiple locations where udp_sock_errs().
> Do you have a good suggestion?
Sorry, I didn't think that through all the way. You will want to pass
the ref here, but you'll need to examine the ref type to work out how
to determine the flow. For EPOLL_TYPE_UDP_REPLY you can just look at
ref.flowside. However for EPOLL_TYPE_UDP_LISTEN you'll need to read
the error first, then look up the flow like udp_flow_from_sock() (but
not allowing new flows to be created).
> > > {
> > > const struct sock_extended_err *ee;
> > > const struct cmsghdr *hdr;
> > > char buf[CMSG_SPACE(sizeof(*ee))];
> > > + char udp_data[8];
> > > + int s = ref.fd;
> > > + struct iovec iov = {
> > > + .iov_base = udp_data,
> > > + .iov_len = sizeof(udp_data)
>
> [...]
>
> > > if (rc < 0)
> > > @@ -558,7 +607,7 @@ static void udp_buf_listen_sock_handler(const struct ctx *c,
> > > const socklen_t sasize = sizeof(udp_meta[0].s_in);
> > > int n, i;
> > > - if (udp_sock_errs(c, ref.fd, events) < 0) {
> > > + if (udp_sock_errs(c, ref, events) < 0) {
> >
> > ... because in the listening socket case we don't know the flow until
> > we've looked at the packet. I'd be tempted as a first cut to only do
> > the ICMP thing for errors on reply sockets.
>
> I can try that, but it would feel a little reduced.
It would be, but there's nothing wrong with doing one thing at a time.
> > Listening sockets will be trickier. I think we'll have to actually
> > check msg_name in the received error in order to work out which flow
> > it belongs to.
>
> Actually, I did that first, until I realized I could extract the source
> IP address from the flow object. But how does that help?
That only works if you already know the right flow. You do for
EPOLL_TYPE_UDP_REPLY, but "listening" sockets (EPOLL_TYPE_UDP_LISTEN)
handle multiple flows each so it's impossible to know the flow from
the ref alone. You'll have to examine msg_name to figure it out.
--
David Gibson (he or they) | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you, not the other way
| around.
http://www.ozlabs.org/~dgibson
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH] udp: create and send ICMPv4 to local peer when applicable
2025-02-12 21:39 ` Jon Maloy
2025-02-12 23:21 ` David Gibson
@ 2025-02-13 19:07 ` Jon Maloy
2025-02-13 19:17 ` Stefano Brivio
1 sibling, 1 reply; 8+ messages in thread
From: Jon Maloy @ 2025-02-13 19:07 UTC (permalink / raw)
To: David Gibson; +Cc: passt-dev, sbrivio, lvivier, dgibson
On 2025-02-12 16:39, Jon Maloy wrote:
>
>
> On 2025-02-10 19:55, David Gibson wrote:
>> On Sun, Feb 09, 2025 at 12:00:56PM -0500, Jon Maloy wrote:
>>> When a local peer sends a UDP message to a non-existing port on an
>>> existing remote host, that host will return an ICMP message containing
>>> the error code ICMP_PORT_UNREACH, plus the header and the first eight
>>> bytes of the original message. If the sender socket has been connected,
>>> it uses this message to issue a "Connection Refused" event to the user.
>
> [...]
>
>>> +void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src,
>>> + struct in_addr dst, size_t l4len, uint8_t proto)
>>
>> As discussed on our call, it probably makes sense to alter this to set
>> the Don't Fragment bit always.
>
> I have been studying the Linux code, but this is handled in so many
> places that it it hard to get a full grip on it. Wireshark logs seem
> to indicate that it is always set, and googling a bit on the subject
> indicates the same. I think it is safe to do it, so if you agree I
> can post a prepending patch setting this.
>
>>
>>> {
>>> uint16_t l3len = l4len + sizeof(*ip4h);
>>> diff --git a/tap.h b/tap.h
>>> index dfbd8b9..3ba00c1 100644
>>> --- a/tap.h
>>> +++ b/tap.h
>>> @@ -44,6 +44,8 @@ static inline void tap_hdr_update(struct tap_hdr
>>> *thdr, size_t l2len)
>>> thdr->vnet_len = htonl(l2len);
>>> }
>
> [...]
>>> + * udp_send_conn_fail_icmp4() - Construct and send ICMP to local peer
>>> + * @c: Execution context
>>> + * @ee: Extended error descriptor
>>> + * @ref: epoll reference
>>> + * @iov: First bytes (max 8) of original UDP message body
>>
>> I don't love passing this as an iov, because that tends to imply there
>> might be multiple buffers, which is not the case here (both the caller
>> and callee require a single buffer).
>
> The real reason is that csum_udp4() requires an iov, so if we don't
> pass it here we will have to recreate it inside the new function.
>
>>
>>> + */
>>> +static void udp_send_conn_fail_icmp4(const struct ctx *c,
>>> + const struct sock_extended_err *ee,
>>> + union epoll_ref ref,
>>> + struct iovec *iov)
>>
>>
>>
>>> +{
>>> + flow_sidx_t tosidx = flow_sidx_opposite(ref.flowside);
>>
>> This is subtly, but badly wrong. ref.flowside is only valid if the
>> ref is of a type which uses the flowside field. That's true for UDP
>> reply sockets, but *not* for UDP listening sockets, and
>> udp_sock_errs() is called on both. I think this function needs to
>> take an explicit flowside.
>
> Yes, I was a little uncertain about this. Can you give me small code
> snippet of how this should be done?
>
>>
>>> + const struct flowside *toside = flowside_at_sidx(tosidx);
>>> + struct in_addr oaddr = toside->oaddr.v4mapped.a4;
>>> + struct in_addr eaddr = toside->eaddr.v4mapped.a4;
>>> + struct iov_tail data = IOV_TAIL(iov, 1, 0);
>>> + struct {
>>> + struct icmphdr icmp4h;
>>> + struct iphdr ip4h;
>>> + struct udphdr uh;
>>> + char data[8];
>>> + } msg;
>>
>> Needs a ((packed)) attribute to ensure we don't get compiler padding.
I get
udp.c:437:23: warning: taking address of packed member of ‘struct
<anonymous>’ may result in an unaligned pointer value
[-Waddress-of-packed-member]
437 | tap_push_ip4h(&msg.ip4h, eaddr, oaddr, udplen,
IPPROTO_UDP);
Also, the sizes of these members are 8+20+8+8, so I cannot see how
this struct can possibly be packed or how there can be an unaligned
pointer.
In short: ((packed)) seems unnecessary, and only causes problems.
/jon
>>
> ok.
>> It took me a minute to work out what was going on here. Specifically
>> that ip4h and uh here aren't the headers of the packet we're sending
>> now, but the reconstructed headers of the packet that prompted the
>> ICMP error.
>
> Yes, I try to re-create the original message as closely as possible.
>
>>
>>> + size_t udplen = sizeof(msg.uh) + iov->iov_len;
>>> + size_t msglen = sizeof(msg.icmp4h) + sizeof(msg.ip4h) + udplen;
>>> +
>>> + memset(&msg, 0, sizeof(msg));
>>> + msg.icmp4h.type = ee->ee_type;
>>> + msg.icmp4h.code = ee->ee_code;
>>> + tap_push_ip4h(&msg.ip4h, eaddr, oaddr, udplen, IPPROTO_UDP);
>>
>> This isn't quite a natural fit. It does work, but the tap_push_()
>> functions are kind of designed to work with a byte buffer where we
>> only work out the positions of headers as we construct them. Probably
>> a clean up for some other day.
>>
>>> + msg.uh.source = htons(toside->eport);
>>> + msg.uh.dest = htons(toside->oport);
>>> + msg.uh.len = htons(udplen);
>>> + csum_udp4(&msg.uh, oaddr, eaddr, &data);
>>
>> It might make sense to split a tap_push_uh4() function out from
>> tap_udp4_send() which you could re-use here.
>
> Good point.
>>
>>> + memcpy(&msg.data, iov->iov_base, iov->iov_len);
>>> + tap_icmp4_send(c, oaddr, eaddr, &msg, msglen);
>>> +}
>>> +
>>> /**
>>> * udp_sock_recverr() - Receive and clear an error from a socket
>>> - * @s: Socket to receive from
>>> + * @c: Execution context
>>> + * @ref: epoll reference
>>> *
>>> * Return: 1 if error received and processed, 0 if no more errors
>>> in queue, < 0
>>> * if there was an error reading the queue
>>> *
>>> * #syscalls recvmsg
>>> */
>>> -static int udp_sock_recverr(int s)
>>> +static int udp_sock_recverr(const struct ctx *c, union epoll_ref ref)
>>
>> Similarly udp_sock_recverr() is used both on "listening" and reply
>> sockets, so it's not really safe to use the ref here. You can
>> explicitly pass the fd easily enough. The flow is trickier...
>
> I can pass *both* the fd and ref, alternativedly the flowside, but
> it looks redundant and adds extra code.
> If I add a flowside struct instead of ref I will need to declare
> that at the all the multiple locations where udp_sock_errs().
> Do you have a good suggestion?
>
>>
>>> {
>>> const struct sock_extended_err *ee;
>>> const struct cmsghdr *hdr;
>>> char buf[CMSG_SPACE(sizeof(*ee))];
>>> + char udp_data[8];
>>> + int s = ref.fd;
>>> + struct iovec iov = {
>>> + .iov_base = udp_data,
>>> + .iov_len = sizeof(udp_data)
>
> [...]
>
>>> if (rc < 0)
>>> @@ -558,7 +607,7 @@ static void udp_buf_listen_sock_handler(const
>>> struct ctx *c,
>>> const socklen_t sasize = sizeof(udp_meta[0].s_in);
>>> int n, i;
>>> - if (udp_sock_errs(c, ref.fd, events) < 0) {
>>> + if (udp_sock_errs(c, ref, events) < 0) {
>>
>> ... because in the listening socket case we don't know the flow until
>> we've looked at the packet. I'd be tempted as a first cut to only do
>> the ICMP thing for errors on reply sockets.
>
> I can try that, but it would feel a little reduced.
>
>>
>> Listening sockets will be trickier. I think we'll have to actually
>> check msg_name in the received error in order to work out which flow
>> it belongs to.
>
> Actually, I did that first, until I realized I could extract the source
> IP address from the flow object. But how does that help?
>
> ///jon
>
>
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH] udp: create and send ICMPv4 to local peer when applicable
2025-02-13 19:07 ` Jon Maloy
@ 2025-02-13 19:17 ` Stefano Brivio
2025-02-13 19:28 ` Jon Maloy
0 siblings, 1 reply; 8+ messages in thread
From: Stefano Brivio @ 2025-02-13 19:17 UTC (permalink / raw)
To: Jon Maloy; +Cc: David Gibson, passt-dev, lvivier, dgibson
On Thu, 13 Feb 2025 14:07:22 -0500
Jon Maloy <jmaloy@redhat.com> wrote:
> On 2025-02-12 16:39, Jon Maloy wrote:
> > On 2025-02-10 19:55, David Gibson wrote:
> >> On Sun, Feb 09, 2025 at 12:00:56PM -0500, Jon Maloy wrote:
> >>> + const struct flowside *toside = flowside_at_sidx(tosidx);
> >>> + struct in_addr oaddr = toside->oaddr.v4mapped.a4;
> >>> + struct in_addr eaddr = toside->eaddr.v4mapped.a4;
> >>> + struct iov_tail data = IOV_TAIL(iov, 1, 0);
> >>> + struct {
> >>> + struct icmphdr icmp4h;
> >>> + struct iphdr ip4h;
> >>> + struct udphdr uh;
> >>> + char data[8];
> >>> + } msg;
> >>
> >> Needs a ((packed)) attribute to ensure we don't get compiler padding.
>
> I get
> udp.c:437:23: warning: taking address of packed member of ‘struct
> <anonymous>’ may result in an unaligned pointer value
> [-Waddress-of-packed-member]
> 437 | tap_push_ip4h(&msg.ip4h, eaddr, oaddr, udplen,
> IPPROTO_UDP);
>
> Also, the sizes of these members are 8+20+8+8, so I cannot see how
> this struct can possibly be packed or how there can be an unaligned
> pointer.
Unaligned pointer because it's on the stack, and it can happily be at a
semi-random point of it. Packed needs to be packed because you could
have 8 + 20 + ("oops we are at 28 let's make it 32") 4 + 8 + 8.
> In short: ((packed)) seems unnecessary, and only causes problems.
...well, but it still should be ((packed)) because it goes on the wire,
and ((aligned)) because you need to access the single fields:
__attribute__((packed, aligned(__alignof__(max_align_t))));
meaning: align it to whatever any kind of machine might possibly need.
--
Stefano
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH] udp: create and send ICMPv4 to local peer when applicable
2025-02-13 19:17 ` Stefano Brivio
@ 2025-02-13 19:28 ` Jon Maloy
0 siblings, 0 replies; 8+ messages in thread
From: Jon Maloy @ 2025-02-13 19:28 UTC (permalink / raw)
To: Stefano Brivio; +Cc: David Gibson, passt-dev, lvivier, dgibson
On 2025-02-13 14:17, Stefano Brivio wrote:
> On Thu, 13 Feb 2025 14:07:22 -0500
> Jon Maloy <jmaloy@redhat.com> wrote:
>
>> On 2025-02-12 16:39, Jon Maloy wrote:
>>> On 2025-02-10 19:55, David Gibson wrote:
>>>> On Sun, Feb 09, 2025 at 12:00:56PM -0500, Jon Maloy wrote:
>>>>> + const struct flowside *toside = flowside_at_sidx(tosidx);
>>>>> + struct in_addr oaddr = toside->oaddr.v4mapped.a4;
>>>>> + struct in_addr eaddr = toside->eaddr.v4mapped.a4;
>>>>> + struct iov_tail data = IOV_TAIL(iov, 1, 0);
>>>>> + struct {
>>>>> + struct icmphdr icmp4h;
>>>>> + struct iphdr ip4h;
>>>>> + struct udphdr uh;
>>>>> + char data[8];
>>>>> + } msg;
>>>>
>>>> Needs a ((packed)) attribute to ensure we don't get compiler padding.
>>
>> I get
>> udp.c:437:23: warning: taking address of packed member of ‘struct
>> <anonymous>’ may result in an unaligned pointer value
>> [-Waddress-of-packed-member]
>> 437 | tap_push_ip4h(&msg.ip4h, eaddr, oaddr, udplen,
>> IPPROTO_UDP);
>>
>> Also, the sizes of these members are 8+20+8+8, so I cannot see how
>> this struct can possibly be packed or how there can be an unaligned
>> pointer.
>
> Unaligned pointer because it's on the stack, and it can happily be at a
> semi-random point of it. Packed needs to be packed because you could
> have 8 + 20 + ("oops we are at 28 let's make it 32") 4 + 8 + 8.
>
>> In short: ((packed)) seems unnecessary, and only causes problems.
>
> ...well, but it still should be ((packed)) because it goes on the wire,
> and ((aligned)) because you need to access the single fields:
>
> __attribute__((packed, aligned(__alignof__(max_align_t))));
>
> meaning: align it to whatever any kind of machine might possibly need.
>
Ok. That worked. Thanks.
^ permalink raw reply [flat|nested] 8+ messages in thread
end of thread, other threads:[~2025-02-13 19:28 UTC | newest]
Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-02-09 17:00 [PATCH] udp: create and send ICMPv4 to local peer when applicable Jon Maloy
2025-02-10 10:27 ` Stefano Brivio
2025-02-11 0:55 ` David Gibson
2025-02-12 21:39 ` Jon Maloy
2025-02-12 23:21 ` David Gibson
2025-02-13 19:07 ` Jon Maloy
2025-02-13 19:17 ` Stefano Brivio
2025-02-13 19:28 ` Jon Maloy
Code repositories for project(s) associated with this public inbox
https://passt.top/passt
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for IMAP folder(s).