From: Jon Maloy <jmaloy@redhat.com>
To: passt-dev@passt.top, sbrivio@redhat.com, lvivier@redhat.com,
dgibson@redhat.com, jmaloy@redhat.com
Subject: [PATCH v3 2/2] udp: create and send ICMPv4 to local peer when applicable
Date: Wed, 19 Feb 2025 14:30:07 -0500 [thread overview]
Message-ID: <20250219193007.2336670-3-jmaloy@redhat.com> (raw)
In-Reply-To: <20250219193007.2336670-1-jmaloy@redhat.com>
When a local peer sends a UDP message to a non-existing port on an
existing remote host, that host will return an ICMP message containing
the error code ICMP_PORT_UNREACH, plus the header and the first eight
bytes of the original message. If the sender socket has been connected,
it uses this message to issue a "Connection Refused" event to the user.
Until now, we have only read such events from the externally facing
socket, but we haven't forwarded them back to the local sender, because
we cannot read the ICMP message itself directly from user space. Because
of this, the local peer will hang and wait for a response that never
arrives.
We now fix this for IPv4 by recreating and forwarding a correct ICMP
message back to the internal sender. We synthesize the message based
on the information in the extended error structure, plus the returned
part of the original message body.
Note that for the sake of completeness, we even produce ICMP messages
for other error codes. We have noticed that at least ICMP_PROT_UNREACH
is propagated as an error event back to the user.
Signed-off-by: Jon Maloy <jmaloy@redhat.com>
---
v2: - Updated the ICMP creation to use the new function tap_push_uh4().
- Added logic to find the correct flow, depending on origin.
- All done after feedback from David Gibson.
v3: - Passing parameter 'now' along with the call to udp_sock_errs().
- Corrected lookup of flow from listener socket
- All done after feedback from David Gibson.
---
tap.c | 4 +--
tap.h | 3 ++
udp.c | 96 +++++++++++++++++++++++++++++++++++++++++++-------
udp_internal.h | 3 +-
udp_vu.c | 4 +--
5 files changed, 92 insertions(+), 18 deletions(-)
diff --git a/tap.c b/tap.c
index 95d64bf..902f076 100644
--- a/tap.c
+++ b/tap.c
@@ -142,8 +142,8 @@ static void *tap_push_l2h(const struct ctx *c, void *buf, uint16_t proto)
*
* Return: pointer at which to write the packet's payload
*/
-static void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src,
- struct in_addr dst, size_t l4len, uint8_t proto)
+void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src,
+ struct in_addr dst, size_t l4len, uint8_t proto)
{
uint16_t l3len = l4len + sizeof(*ip4h);
diff --git a/tap.h b/tap.h
index 3451343..5b326f5 100644
--- a/tap.h
+++ b/tap.h
@@ -45,9 +45,12 @@ static inline void tap_hdr_update(struct tap_hdr *thdr, size_t l2len)
if (thdr)
thdr->vnet_len = htonl(l2len);
}
+
void *tap_push_uh4(struct udphdr *uh, struct in_addr src, in_port_t sport,
struct in_addr dst, in_port_t dport,
const void *in, size_t dlen);
+void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src,
+ struct in_addr dst, size_t l4len, uint8_t proto);
void tap_udp4_send(const struct ctx *c, struct in_addr src, in_port_t sport,
struct in_addr dst, in_port_t dport,
const void *in, size_t dlen);
diff --git a/udp.c b/udp.c
index 923cc38..b9c53eb 100644
--- a/udp.c
+++ b/udp.c
@@ -87,6 +87,7 @@
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/udp.h>
+#include <netinet/ip_icmp.h>
#include <stdint.h>
#include <stddef.h>
#include <string.h>
@@ -402,25 +403,72 @@ static void udp_tap_prepare(const struct mmsghdr *mmh,
(*tap_iov)[UDP_IOV_PAYLOAD].iov_len = l4len;
}
+/**
+ * udp_send_conn_fail_icmp4() - Construct and send ICMP to local peer
+ * @c: Execution context
+ * @ee: Extended error descriptor, source of the ICMP type and code
+ * @toside: Flowside whose addresses and ports are used for the message
+ * @in: First bytes (max 8) of original UDP message body
+ * @dlen: Length of the read part of original UDP message body (must be <= 8)
+ */
+static void udp_send_conn_fail_icmp4(const struct ctx *c,
+ const struct sock_extended_err *ee,
+ const struct flowside *toside,
+ const void *in, size_t dlen)
+{
+ struct in_addr oaddr = toside->oaddr.v4mapped.a4;
+ struct in_addr eaddr = toside->eaddr.v4mapped.a4;
+ in_port_t eport = toside->eport;
+ in_port_t oport = toside->oport;
+ struct {
+ struct icmphdr icmp4h;
+ struct iphdr ip4h;
+ struct udphdr uh;
+ char data[8];
+ } __attribute__((packed, aligned(__alignof__(max_align_t)))) msg;
+ size_t msglen = sizeof(msg) - sizeof(msg.data) + dlen;
+
+ memset(&msg, 0, sizeof(msg));
+ msg.icmp4h.type = ee->ee_type;
+ msg.icmp4h.code = ee->ee_code;
+
+ /* Rebuild the original headers; IP length covers UDP header + payload */
+ tap_push_ip4h(&msg.ip4h, eaddr, oaddr, dlen + sizeof(msg.uh), IPPROTO_UDP);
+ tap_push_uh4(&msg.uh, eaddr, eport, oaddr, oport, in, dlen);
+ memcpy(&msg.data, in, dlen);
+
+ tap_icmp4_send(c, oaddr, eaddr, &msg, msglen);
+}
+
/**
* udp_sock_recverr() - Receive and clear an error from a socket
- * @s: Socket to receive from
+ * @c: Execution context
+ * @ref: epoll reference
+ * @now: Current timestamp
*
* Return: 1 if error received and processed, 0 if no more errors in queue, < 0
* if there was an error reading the queue
*
* #syscalls recvmsg
*/
-static int udp_sock_recverr(int s)
+static int udp_sock_recverr(const struct ctx *c, union epoll_ref ref,
+ const struct timespec *now)
{
const struct sock_extended_err *ee;
const struct cmsghdr *hdr;
+ union sockaddr_inany saddr;
char buf[CMSG_SPACE(sizeof(*ee))];
+ char udp_data[8];
+ int s = ref.fd;
+ struct iovec iov = {
+ .iov_base = udp_data,
+ .iov_len = sizeof(udp_data)
+ };
struct msghdr mh = {
- .msg_name = NULL,
- .msg_namelen = 0,
- .msg_iov = NULL,
- .msg_iovlen = 0,
+ .msg_name = &saddr,
+ .msg_namelen = sizeof(saddr),
+ .msg_iov = &iov,
+ .msg_iovlen = 1,
.msg_control = buf,
.msg_controllen = sizeof(buf),
};
@@ -450,8 +498,27 @@ static int udp_sock_recverr(int s)
}
ee = (const struct sock_extended_err *)CMSG_DATA(hdr);
-
- /* TODO: When possible propagate and otherwise handle errors */
+ if (ee->ee_type == ICMP_DEST_UNREACH) {
+ flow_sidx_t sidx;
+ struct udp_flow *flow;
+ const struct flowside *toside;
+
+ if (ref.type == EPOLL_TYPE_UDP_LISTEN) {
+ sidx = flow_lookup_sa(c, IPPROTO_UDP, ref.udp.pif,
+ &saddr, ref.udp.port);
+ flow = udp_at_sidx(sidx);
+ if (!flow) {
+ err("Unexpected cmsg reading error queue");
+ return -1;
+ }
+ flow->ts = now->tv_sec;
+ sidx = flow_sidx_opposite(sidx);
+ } else {
+ sidx = flow_sidx_opposite(ref.flowside);
+ }
+ toside = flowside_at_sidx(sidx);
+ udp_send_conn_fail_icmp4(c, ee, toside, udp_data, rc);
+ }
debug("%s error on UDP socket %i: %s",
str_ee_origin(ee), s, strerror_(ee->ee_errno));
@@ -461,15 +528,18 @@ static int udp_sock_recverr(int s)
/**
* udp_sock_errs() - Process errors on a socket
* @c: Execution context
- * @s: Socket to receive from
+ * @ref: epoll reference
* @events: epoll events bitmap
+ * @now: Current timestamp
*
* Return: Number of errors handled, or < 0 if we have an unrecoverable error
*/
-int udp_sock_errs(const struct ctx *c, int s, uint32_t events)
+int udp_sock_errs(const struct ctx *c, union epoll_ref ref, uint32_t events,
+ const struct timespec *now)
{
unsigned n_err = 0;
socklen_t errlen;
+ int s = ref.fd;
int rc, err;
ASSERT(!c->no_udp);
@@ -478,7 +548,7 @@ int udp_sock_errs(const struct ctx *c, int s, uint32_t events)
return 0; /* Nothing to do */
/* Empty the error queue */
- while ((rc = udp_sock_recverr(s)) > 0)
+ while ((rc = udp_sock_recverr(c, ref, now)) > 0)
n_err += rc;
if (rc < 0)
@@ -558,7 +628,7 @@ static void udp_buf_listen_sock_handler(const struct ctx *c,
const socklen_t sasize = sizeof(udp_meta[0].s_in);
int n, i;
- if (udp_sock_errs(c, ref.fd, events) < 0) {
+ if (udp_sock_errs(c, ref, events, now) < 0) {
err("UDP: Unrecoverable error on listening socket:"
" (%s port %hu)", pif_name(ref.udp.pif), ref.udp.port);
/* FIXME: what now? close/re-open socket? */
@@ -661,7 +731,7 @@ static void udp_buf_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
from_s = uflow->s[ref.flowside.sidei];
- if (udp_sock_errs(c, from_s, events) < 0) {
+ if (udp_sock_errs(c, ref, events, now) < 0) {
flow_err(uflow, "Unrecoverable error on reply socket");
flow_err_details(uflow);
udp_flow_close(c, uflow);
diff --git a/udp_internal.h b/udp_internal.h
index cc80e30..c5f8304 100644
--- a/udp_internal.h
+++ b/udp_internal.h
@@ -30,5 +30,6 @@ size_t udp_update_hdr4(struct iphdr *ip4h, struct udp_payload_t *bp,
size_t udp_update_hdr6(struct ipv6hdr *ip6h, struct udp_payload_t *bp,
const struct flowside *toside, size_t dlen,
bool no_udp_csum);
-int udp_sock_errs(const struct ctx *c, int s, uint32_t events);
+int udp_sock_errs(const struct ctx *c, union epoll_ref ref, uint32_t events,
+ const struct timespec *now);
#endif /* UDP_INTERNAL_H */
diff --git a/udp_vu.c b/udp_vu.c
index 4123510..0b1d3c6 100644
--- a/udp_vu.c
+++ b/udp_vu.c
@@ -227,7 +227,7 @@ void udp_vu_listen_sock_handler(const struct ctx *c, union epoll_ref ref,
struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
int i;
- if (udp_sock_errs(c, ref.fd, events) < 0) {
+ if (udp_sock_errs(c, ref, events, now) < 0) {
err("UDP: Unrecoverable error on listening socket:"
" (%s port %hu)", pif_name(ref.udp.pif), ref.udp.port);
return;
@@ -302,7 +302,7 @@ void udp_vu_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
ASSERT(!c->no_udp);
- if (udp_sock_errs(c, from_s, events) < 0) {
+ if (udp_sock_errs(c, ref, events, now) < 0) {
flow_err(uflow, "Unrecoverable error on reply socket");
flow_err_details(uflow);
udp_flow_close(c, uflow);
--
@@ -227,7 +227,7 @@ void udp_vu_listen_sock_handler(const struct ctx *c, union epoll_ref ref,
struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE];
int i;
- if (udp_sock_errs(c, ref.fd, events) < 0) {
+ if (udp_sock_errs(c, ref, events, now) < 0) {
err("UDP: Unrecoverable error on listening socket:"
" (%s port %hu)", pif_name(ref.udp.pif), ref.udp.port);
return;
@@ -302,7 +302,7 @@ void udp_vu_reply_sock_handler(const struct ctx *c, union epoll_ref ref,
ASSERT(!c->no_udp);
- if (udp_sock_errs(c, from_s, events) < 0) {
+ if (udp_sock_errs(c, ref, events, now) < 0) {
flow_err(uflow, "Unrecoverable error on reply socket");
flow_err_details(uflow);
udp_flow_close(c, uflow);
--
2.48.1
next prev parent reply other threads:[~2025-02-19 19:30 UTC|newest]
Thread overview: 8+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-02-19 19:30 [PATCH v3 0/2] Reconstruct ICMP headers for failed UDP connect Jon Maloy
2025-02-19 19:30 ` [PATCH v3 1/2] tap: break out building of udp header from tap_udp4_send function Jon Maloy
2025-02-20 1:08 ` David Gibson
2025-02-19 19:30 ` Jon Maloy [this message]
2025-02-20 3:13 ` [PATCH v3 2/2] udp: create and send ICMPv4 to local peer when applicable David Gibson
2025-02-20 3:47 ` [PATCH v3 0/2] Reconstruct ICMP headers for failed UDP connect David Gibson
2025-02-20 16:08 ` Jon Maloy
2025-02-21 2:25 ` David Gibson
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20250219193007.2336670-3-jmaloy@redhat.com \
--to=jmaloy@redhat.com \
--cc=dgibson@redhat.com \
--cc=lvivier@redhat.com \
--cc=passt-dev@passt.top \
--cc=sbrivio@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
Code repositories for project(s) associated with this public inbox
https://passt.top/passt
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for IMAP folder(s).