* [PATCH v2 1/8] udp: Move sending pasta tap frames to the end of udp_sock_handler()
2022-12-15 1:30 [PATCH v2 0/8] Don't use additional sockets for receiving "spliced" UDP communications David Gibson
@ 2022-12-15 1:30 ` David Gibson
2022-12-15 1:30 ` [PATCH v2 2/8] udp: Split sending to passt tap interface into separate function David Gibson
` (7 subsequent siblings)
8 siblings, 0 replies; 10+ messages in thread
From: David Gibson @ 2022-12-15 1:30 UTC (permalink / raw)
To: passt-dev, Stefano Brivio; +Cc: David Gibson
udp_sock_handler() has a surprising difference in flow between pasta and
passt mode: For pasta we send each frame to the tap interface as we prepare
it. For passt, though, we prepare all the frames, then send them with a
single sendmmsg().
Alter the pasta path to also prepare all the frames, then send them at the
end. We already have a suitable data structure for the passt case. This
will make it easier to abstract out the tap backend difference in future.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
udp.c | 61 ++++++++++++++++++++++++++++++++++++++++-------------------
1 file changed, 42 insertions(+), 19 deletions(-)
diff --git a/udp.c b/udp.c
index 4610a02..a27fe88 100644
--- a/udp.c
+++ b/udp.c
@@ -783,6 +783,35 @@ static size_t udp_update_hdr6(const struct ctx *c, int n, in_port_t dstport,
return buf_len;
}
+/**
+ * udp_tap_send_pasta() - Send datagrams to the pasta tap interface
+ * @c: Execution context
+ * @mmh: Array of message headers to send
+ * @n: Number of message headers to send
+ *
+ * #syscalls:pasta write
+ */
+static void udp_tap_send_pasta(const struct ctx *c, struct mmsghdr *mmh,
+ unsigned int n)
+{
+ unsigned int i, j;
+
+ for (i = 0; i < n; i++) {
+ for (j = 0; j < mmh[i].msg_hdr.msg_iovlen; j++) {
+ struct iovec *iov = &mmh[i].msg_hdr.msg_iov[j];
+
+ /* We can't use writev() because the tap
+ * character device relies on the write()
+ * boundaries to discern frame boundaries
+ */
+ if (write(c->fd_tap, iov->iov_base, iov->iov_len) < 0)
+ debug("tap write: %s", strerror(errno));
+ else
+ pcap(iov->iov_base, iov->iov_len);
+ }
+ }
+}
+
/**
* udp_sock_handler() - Handle new data from socket
* @c: Execution context
@@ -835,31 +864,25 @@ void udp_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events,
else
buf_len = udp_update_hdr4(c, i, dstport, now);
- if (c->mode == MODE_PASTA) {
- void *frame = tap_iov[i].iov_base;
+ tap_iov[i].iov_len = buf_len;
- if (write(c->fd_tap, frame, buf_len) < 0)
- debug("tap write: %s", strerror(errno));
- pcap(frame, buf_len);
- } else {
- tap_iov[i].iov_len = buf_len;
-
- /* With bigger messages, qemu closes the connection. */
- if (msg_bufs && msg_len + buf_len > SHRT_MAX) {
- tap_mmh[msg_i].msg_hdr.msg_iovlen = msg_bufs;
- msg_i++;
- tap_mmh[msg_i].msg_hdr.msg_iov = &tap_iov[i];
- msg_len = msg_bufs = 0;
- }
-
- msg_len += buf_len;
- msg_bufs++;
+ /* With bigger messages, qemu closes the connection. */
+ if (c->mode == MODE_PASST && msg_bufs &&
+ msg_len + buf_len > SHRT_MAX) {
+ tap_mmh[msg_i].msg_hdr.msg_iovlen = msg_bufs;
+ msg_i++;
+ tap_mmh[msg_i].msg_hdr.msg_iov = &tap_iov[i];
+ msg_len = msg_bufs = 0;
}
+ msg_len += buf_len;
+ msg_bufs++;
}
tap_mmh[msg_i].msg_hdr.msg_iovlen = msg_bufs;
- if (c->mode == MODE_PASTA)
+ if (c->mode == MODE_PASTA) {
+ udp_tap_send_pasta(c, tap_mmh, msg_i + 1);
return;
+ }
ret = sendmmsg(c->fd_tap, tap_mmh, msg_i + 1,
MSG_NOSIGNAL | MSG_DONTWAIT);
--
@@ -783,6 +783,35 @@ static size_t udp_update_hdr6(const struct ctx *c, int n, in_port_t dstport,
return buf_len;
}
+/**
+ * udp_tap_send_pasta() - Send datagrams to the pasta tap interface
+ * @c: Execution context
+ * @mmh: Array of message headers to send
+ * @n: Number of message headers to send
+ *
+ * #syscalls:pasta write
+ */
+static void udp_tap_send_pasta(const struct ctx *c, struct mmsghdr *mmh,
+ unsigned int n)
+{
+ unsigned int i, j;
+
+ for (i = 0; i < n; i++) {
+ for (j = 0; j < mmh[i].msg_hdr.msg_iovlen; j++) {
+ struct iovec *iov = &mmh[i].msg_hdr.msg_iov[j];
+
+ /* We can't use writev() because the tap
+ * character device relies on the write()
+ * boundaries to discern frame boundaries
+ */
+ if (write(c->fd_tap, iov->iov_base, iov->iov_len) < 0)
+ debug("tap write: %s", strerror(errno));
+ else
+ pcap(iov->iov_base, iov->iov_len);
+ }
+ }
+}
+
/**
* udp_sock_handler() - Handle new data from socket
* @c: Execution context
@@ -835,31 +864,25 @@ void udp_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events,
else
buf_len = udp_update_hdr4(c, i, dstport, now);
- if (c->mode == MODE_PASTA) {
- void *frame = tap_iov[i].iov_base;
+ tap_iov[i].iov_len = buf_len;
- if (write(c->fd_tap, frame, buf_len) < 0)
- debug("tap write: %s", strerror(errno));
- pcap(frame, buf_len);
- } else {
- tap_iov[i].iov_len = buf_len;
-
- /* With bigger messages, qemu closes the connection. */
- if (msg_bufs && msg_len + buf_len > SHRT_MAX) {
- tap_mmh[msg_i].msg_hdr.msg_iovlen = msg_bufs;
- msg_i++;
- tap_mmh[msg_i].msg_hdr.msg_iov = &tap_iov[i];
- msg_len = msg_bufs = 0;
- }
-
- msg_len += buf_len;
- msg_bufs++;
+ /* With bigger messages, qemu closes the connection. */
+ if (c->mode == MODE_PASST && msg_bufs &&
+ msg_len + buf_len > SHRT_MAX) {
+ tap_mmh[msg_i].msg_hdr.msg_iovlen = msg_bufs;
+ msg_i++;
+ tap_mmh[msg_i].msg_hdr.msg_iov = &tap_iov[i];
+ msg_len = msg_bufs = 0;
}
+ msg_len += buf_len;
+ msg_bufs++;
}
tap_mmh[msg_i].msg_hdr.msg_iovlen = msg_bufs;
- if (c->mode == MODE_PASTA)
+ if (c->mode == MODE_PASTA) {
+ udp_tap_send_pasta(c, tap_mmh, msg_i + 1);
return;
+ }
ret = sendmmsg(c->fd_tap, tap_mmh, msg_i + 1,
MSG_NOSIGNAL | MSG_DONTWAIT);
--
2.38.1
^ permalink raw reply related [flat|nested] 10+ messages in thread
* [PATCH v2 2/8] udp: Split sending to passt tap interface into separate function
2022-12-15 1:30 [PATCH v2 0/8] Don't use additional sockets for receiving "spliced" UDP communications David Gibson
2022-12-15 1:30 ` [PATCH v2 1/8] udp: Move sending pasta tap frames to the end of udp_sock_handler() David Gibson
@ 2022-12-15 1:30 ` David Gibson
2022-12-15 1:30 ` [PATCH v2 3/8] udp: Split receive from preparation and send in udp_sock_handler() David Gibson
` (6 subsequent siblings)
8 siblings, 0 replies; 10+ messages in thread
From: David Gibson @ 2022-12-15 1:30 UTC (permalink / raw)
To: passt-dev, Stefano Brivio; +Cc: David Gibson
The last part of udp_sock_handler() does the actual sending of frames
to the tap interface. For pasta that's just a call to
udp_tap_send_pasta() but for passt, it's moderately complex and open
coded.
For symmetry, move the passt send path into its own function,
udp_tap_send_passt(). This will make it easier to abstract the tap
interface in future (e.g. when we want to add vhost-user).
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
udp.c | 130 ++++++++++++++++++++++++++++++++--------------------------
1 file changed, 72 insertions(+), 58 deletions(-)
diff --git a/udp.c b/udp.c
index a27fe88..7281bc3 100644
--- a/udp.c
+++ b/udp.c
@@ -812,6 +812,73 @@ static void udp_tap_send_pasta(const struct ctx *c, struct mmsghdr *mmh,
}
}
+/**
+ * udp_tap_send_passt() - Send datagrams to the passt tap interface
+ * @c: Execution context
+ * @mmh: Array of message headers to send
+ * @n: Number of message headers to send
+ *
+ * #syscalls:passt sendmmsg sendmsg
+ */
+static void udp_tap_send_passt(const struct ctx *c, struct mmsghdr *mmh, int n)
+{
+ struct msghdr *last_mh;
+ ssize_t missing = 0;
+ size_t msg_len = 0;
+ unsigned int i;
+ int ret;
+
+ ret = sendmmsg(c->fd_tap, mmh, n, MSG_NOSIGNAL | MSG_DONTWAIT);
+ if (ret <= 0)
+ return;
+
+ /* If we lose some messages to sendmmsg() here, fine, it's UDP. However,
+ * the last message needs to be delivered completely, otherwise qemu
+ * will fail to reassemble the next message and close the connection. Go
+ * through headers from the last sent message, counting bytes, and, if
+ * and as soon as we see more bytes than sendmmsg() sent, re-send the
+ * rest with a blocking call.
+ *
+ * In pictures, given this example:
+ *
+ * iov #0 iov #1 iov #2 iov #3
+ * tap_mmh[ret - 1].msg_hdr: .... ...... ..... ......
+ * tap_mmh[ret - 1].msg_len: 7 .... ...
+ *
+ * when 'msglen' reaches: 10 ^
+ * and 'missing' below is: 3 ---
+ *
+ * re-send everything from here: ^-- ----- ------
+ */
+ last_mh = &mmh[ret - 1].msg_hdr;
+ for (i = 0; i < last_mh->msg_iovlen; i++) {
+ if (missing <= 0) {
+ msg_len += last_mh->msg_iov[i].iov_len;
+ missing = msg_len - mmh[ret - 1].msg_len;
+ }
+
+ if (missing > 0) {
+ uint8_t **iov_base;
+ int first_offset;
+
+ iov_base = (uint8_t **)&last_mh->msg_iov[i].iov_base;
+ first_offset = last_mh->msg_iov[i].iov_len - missing;
+ *iov_base += first_offset;
+ last_mh->msg_iov[i].iov_len = missing;
+
+ last_mh->msg_iov = &last_mh->msg_iov[i];
+
+ if (sendmsg(c->fd_tap, last_mh, MSG_NOSIGNAL) < 0)
+ debug("UDP: %li bytes to tap missing", missing);
+
+ *iov_base -= first_offset;
+ break;
+ }
+ }
+
+ pcapmm(mmh, ret);
+}
+
/**
* udp_sock_handler() - Handle new data from socket
* @c: Execution context
@@ -820,16 +887,14 @@ static void udp_tap_send_pasta(const struct ctx *c, struct mmsghdr *mmh,
* @now: Current timestamp
*
* #syscalls recvmmsg
- * #syscalls:passt sendmmsg sendmsg
*/
void udp_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events,
const struct timespec *now)
{
in_port_t dstport = ref.r.p.udp.udp.port;
- ssize_t n, msg_len = 0, missing = 0;
struct mmsghdr *tap_mmh, *sock_mmh;
- int msg_bufs = 0, msg_i = 0, ret;
- struct msghdr *last_mh;
+ int msg_bufs = 0, msg_i = 0;
+ ssize_t n, msg_len = 0;
struct iovec *tap_iov;
unsigned int i;
@@ -879,61 +944,10 @@ void udp_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events,
}
tap_mmh[msg_i].msg_hdr.msg_iovlen = msg_bufs;
- if (c->mode == MODE_PASTA) {
+ if (c->mode == MODE_PASTA)
udp_tap_send_pasta(c, tap_mmh, msg_i + 1);
- return;
- }
-
- ret = sendmmsg(c->fd_tap, tap_mmh, msg_i + 1,
- MSG_NOSIGNAL | MSG_DONTWAIT);
- if (ret <= 0)
- return;
-
- /* If we lose some messages to sendmmsg() here, fine, it's UDP. However,
- * the last message needs to be delivered completely, otherwise qemu
- * will fail to reassemble the next message and close the connection. Go
- * through headers from the last sent message, counting bytes, and, if
- * and as soon as we see more bytes than sendmmsg() sent, re-send the
- * rest with a blocking call.
- *
- * In pictures, given this example:
- *
- * iov #0 iov #1 iov #2 iov #3
- * tap_mmh[ret - 1].msg_hdr: .... ...... ..... ......
- * tap_mmh[ret - 1].msg_len: 7 .... ...
- *
- * when 'msglen' reaches: 10 ^
- * and 'missing' below is: 3 ---
- *
- * re-send everything from here: ^-- ----- ------
- */
- last_mh = &tap_mmh[ret - 1].msg_hdr;
- for (i = 0, msg_len = 0; i < last_mh->msg_iovlen; i++) {
- if (missing <= 0) {
- msg_len += last_mh->msg_iov[i].iov_len;
- missing = msg_len - tap_mmh[ret - 1].msg_len;
- }
-
- if (missing > 0) {
- uint8_t **iov_base;
- int first_offset;
-
- iov_base = (uint8_t **)&last_mh->msg_iov[i].iov_base;
- first_offset = last_mh->msg_iov[i].iov_len - missing;
- *iov_base += first_offset;
- last_mh->msg_iov[i].iov_len = missing;
-
- last_mh->msg_iov = &last_mh->msg_iov[i];
-
- if (sendmsg(c->fd_tap, last_mh, MSG_NOSIGNAL) < 0)
- debug("UDP: %li bytes to tap missing", missing);
-
- *iov_base -= first_offset;
- break;
- }
- }
-
- pcapmm(tap_mmh, ret);
+ else
+ udp_tap_send_passt(c, tap_mmh, msg_i + 1);
}
/**
--
@@ -812,6 +812,73 @@ static void udp_tap_send_pasta(const struct ctx *c, struct mmsghdr *mmh,
}
}
+/**
+ * udp_tap_send_passt() - Send datagrams to the passt tap interface
+ * @c: Execution context
+ * @mmh: Array of message headers to send
+ * @n: Number of message headers to send
+ *
+ * #syscalls:passt sendmmsg sendmsg
+ */
+static void udp_tap_send_passt(const struct ctx *c, struct mmsghdr *mmh, int n)
+{
+ struct msghdr *last_mh;
+ ssize_t missing = 0;
+ size_t msg_len = 0;
+ unsigned int i;
+ int ret;
+
+ ret = sendmmsg(c->fd_tap, mmh, n, MSG_NOSIGNAL | MSG_DONTWAIT);
+ if (ret <= 0)
+ return;
+
+ /* If we lose some messages to sendmmsg() here, fine, it's UDP. However,
+ * the last message needs to be delivered completely, otherwise qemu
+ * will fail to reassemble the next message and close the connection. Go
+ * through headers from the last sent message, counting bytes, and, if
+ * and as soon as we see more bytes than sendmmsg() sent, re-send the
+ * rest with a blocking call.
+ *
+ * In pictures, given this example:
+ *
+ * iov #0 iov #1 iov #2 iov #3
+ * tap_mmh[ret - 1].msg_hdr: .... ...... ..... ......
+ * tap_mmh[ret - 1].msg_len: 7 .... ...
+ *
+ * when 'msglen' reaches: 10 ^
+ * and 'missing' below is: 3 ---
+ *
+ * re-send everything from here: ^-- ----- ------
+ */
+ last_mh = &mmh[ret - 1].msg_hdr;
+ for (i = 0; i < last_mh->msg_iovlen; i++) {
+ if (missing <= 0) {
+ msg_len += last_mh->msg_iov[i].iov_len;
+ missing = msg_len - mmh[ret - 1].msg_len;
+ }
+
+ if (missing > 0) {
+ uint8_t **iov_base;
+ int first_offset;
+
+ iov_base = (uint8_t **)&last_mh->msg_iov[i].iov_base;
+ first_offset = last_mh->msg_iov[i].iov_len - missing;
+ *iov_base += first_offset;
+ last_mh->msg_iov[i].iov_len = missing;
+
+ last_mh->msg_iov = &last_mh->msg_iov[i];
+
+ if (sendmsg(c->fd_tap, last_mh, MSG_NOSIGNAL) < 0)
+ debug("UDP: %li bytes to tap missing", missing);
+
+ *iov_base -= first_offset;
+ break;
+ }
+ }
+
+ pcapmm(mmh, ret);
+}
+
/**
* udp_sock_handler() - Handle new data from socket
* @c: Execution context
@@ -820,16 +887,14 @@ static void udp_tap_send_pasta(const struct ctx *c, struct mmsghdr *mmh,
* @now: Current timestamp
*
* #syscalls recvmmsg
- * #syscalls:passt sendmmsg sendmsg
*/
void udp_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events,
const struct timespec *now)
{
in_port_t dstport = ref.r.p.udp.udp.port;
- ssize_t n, msg_len = 0, missing = 0;
struct mmsghdr *tap_mmh, *sock_mmh;
- int msg_bufs = 0, msg_i = 0, ret;
- struct msghdr *last_mh;
+ int msg_bufs = 0, msg_i = 0;
+ ssize_t n, msg_len = 0;
struct iovec *tap_iov;
unsigned int i;
@@ -879,61 +944,10 @@ void udp_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events,
}
tap_mmh[msg_i].msg_hdr.msg_iovlen = msg_bufs;
- if (c->mode == MODE_PASTA) {
+ if (c->mode == MODE_PASTA)
udp_tap_send_pasta(c, tap_mmh, msg_i + 1);
- return;
- }
-
- ret = sendmmsg(c->fd_tap, tap_mmh, msg_i + 1,
- MSG_NOSIGNAL | MSG_DONTWAIT);
- if (ret <= 0)
- return;
-
- /* If we lose some messages to sendmmsg() here, fine, it's UDP. However,
- * the last message needs to be delivered completely, otherwise qemu
- * will fail to reassemble the next message and close the connection. Go
- * through headers from the last sent message, counting bytes, and, if
- * and as soon as we see more bytes than sendmmsg() sent, re-send the
- * rest with a blocking call.
- *
- * In pictures, given this example:
- *
- * iov #0 iov #1 iov #2 iov #3
- * tap_mmh[ret - 1].msg_hdr: .... ...... ..... ......
- * tap_mmh[ret - 1].msg_len: 7 .... ...
- *
- * when 'msglen' reaches: 10 ^
- * and 'missing' below is: 3 ---
- *
- * re-send everything from here: ^-- ----- ------
- */
- last_mh = &tap_mmh[ret - 1].msg_hdr;
- for (i = 0, msg_len = 0; i < last_mh->msg_iovlen; i++) {
- if (missing <= 0) {
- msg_len += last_mh->msg_iov[i].iov_len;
- missing = msg_len - tap_mmh[ret - 1].msg_len;
- }
-
- if (missing > 0) {
- uint8_t **iov_base;
- int first_offset;
-
- iov_base = (uint8_t **)&last_mh->msg_iov[i].iov_base;
- first_offset = last_mh->msg_iov[i].iov_len - missing;
- *iov_base += first_offset;
- last_mh->msg_iov[i].iov_len = missing;
-
- last_mh->msg_iov = &last_mh->msg_iov[i];
-
- if (sendmsg(c->fd_tap, last_mh, MSG_NOSIGNAL) < 0)
- debug("UDP: %li bytes to tap missing", missing);
-
- *iov_base -= first_offset;
- break;
- }
- }
-
- pcapmm(tap_mmh, ret);
+ else
+ udp_tap_send_passt(c, tap_mmh, msg_i + 1);
}
/**
--
2.38.1
^ permalink raw reply related [flat|nested] 10+ messages in thread
* [PATCH v2 3/8] udp: Split receive from preparation and send in udp_sock_handler()
2022-12-15 1:30 [PATCH v2 0/8] Don't use additional sockets for receiving "spliced" UDP communications David Gibson
2022-12-15 1:30 ` [PATCH v2 1/8] udp: Move sending pasta tap frames to the end of udp_sock_handler() David Gibson
2022-12-15 1:30 ` [PATCH v2 2/8] udp: Split sending to passt tap interface into separate function David Gibson
@ 2022-12-15 1:30 ` David Gibson
2022-12-15 1:30 ` [PATCH v2 4/8] udp: Receive multiple datagrams at once on the pasta sock->tap path David Gibson
` (5 subsequent siblings)
8 siblings, 0 replies; 10+ messages in thread
From: David Gibson @ 2022-12-15 1:30 UTC (permalink / raw)
To: passt-dev, Stefano Brivio; +Cc: David Gibson
The receive part of udp_sock_handler() and udp_sock_handler_splice() is now
almost identical. In preparation for merging that, split the receive part
of udp_sock_handler() from the part preparing and sending the frames for
sending on the tap interface. The latter goes into a new udp_tap_send()
function.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
udp.c | 79 +++++++++++++++++++++++++++++++++++++++--------------------
1 file changed, 52 insertions(+), 27 deletions(-)
diff --git a/udp.c b/udp.c
index 7281bc3..64c9219 100644
--- a/udp.c
+++ b/udp.c
@@ -880,51 +880,39 @@ static void udp_tap_send_passt(const struct ctx *c, struct mmsghdr *mmh, int n)
}
/**
- * udp_sock_handler() - Handle new data from socket
+ * udp_tap_send() - Prepare UDP datagrams and send to tap interface
* @c: Execution context
- * @ref: epoll reference
- * @events: epoll events bitmap
+ * @start: Index of first datagram in udp[46]_l2_buf pool
+ * @n: Number of datagrams to send
+ * @dstport: Destination port number
+ * @v6: True if using IPv6
* @now: Current timestamp
*
- * #syscalls recvmmsg
+ * Return: size of tap frame with headers
*/
-void udp_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events,
- const struct timespec *now)
+static void udp_tap_send(const struct ctx *c,
+ unsigned int start, unsigned int n,
+ in_port_t dstport, bool v6, const struct timespec *now)
{
- in_port_t dstport = ref.r.p.udp.udp.port;
- struct mmsghdr *tap_mmh, *sock_mmh;
int msg_bufs = 0, msg_i = 0;
- ssize_t n, msg_len = 0;
+ struct mmsghdr *tap_mmh;
struct iovec *tap_iov;
+ ssize_t msg_len = 0;
unsigned int i;
- if (events == EPOLLERR)
- return;
-
- if (ref.r.p.udp.udp.splice) {
- udp_sock_handler_splice(c, ref, events, now);
- return;
- }
-
- if (ref.r.p.udp.udp.v6) {
+ if (v6) {
tap_mmh = udp6_l2_mh_tap;
- sock_mmh = udp6_l2_mh_sock;
tap_iov = udp6_l2_iov_tap;
} else {
tap_mmh = udp4_l2_mh_tap;
- sock_mmh = udp4_l2_mh_sock;
tap_iov = udp4_l2_iov_tap;
}
- n = recvmmsg(ref.r.s, sock_mmh, UDP_TAP_FRAMES, 0, NULL);
- if (n <= 0)
- return;
-
- tap_mmh[0].msg_hdr.msg_iov = &tap_iov[0];
- for (i = 0; i < (unsigned)n; i++) {
+ tap_mmh[0].msg_hdr.msg_iov = &tap_iov[start];
+ for (i = start; i < start + n; i++) {
size_t buf_len;
- if (ref.r.p.udp.udp.v6)
+ if (v6)
buf_len = udp_update_hdr6(c, i, dstport, now);
else
buf_len = udp_update_hdr4(c, i, dstport, now);
@@ -950,6 +938,43 @@ void udp_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events,
udp_tap_send_passt(c, tap_mmh, msg_i + 1);
}
+/**
+ * udp_sock_handler() - Handle new data from socket
+ * @c: Execution context
+ * @ref: epoll reference
+ * @events: epoll events bitmap
+ * @now: Current timestamp
+ *
+ * #syscalls recvmmsg
+ */
+void udp_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events,
+ const struct timespec *now)
+{
+ in_port_t dstport = ref.r.p.udp.udp.port;
+ bool v6 = ref.r.p.udp.udp.v6;
+ struct mmsghdr *sock_mmh;
+ ssize_t n;
+
+ if (events == EPOLLERR)
+ return;
+
+ if (ref.r.p.udp.udp.splice) {
+ udp_sock_handler_splice(c, ref, events, now);
+ return;
+ }
+
+ if (ref.r.p.udp.udp.v6)
+ sock_mmh = udp6_l2_mh_sock;
+ else
+ sock_mmh = udp4_l2_mh_sock;
+
+ n = recvmmsg(ref.r.s, sock_mmh, UDP_TAP_FRAMES, 0, NULL);
+ if (n <= 0)
+ return;
+
+ udp_tap_send(c, 0, n, dstport, v6, now);
+}
+
/**
* udp_tap_handler() - Handle packets from tap
* @c: Execution context
--
@@ -880,51 +880,39 @@ static void udp_tap_send_passt(const struct ctx *c, struct mmsghdr *mmh, int n)
}
/**
- * udp_sock_handler() - Handle new data from socket
+ * udp_tap_send() - Prepare UDP datagrams and send to tap interface
* @c: Execution context
- * @ref: epoll reference
- * @events: epoll events bitmap
+ * @start: Index of first datagram in udp[46]_l2_buf pool
+ * @n: Number of datagrams to send
+ * @dstport: Destination port number
+ * @v6: True if using IPv6
* @now: Current timestamp
*
- * #syscalls recvmmsg
+ * Return: size of tap frame with headers
*/
-void udp_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events,
- const struct timespec *now)
+static void udp_tap_send(const struct ctx *c,
+ unsigned int start, unsigned int n,
+ in_port_t dstport, bool v6, const struct timespec *now)
{
- in_port_t dstport = ref.r.p.udp.udp.port;
- struct mmsghdr *tap_mmh, *sock_mmh;
int msg_bufs = 0, msg_i = 0;
- ssize_t n, msg_len = 0;
+ struct mmsghdr *tap_mmh;
struct iovec *tap_iov;
+ ssize_t msg_len = 0;
unsigned int i;
- if (events == EPOLLERR)
- return;
-
- if (ref.r.p.udp.udp.splice) {
- udp_sock_handler_splice(c, ref, events, now);
- return;
- }
-
- if (ref.r.p.udp.udp.v6) {
+ if (v6) {
tap_mmh = udp6_l2_mh_tap;
- sock_mmh = udp6_l2_mh_sock;
tap_iov = udp6_l2_iov_tap;
} else {
tap_mmh = udp4_l2_mh_tap;
- sock_mmh = udp4_l2_mh_sock;
tap_iov = udp4_l2_iov_tap;
}
- n = recvmmsg(ref.r.s, sock_mmh, UDP_TAP_FRAMES, 0, NULL);
- if (n <= 0)
- return;
-
- tap_mmh[0].msg_hdr.msg_iov = &tap_iov[0];
- for (i = 0; i < (unsigned)n; i++) {
+ tap_mmh[0].msg_hdr.msg_iov = &tap_iov[start];
+ for (i = start; i < start + n; i++) {
size_t buf_len;
- if (ref.r.p.udp.udp.v6)
+ if (v6)
buf_len = udp_update_hdr6(c, i, dstport, now);
else
buf_len = udp_update_hdr4(c, i, dstport, now);
@@ -950,6 +938,43 @@ void udp_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events,
udp_tap_send_passt(c, tap_mmh, msg_i + 1);
}
+/**
+ * udp_sock_handler() - Handle new data from socket
+ * @c: Execution context
+ * @ref: epoll reference
+ * @events: epoll events bitmap
+ * @now: Current timestamp
+ *
+ * #syscalls recvmmsg
+ */
+void udp_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events,
+ const struct timespec *now)
+{
+ in_port_t dstport = ref.r.p.udp.udp.port;
+ bool v6 = ref.r.p.udp.udp.v6;
+ struct mmsghdr *sock_mmh;
+ ssize_t n;
+
+ if (events == EPOLLERR)
+ return;
+
+ if (ref.r.p.udp.udp.splice) {
+ udp_sock_handler_splice(c, ref, events, now);
+ return;
+ }
+
+ if (ref.r.p.udp.udp.v6)
+ sock_mmh = udp6_l2_mh_sock;
+ else
+ sock_mmh = udp4_l2_mh_sock;
+
+ n = recvmmsg(ref.r.s, sock_mmh, UDP_TAP_FRAMES, 0, NULL);
+ if (n <= 0)
+ return;
+
+ udp_tap_send(c, 0, n, dstport, v6, now);
+}
+
/**
* udp_tap_handler() - Handle packets from tap
* @c: Execution context
--
2.38.1
^ permalink raw reply related [flat|nested] 10+ messages in thread
* [PATCH v2 4/8] udp: Receive multiple datagrams at once on the pasta sock->tap path
2022-12-15 1:30 [PATCH v2 0/8] Don't use additional sockets for receiving "spliced" UDP communications David Gibson
` (2 preceding siblings ...)
2022-12-15 1:30 ` [PATCH v2 3/8] udp: Split receive from preparation and send in udp_sock_handler() David Gibson
@ 2022-12-15 1:30 ` David Gibson
2022-12-15 1:30 ` [PATCH v2 5/8] udp: Pre-populate msg_names with local address David Gibson
` (4 subsequent siblings)
8 siblings, 0 replies; 10+ messages in thread
From: David Gibson @ 2022-12-15 1:30 UTC (permalink / raw)
To: passt-dev, Stefano Brivio; +Cc: David Gibson
Usually udp_sock_handler() will receive and forward multiple (up to
32) datagrams in udp_sock_handler(), then forward them all to the tap
interface. However, when in pasta mode we will only receive and
forward a single datagram at a time. Change it to receive multiple
datagrams at once, like the other paths.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
udp.c | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/udp.c b/udp.c
index 64c9219..24fa984 100644
--- a/udp.c
+++ b/udp.c
@@ -119,7 +119,6 @@
#define UDP_CONN_TIMEOUT 180 /* s, timeout for ephemeral or local bind */
#define UDP_MAX_FRAMES 32 /* max # of frames to receive at once */
-#define UDP_TAP_FRAMES (c->mode == MODE_PASST ? UDP_MAX_FRAMES : 1)
/**
* struct udp_tap_port - Port tracking based on tap-facing source port
@@ -968,7 +967,7 @@ void udp_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events,
else
sock_mmh = udp4_l2_mh_sock;
- n = recvmmsg(ref.r.s, sock_mmh, UDP_TAP_FRAMES, 0, NULL);
+ n = recvmmsg(ref.r.s, sock_mmh, UDP_MAX_FRAMES, 0, NULL);
if (n <= 0)
return;
--
@@ -119,7 +119,6 @@
#define UDP_CONN_TIMEOUT 180 /* s, timeout for ephemeral or local bind */
#define UDP_MAX_FRAMES 32 /* max # of frames to receive at once */
-#define UDP_TAP_FRAMES (c->mode == MODE_PASST ? UDP_MAX_FRAMES : 1)
/**
* struct udp_tap_port - Port tracking based on tap-facing source port
@@ -968,7 +967,7 @@ void udp_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events,
else
sock_mmh = udp4_l2_mh_sock;
- n = recvmmsg(ref.r.s, sock_mmh, UDP_TAP_FRAMES, 0, NULL);
+ n = recvmmsg(ref.r.s, sock_mmh, UDP_MAX_FRAMES, 0, NULL);
if (n <= 0)
return;
--
2.38.1
^ permalink raw reply related [flat|nested] 10+ messages in thread
* [PATCH v2 5/8] udp: Pre-populate msg_names with local address
2022-12-15 1:30 [PATCH v2 0/8] Don't use additional sockets for receiving "spliced" UDP communications David Gibson
` (3 preceding siblings ...)
2022-12-15 1:30 ` [PATCH v2 4/8] udp: Receive multiple datagrams at once on the pasta sock->tap path David Gibson
@ 2022-12-15 1:30 ` David Gibson
2022-12-15 1:30 ` [PATCH v2 6/8] udp: Unify udp_sock_handler_splice() with udp_sock_handler() David Gibson
` (3 subsequent siblings)
8 siblings, 0 replies; 10+ messages in thread
From: David Gibson @ 2022-12-15 1:30 UTC (permalink / raw)
To: passt-dev, Stefano Brivio; +Cc: David Gibson
udp_splice_namebuf is now used only for spliced sending, and so it is
only ever populated with the localhost address, either IPv4 or IPv6.
So, replace the awkward initialization in udp_sock_handler_splice()
with statically initialized versions for IPv4 and IPv6. We then just
need to update the port number in udp_sock_handler_splice().
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
udp.c | 40 ++++++++++++++++++----------------------
util.h | 7 +++++++
2 files changed, 25 insertions(+), 22 deletions(-)
diff --git a/udp.c b/udp.c
index 24fa984..98d8687 100644
--- a/udp.c
+++ b/udp.c
@@ -232,11 +232,18 @@ static struct mmsghdr udp4_l2_mh_tap [UDP_MAX_FRAMES];
static struct mmsghdr udp6_l2_mh_tap [UDP_MAX_FRAMES];
/* recvmmsg()/sendmmsg() data for "spliced" connections */
-static struct sockaddr_storage udp_splice_namebuf;
-
static struct iovec udp4_iov_splice [UDP_MAX_FRAMES];
static struct iovec udp6_iov_splice [UDP_MAX_FRAMES];
+static struct sockaddr_in udp4_localname = {
+ .sin_family = AF_INET,
+ .sin_addr = IN4ADDR_LOOPBACK_INIT,
+};
+static struct sockaddr_in6 udp6_localname = {
+ .sin6_family = AF_INET6,
+ .sin6_addr = IN6ADDR_LOOPBACK_INIT,
+};
+
static struct mmsghdr udp4_mh_splice [UDP_MAX_FRAMES];
static struct mmsghdr udp6_mh_splice [UDP_MAX_FRAMES];
@@ -610,23 +617,10 @@ static void udp_sock_handler_splice(const struct ctx *c, union epoll_ref ref,
if (n <= 0)
return;
- if (v6) {
- *((struct sockaddr_in6 *)&udp_splice_namebuf) =
- ((struct sockaddr_in6) {
- .sin6_family = AF_INET6,
- .sin6_addr = IN6ADDR_LOOPBACK_INIT,
- .sin6_port = htons(dst),
- .sin6_scope_id = 0,
- });
- } else {
- *((struct sockaddr_in *)&udp_splice_namebuf) =
- ((struct sockaddr_in) {
- .sin_family = AF_INET,
- .sin_addr = { .s_addr = htonl(INADDR_LOOPBACK) },
- .sin_port = htons(dst),
- .sin_zero = { 0 },
- });
- }
+ if (v6)
+ udp6_localname.sin6_port = htons(dst);
+ else
+ udp4_localname.sin_port = htons(dst);
for (i = 0; i < n; i += m) {
in_port_t src = sa_port(v6, mmh_recv[i].msg_hdr.msg_name);
@@ -1252,9 +1246,11 @@ static void udp_splice_iov_init(void)
struct msghdr *mh4 = &udp4_mh_splice[i].msg_hdr;
struct msghdr *mh6 = &udp6_mh_splice[i].msg_hdr;
- mh4->msg_name = mh6->msg_name = &udp_splice_namebuf;
- mh4->msg_namelen = sizeof(udp_splice_namebuf);
- mh6->msg_namelen = sizeof(udp_splice_namebuf);
+ mh4->msg_name = &udp4_localname;
+ mh4->msg_namelen = sizeof(udp4_localname);
+
+ mh6->msg_name = &udp6_localname;
+ mh6->msg_namelen = sizeof(udp6_localname);
udp4_iov_splice[i].iov_base = udp4_l2_buf[i].data;
udp6_iov_splice[i].iov_base = udp6_l2_buf[i].data;
diff --git a/util.h b/util.h
index 02b55a9..3baec91 100644
--- a/util.h
+++ b/util.h
@@ -79,6 +79,13 @@
(IN_MULTICAST(ntohl((a)->s_addr)))
#define IN4_ARE_ADDR_EQUAL(a, b) \
(((struct in_addr *)(a))->s_addr == ((struct in_addr *)b)->s_addr)
+#if __BYTE_ORDER == __BIG_ENDIAN
+#define IN4ADDR_LOOPBACK_INIT \
+ { .s_addr = INADDR_LOOPBACK }
+#else
+#define IN4ADDR_LOOPBACK_INIT \
+ { .s_addr = __bswap_constant_32(INADDR_LOOPBACK) }
+#endif
#define NS_FN_STACK_SIZE (RLIMIT_STACK_VAL * 1024 / 8)
int do_clone(int (*fn)(void *), char *stack_area, size_t stack_size, int flags,
--
@@ -79,6 +79,13 @@
(IN_MULTICAST(ntohl((a)->s_addr)))
#define IN4_ARE_ADDR_EQUAL(a, b) \
(((struct in_addr *)(a))->s_addr == ((struct in_addr *)b)->s_addr)
+#if __BYTE_ORDER == __BIG_ENDIAN
+#define IN4ADDR_LOOPBACK_INIT \
+ { .s_addr = INADDR_LOOPBACK }
+#else
+#define IN4ADDR_LOOPBACK_INIT \
+ { .s_addr = __bswap_constant_32(INADDR_LOOPBACK) }
+#endif
#define NS_FN_STACK_SIZE (RLIMIT_STACK_VAL * 1024 / 8)
int do_clone(int (*fn)(void *), char *stack_area, size_t stack_size, int flags,
--
2.38.1
^ permalink raw reply related [flat|nested] 10+ messages in thread
* [PATCH v2 6/8] udp: Unify udp_sock_handler_splice() with udp_sock_handler()
2022-12-15 1:30 [PATCH v2 0/8] Don't use additional sockets for receiving "spliced" UDP communications David Gibson
` (4 preceding siblings ...)
2022-12-15 1:30 ` [PATCH v2 5/8] udp: Pre-populate msg_names with local address David Gibson
@ 2022-12-15 1:30 ` David Gibson
2022-12-15 1:30 ` [PATCH v2 7/8] udp: Decide whether to "splice" per datagram rather than per socket David Gibson
` (2 subsequent siblings)
8 siblings, 0 replies; 10+ messages in thread
From: David Gibson @ 2022-12-15 1:30 UTC (permalink / raw)
To: passt-dev, Stefano Brivio; +Cc: David Gibson
These two functions now have a very similar structure, and their first
part (calling recvmmsg()) is functionally identical. So, merge the two
functions into one.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
udp.c | 86 +++++++++++++++++++----------------------------------------
1 file changed, 28 insertions(+), 58 deletions(-)
diff --git a/udp.c b/udp.c
index 98d8687..ed31216 100644
--- a/udp.c
+++ b/udp.c
@@ -590,52 +590,6 @@ static void udp_splice_sendfrom(const struct ctx *c, unsigned start, unsigned n,
sendmmsg(s, mmh_send + start, n, MSG_NOSIGNAL);
}
-/**
- * udp_sock_handler_splice() - Handler for socket mapped to "spliced" connection
- * @c: Execution context
- * @ref: epoll reference
- * @events: epoll events bitmap
- * @now: Current timestamp
- */
-static void udp_sock_handler_splice(const struct ctx *c, union epoll_ref ref,
- uint32_t events, const struct timespec *now)
-{
- in_port_t dst = ref.r.p.udp.udp.port;
- int v6 = ref.r.p.udp.udp.v6, n, i, m;
- struct mmsghdr *mmh_recv;
-
- if (!(events & EPOLLIN))
- return;
-
- if (v6)
- mmh_recv = udp6_l2_mh_sock;
- else
- mmh_recv = udp4_l2_mh_sock;
-
- n = recvmmsg(ref.r.s, mmh_recv, UDP_MAX_FRAMES, 0, NULL);
-
- if (n <= 0)
- return;
-
- if (v6)
- udp6_localname.sin6_port = htons(dst);
- else
- udp4_localname.sin_port = htons(dst);
-
- for (i = 0; i < n; i += m) {
- in_port_t src = sa_port(v6, mmh_recv[i].msg_hdr.msg_name);
-
- for (m = 1; i + m < n; m++) {
- void *mname = mmh_recv[i + m].msg_hdr.msg_name;
- if (sa_port(v6, mname) != src)
- break;
- }
-
- udp_splice_sendfrom(c, i, m, src, dst, v6, ref.r.p.udp.udp.ns,
- ref.r.p.udp.udp.orig, now);
- }
-}
-
/**
* udp_update_hdr4() - Update headers for one IPv4 datagram
* @c: Execution context
@@ -945,27 +899,43 @@ void udp_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events,
{
in_port_t dstport = ref.r.p.udp.udp.port;
bool v6 = ref.r.p.udp.udp.v6;
- struct mmsghdr *sock_mmh;
+ struct mmsghdr *mmh_recv;
+ unsigned int i, m;
ssize_t n;
- if (events == EPOLLERR)
+ if (!(events & EPOLLIN))
return;
- if (ref.r.p.udp.udp.splice) {
- udp_sock_handler_splice(c, ref, events, now);
- return;
+ if (v6) {
+ mmh_recv = udp6_l2_mh_sock;
+ udp6_localname.sin6_port = htons(dstport);
+ } else {
+ mmh_recv = udp4_l2_mh_sock;
+ udp4_localname.sin_port = htons(dstport);
}
- if (ref.r.p.udp.udp.v6)
- sock_mmh = udp6_l2_mh_sock;
- else
- sock_mmh = udp4_l2_mh_sock;
-
- n = recvmmsg(ref.r.s, sock_mmh, UDP_MAX_FRAMES, 0, NULL);
+ n = recvmmsg(ref.r.s, mmh_recv, UDP_MAX_FRAMES, 0, NULL);
if (n <= 0)
return;
- udp_tap_send(c, 0, n, dstport, v6, now);
+ if (!ref.r.p.udp.udp.splice) {
+ udp_tap_send(c, 0, n, dstport, v6, now);
+ return;
+ }
+
+ for (i = 0; i < n; i += m) {
+ in_port_t src = sa_port(v6, mmh_recv[i].msg_hdr.msg_name);
+
+ for (m = 1; i + m < n; m++) {
+ void *mname = mmh_recv[i + m].msg_hdr.msg_name;
+ if (sa_port(v6, mname) != src)
+ break;
+ }
+
+ udp_splice_sendfrom(c, i, m, src, dstport, v6,
+ ref.r.p.udp.udp.ns, ref.r.p.udp.udp.orig,
+ now);
+ }
}
/**
--
@@ -590,52 +590,6 @@ static void udp_splice_sendfrom(const struct ctx *c, unsigned start, unsigned n,
sendmmsg(s, mmh_send + start, n, MSG_NOSIGNAL);
}
-/**
- * udp_sock_handler_splice() - Handler for socket mapped to "spliced" connection
- * @c: Execution context
- * @ref: epoll reference
- * @events: epoll events bitmap
- * @now: Current timestamp
- */
-static void udp_sock_handler_splice(const struct ctx *c, union epoll_ref ref,
- uint32_t events, const struct timespec *now)
-{
- in_port_t dst = ref.r.p.udp.udp.port;
- int v6 = ref.r.p.udp.udp.v6, n, i, m;
- struct mmsghdr *mmh_recv;
-
- if (!(events & EPOLLIN))
- return;
-
- if (v6)
- mmh_recv = udp6_l2_mh_sock;
- else
- mmh_recv = udp4_l2_mh_sock;
-
- n = recvmmsg(ref.r.s, mmh_recv, UDP_MAX_FRAMES, 0, NULL);
-
- if (n <= 0)
- return;
-
- if (v6)
- udp6_localname.sin6_port = htons(dst);
- else
- udp4_localname.sin_port = htons(dst);
-
- for (i = 0; i < n; i += m) {
- in_port_t src = sa_port(v6, mmh_recv[i].msg_hdr.msg_name);
-
- for (m = 1; i + m < n; m++) {
- void *mname = mmh_recv[i + m].msg_hdr.msg_name;
- if (sa_port(v6, mname) != src)
- break;
- }
-
- udp_splice_sendfrom(c, i, m, src, dst, v6, ref.r.p.udp.udp.ns,
- ref.r.p.udp.udp.orig, now);
- }
-}
-
/**
* udp_update_hdr4() - Update headers for one IPv4 datagram
* @c: Execution context
@@ -945,27 +899,43 @@ void udp_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events,
{
in_port_t dstport = ref.r.p.udp.udp.port;
bool v6 = ref.r.p.udp.udp.v6;
- struct mmsghdr *sock_mmh;
+ struct mmsghdr *mmh_recv;
+ unsigned int i, m;
ssize_t n;
- if (events == EPOLLERR)
+ if (!(events & EPOLLIN))
return;
- if (ref.r.p.udp.udp.splice) {
- udp_sock_handler_splice(c, ref, events, now);
- return;
+ if (v6) {
+ mmh_recv = udp6_l2_mh_sock;
+ udp6_localname.sin6_port = htons(dstport);
+ } else {
+ mmh_recv = udp4_l2_mh_sock;
+ udp4_localname.sin_port = htons(dstport);
}
- if (ref.r.p.udp.udp.v6)
- sock_mmh = udp6_l2_mh_sock;
- else
- sock_mmh = udp4_l2_mh_sock;
-
- n = recvmmsg(ref.r.s, sock_mmh, UDP_MAX_FRAMES, 0, NULL);
+ n = recvmmsg(ref.r.s, mmh_recv, UDP_MAX_FRAMES, 0, NULL);
if (n <= 0)
return;
- udp_tap_send(c, 0, n, dstport, v6, now);
+ if (!ref.r.p.udp.udp.splice) {
+ udp_tap_send(c, 0, n, dstport, v6, now);
+ return;
+ }
+
+ for (i = 0; i < n; i += m) {
+ in_port_t src = sa_port(v6, mmh_recv[i].msg_hdr.msg_name);
+
+ for (m = 1; i + m < n; m++) {
+ void *mname = mmh_recv[i + m].msg_hdr.msg_name;
+ if (sa_port(v6, mname) != src)
+ break;
+ }
+
+ udp_splice_sendfrom(c, i, m, src, dstport, v6,
+ ref.r.p.udp.udp.ns, ref.r.p.udp.udp.orig,
+ now);
+ }
}
/**
--
2.38.1
^ permalink raw reply related [flat|nested] 10+ messages in thread
* [PATCH v2 7/8] udp: Decide whether to "splice" per datagram rather than per socket
2022-12-15 1:30 [PATCH v2 0/8] Don't use additional sockets for receiving "spliced" UDP communications David Gibson
` (5 preceding siblings ...)
2022-12-15 1:30 ` [PATCH v2 6/8] udp: Unify udp_sock_handler_splice() with udp_sock_handler() David Gibson
@ 2022-12-15 1:30 ` David Gibson
2022-12-15 1:30 ` [PATCH v2 8/8] udp: Don't use separate sockets to listen for spliced packets David Gibson
2022-12-16 0:58 ` [PATCH v2 0/8] Don't use additional sockets for receiving "spliced" UDP communications Stefano Brivio
8 siblings, 0 replies; 10+ messages in thread
From: David Gibson @ 2022-12-15 1:30 UTC (permalink / raw)
To: passt-dev, Stefano Brivio; +Cc: David Gibson
Currently we have special sockets for receiving datagrams from locahost
which can use the optimized "splice" path rather than going across the tap
interface.
We want to loosen this so that sockets can receive sockets that will be
forwarded by both the spliced and non-spliced paths. To do this, we alter
the meaning of the @splice bit in the reference to mean that packets
receieved on this socket *can* be spliced, not that they *will* be spliced.
They'll only actually be spliced if they come from 127.0.0.1 or ::1.
We can't (for now) remove the splice bit entirely, unlike with TCP. Our
gateway mapping means that if the ns initiates communication to the gw
address, we'll translate that to target 127.0.0.1 on the host side. Reply
packets will therefore have source address 127.0.0.1 when received on the
host, but these need to go via the tap path where that will be translated
back to the gateway address. We need the @splice bit to distinguish that
case from packets going from localhost to a port mapped explicitly with
-u which should be spliced.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
udp.c | 52 +++++++++++++++++++++++++++++++++-------------------
udp.h | 2 +-
2 files changed, 34 insertions(+), 20 deletions(-)
diff --git a/udp.c b/udp.c
index ed31216..8ba3453 100644
--- a/udp.c
+++ b/udp.c
@@ -513,16 +513,25 @@ static int udp_splice_new_ns(void *arg)
}
/**
- * sa_port() - Determine port from a sockaddr_in or sockaddr_in6
+ * udp_mmh_splice_port() - Is source address of message suitable for splicing?
* @v6: Is @sa a sockaddr_in6 (otherwise sockaddr_in)?
- * @sa: Pointer to either sockaddr_in or sockaddr_in6
+ * @mmh: mmsghdr of incoming message
+ *
+ * Return: if @sa refers to localhost (127.0.0.1 or ::1) the port from
+ * @sa in host order, otherwise -1.
*/
-static in_port_t sa_port(bool v6, const void *sa)
+static int udp_mmh_splice_port(bool v6, const struct mmsghdr *mmh)
{
- const struct sockaddr_in6 *sa6 = sa;
- const struct sockaddr_in *sa4 = sa;
+ const struct sockaddr_in6 *sa6 = mmh->msg_hdr.msg_name;
+ const struct sockaddr_in *sa4 = mmh->msg_hdr.msg_name;
+
+ if (v6 && IN6_IS_ADDR_LOOPBACK(&sa6->sin6_addr))
+ return ntohs(sa6->sin6_port);
+
+ if (!v6 && IN4_IS_ADDR_LOOPBACK(&sa4->sin_addr))
+ return ntohs(sa4->sin_port);
- return v6 ? ntohs(sa6->sin6_port) : ntohs(sa4->sin_port);
+ return -1;
}
/**
@@ -918,23 +927,28 @@ void udp_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events,
if (n <= 0)
return;
- if (!ref.r.p.udp.udp.splice) {
- udp_tap_send(c, 0, n, dstport, v6, now);
- return;
- }
-
for (i = 0; i < n; i += m) {
- in_port_t src = sa_port(v6, mmh_recv[i].msg_hdr.msg_name);
+ int splicefrom = -1;
+ m = n;
+
+ if (ref.r.p.udp.udp.splice) {
+ splicefrom = udp_mmh_splice_port(v6, mmh_recv + i);
+
+ for (m = 1; i + m < n; m++) {
+ int p;
- for (m = 1; i + m < n; m++) {
- void *mname = mmh_recv[i + m].msg_hdr.msg_name;
- if (sa_port(v6, mname) != src)
- break;
+ p = udp_mmh_splice_port(v6, mmh_recv + i + m);
+ if (p != splicefrom)
+ break;
+ }
}
- udp_splice_sendfrom(c, i, m, src, dstport, v6,
- ref.r.p.udp.udp.ns, ref.r.p.udp.udp.orig,
- now);
+ if (splicefrom >= 0)
+ udp_splice_sendfrom(c, i, m, splicefrom, dstport,
+ v6, ref.r.p.udp.udp.ns,
+ ref.r.p.udp.udp.orig, now);
+ else
+ udp_tap_send(c, i, m, dstport, v6, now);
}
}
diff --git a/udp.h b/udp.h
index 053991e..2a03335 100644
--- a/udp.h
+++ b/udp.h
@@ -22,7 +22,7 @@ void udp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s,
/**
* union udp_epoll_ref - epoll reference portion for TCP connections
* @bound: Set if this file descriptor is a bound socket
- * @splice: Set if descriptor is associated to "spliced" connection
+ * @splice: Set if descriptor packets to be "spliced"
* @orig: Set if a spliced socket which can originate "connections"
* @ns: Set if this is a socket in the pasta network namespace
* @v6: Set for IPv6 sockets or connections
--
@@ -22,7 +22,7 @@ void udp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s,
/**
* union udp_epoll_ref - epoll reference portion for TCP connections
* @bound: Set if this file descriptor is a bound socket
- * @splice: Set if descriptor is associated to "spliced" connection
+ * @splice: Set if descriptor packets to be "spliced"
* @orig: Set if a spliced socket which can originate "connections"
* @ns: Set if this is a socket in the pasta network namespace
* @v6: Set for IPv6 sockets or connections
--
2.38.1
^ permalink raw reply related [flat|nested] 10+ messages in thread
* [PATCH v2 8/8] udp: Don't use separate sockets to listen for spliced packets
2022-12-15 1:30 [PATCH v2 0/8] Don't use additional sockets for receiving "spliced" UDP communications David Gibson
` (6 preceding siblings ...)
2022-12-15 1:30 ` [PATCH v2 7/8] udp: Decide whether to "splice" per datagram rather than per socket David Gibson
@ 2022-12-15 1:30 ` David Gibson
2022-12-16 0:58 ` [PATCH v2 0/8] Don't use additional sockets for receiving "spliced" UDP communications Stefano Brivio
8 siblings, 0 replies; 10+ messages in thread
From: David Gibson @ 2022-12-15 1:30 UTC (permalink / raw)
To: passt-dev, Stefano Brivio; +Cc: David Gibson
Currently, when ports are forwarded inbound in pasta mode, we open two
sockets for incoming traffic: one listens on the public IP address and will
forward packets to the tuntap interface. The other listens on localhost
and forwards via "splicing" (resending directly via sockets in the ns).
Now that we've improved the logic about whether we "splice" any individual
packet, we don't need this. Instead we can have a single socket bound to
0.0.0.0 or ::, marked as able to splice and udp_sock_handler() will deal
with each packet as appropriate.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
udp.c | 53 +++++++++++++----------------------------------------
1 file changed, 13 insertions(+), 40 deletions(-)
diff --git a/udp.c b/udp.c
index 8ba3453..228a086 100644
--- a/udp.c
+++ b/udp.c
@@ -1116,7 +1116,6 @@ void udp_sock_init(const struct ctx *c, int ns, sa_family_t af,
const void *addr, const char *ifname, in_port_t port)
{
union udp_epoll_ref uref = { .u32 = 0 };
- const void *bind_addr;
int s;
if (ns) {
@@ -1128,67 +1127,41 @@ void udp_sock_init(const struct ctx *c, int ns, sa_family_t af,
}
if ((af == AF_INET || af == AF_UNSPEC) && c->ifi4) {
- if (!addr && c->mode == MODE_PASTA)
- bind_addr = &c->ip4.addr;
- else
- bind_addr = addr;
-
uref.udp.v6 = 0;
+ uref.udp.splice = (c->mode == MODE_PASTA);
+ uref.udp.orig = true;
if (!ns) {
- uref.udp.splice = 0;
- s = sock_l4(c, AF_INET, IPPROTO_UDP, bind_addr, ifname,
+ s = sock_l4(c, AF_INET, IPPROTO_UDP, addr, ifname,
port, uref.u32);
udp_tap_map[V4][uref.udp.port].sock = s;
-
- if (c->mode == MODE_PASTA) {
- bind_addr = &(uint32_t){ htonl(INADDR_LOOPBACK) };
- uref.udp.splice = uref.udp.orig = true;
-
- s = sock_l4(c, AF_INET, IPPROTO_UDP, bind_addr,
- ifname, port, uref.u32);
- udp_splice_init[V4][port].sock = s;
- }
+ udp_splice_init[V4][port].sock = s;
} else {
- uref.udp.splice = uref.udp.orig = uref.udp.ns = true;
-
- bind_addr = &(uint32_t){ htonl(INADDR_LOOPBACK) };
+ struct in_addr loopback = { htonl(INADDR_LOOPBACK) };
+ uref.udp.ns = true;
- s = sock_l4(c, AF_INET, IPPROTO_UDP, bind_addr,
+ s = sock_l4(c, AF_INET, IPPROTO_UDP, &loopback,
ifname, port, uref.u32);
udp_splice_ns[V4][port].sock = s;
}
}
if ((af == AF_INET6 || af == AF_UNSPEC) && c->ifi6) {
- if (!addr && c->mode == MODE_PASTA)
- bind_addr = &c->ip6.addr;
- else
- bind_addr = addr;
-
uref.udp.v6 = 1;
+ uref.udp.splice = (c->mode == MODE_PASTA);
+ uref.udp.orig = true;
if (!ns) {
- uref.udp.splice = 0;
- s = sock_l4(c, AF_INET6, IPPROTO_UDP, bind_addr, ifname,
+ s = sock_l4(c, AF_INET6, IPPROTO_UDP, addr, ifname,
port, uref.u32);
udp_tap_map[V6][uref.udp.port].sock = s;
-
- if (c->mode == MODE_PASTA) {
- bind_addr = &in6addr_loopback;
- uref.udp.splice = uref.udp.orig = true;
-
- s = sock_l4(c, AF_INET6, IPPROTO_UDP, bind_addr,
- ifname, port, uref.u32);
- udp_splice_init[V6][port].sock = s;
- }
+ udp_splice_init[V6][port].sock = s;
} else {
- bind_addr = &in6addr_loopback;
- uref.udp.splice = uref.udp.orig = uref.udp.ns = true;
+ uref.udp.ns = true;
- s = sock_l4(c, AF_INET6, IPPROTO_UDP, bind_addr,
+ s = sock_l4(c, AF_INET6, IPPROTO_UDP, &in6addr_loopback,
ifname, port, uref.u32);
udp_splice_ns[V6][port].sock = s;
}
--
@@ -1116,7 +1116,6 @@ void udp_sock_init(const struct ctx *c, int ns, sa_family_t af,
const void *addr, const char *ifname, in_port_t port)
{
union udp_epoll_ref uref = { .u32 = 0 };
- const void *bind_addr;
int s;
if (ns) {
@@ -1128,67 +1127,41 @@ void udp_sock_init(const struct ctx *c, int ns, sa_family_t af,
}
if ((af == AF_INET || af == AF_UNSPEC) && c->ifi4) {
- if (!addr && c->mode == MODE_PASTA)
- bind_addr = &c->ip4.addr;
- else
- bind_addr = addr;
-
uref.udp.v6 = 0;
+ uref.udp.splice = (c->mode == MODE_PASTA);
+ uref.udp.orig = true;
if (!ns) {
- uref.udp.splice = 0;
- s = sock_l4(c, AF_INET, IPPROTO_UDP, bind_addr, ifname,
+ s = sock_l4(c, AF_INET, IPPROTO_UDP, addr, ifname,
port, uref.u32);
udp_tap_map[V4][uref.udp.port].sock = s;
-
- if (c->mode == MODE_PASTA) {
- bind_addr = &(uint32_t){ htonl(INADDR_LOOPBACK) };
- uref.udp.splice = uref.udp.orig = true;
-
- s = sock_l4(c, AF_INET, IPPROTO_UDP, bind_addr,
- ifname, port, uref.u32);
- udp_splice_init[V4][port].sock = s;
- }
+ udp_splice_init[V4][port].sock = s;
} else {
- uref.udp.splice = uref.udp.orig = uref.udp.ns = true;
-
- bind_addr = &(uint32_t){ htonl(INADDR_LOOPBACK) };
+ struct in_addr loopback = { htonl(INADDR_LOOPBACK) };
+ uref.udp.ns = true;
- s = sock_l4(c, AF_INET, IPPROTO_UDP, bind_addr,
+ s = sock_l4(c, AF_INET, IPPROTO_UDP, &loopback,
ifname, port, uref.u32);
udp_splice_ns[V4][port].sock = s;
}
}
if ((af == AF_INET6 || af == AF_UNSPEC) && c->ifi6) {
- if (!addr && c->mode == MODE_PASTA)
- bind_addr = &c->ip6.addr;
- else
- bind_addr = addr;
-
uref.udp.v6 = 1;
+ uref.udp.splice = (c->mode == MODE_PASTA);
+ uref.udp.orig = true;
if (!ns) {
- uref.udp.splice = 0;
- s = sock_l4(c, AF_INET6, IPPROTO_UDP, bind_addr, ifname,
+ s = sock_l4(c, AF_INET6, IPPROTO_UDP, addr, ifname,
port, uref.u32);
udp_tap_map[V6][uref.udp.port].sock = s;
-
- if (c->mode == MODE_PASTA) {
- bind_addr = &in6addr_loopback;
- uref.udp.splice = uref.udp.orig = true;
-
- s = sock_l4(c, AF_INET6, IPPROTO_UDP, bind_addr,
- ifname, port, uref.u32);
- udp_splice_init[V6][port].sock = s;
- }
+ udp_splice_init[V6][port].sock = s;
} else {
- bind_addr = &in6addr_loopback;
- uref.udp.splice = uref.udp.orig = uref.udp.ns = true;
+ uref.udp.ns = true;
- s = sock_l4(c, AF_INET6, IPPROTO_UDP, bind_addr,
+ s = sock_l4(c, AF_INET6, IPPROTO_UDP, &in6addr_loopback,
ifname, port, uref.u32);
udp_splice_ns[V6][port].sock = s;
}
--
2.38.1
^ permalink raw reply related [flat|nested] 10+ messages in thread
* Re: [PATCH v2 0/8] Don't use additional sockets for receiving "spliced" UDP communications
2022-12-15 1:30 [PATCH v2 0/8] Don't use additional sockets for receiving "spliced" UDP communications David Gibson
` (7 preceding siblings ...)
2022-12-15 1:30 ` [PATCH v2 8/8] udp: Don't use separate sockets to listen for spliced packets David Gibson
@ 2022-12-16 0:58 ` Stefano Brivio
8 siblings, 0 replies; 10+ messages in thread
From: Stefano Brivio @ 2022-12-16 0:58 UTC (permalink / raw)
To: David Gibson; +Cc: passt-dev
On Thu, 15 Dec 2022 12:30:10 +1100
David Gibson <david@gibson.dropbear.id.au> wrote:
> At present, the UDP "splice" and "tap" paths are quite separate. We
> have separate sockets to receive packets bound for the tap and splice
> paths. This leads to some code duplication, and extra open sockets.
>
> This series partially unifies the two paths, allowing us to use a
> single (host side) socket, bound to 0.0.0.0 or :: to receive packets
> for both cases.
>
> This is based on my earlier series with some fixes for the tap path.
>
> Changes since v1:
> * Renamed udp_localname[46] to udp[46]_localname
> * Allow handling of UDP port 0
> * Fix a bug which could misidentify certain v6 packets as v4-spliceable
> * Some minor cosmetic fixes to code and commit messages
It looks good to me now, thanks. I'm still trying to figure out the
throughput (to tap) topic before applying -- results look pretty much
the same now, but I'm not sure why, and I'd like to have a second
look at perf(1) statistics.
--
Stefano
^ permalink raw reply [flat|nested] 10+ messages in thread