* [PATCH 1/8] tcp: inline tcp_l2_buf_fill_headers()
2024-05-27 9:10 [PATCH 0/8] Add vhost-user support to passt (part 2) Laurent Vivier
@ 2024-05-27 9:10 ` Laurent Vivier
2024-05-27 9:10 ` [PATCH 2/8] tcp: extract buffer management from tcp_send_flag() Laurent Vivier
` (6 subsequent siblings)
7 siblings, 0 replies; 9+ messages in thread
From: Laurent Vivier @ 2024-05-27 9:10 UTC (permalink / raw)
To: passt-dev; +Cc: Laurent Vivier
tcp_l2_buf_fill_headers() only calls tcp_fill_headers4() or
tcp_fill_headers6(), depending on the connection IP version.
We can inline these calls in tcp_data_to_tap(), which already has a switch
on the IP version. In tcp_send_flag(), inlining them will make it easier to
separate the common code from the buffer/vhost-user parts.
Signed-off-by: Laurent Vivier <lvivier@redhat.com>
---
tcp.c | 54 +++++++++++++++++++-----------------------------------
1 file changed, 19 insertions(+), 35 deletions(-)
diff --git a/tcp.c b/tcp.c
index 21d0af061aec..e948014c611d 100644
--- a/tcp.c
+++ b/tcp.c
@@ -1405,37 +1405,6 @@ static size_t tcp_fill_headers6(const struct ctx *c,
return l4len;
}
-/**
- * tcp_l2_buf_fill_headers() - Fill 802.3, IP, TCP headers in pre-cooked buffers
- * @c: Execution context
- * @conn: Connection pointer
- * @iov: Pointer to an array of iovec of TCP pre-cooked buffers
- * @dlen: TCP payload length
- * @check: Checksum, if already known
- * @seq: Sequence number for this segment
- *
- * Return: IP payload length, host order
- */
-static size_t tcp_l2_buf_fill_headers(const struct ctx *c,
- const struct tcp_tap_conn *conn,
- struct iovec *iov, size_t dlen,
- const uint16_t *check, uint32_t seq)
-{
- const struct in_addr *a4 = inany_v4(&conn->faddr);
-
- if (a4) {
- return tcp_fill_headers4(c, conn, iov[TCP_IOV_TAP].iov_base,
- iov[TCP_IOV_IP].iov_base,
- iov[TCP_IOV_PAYLOAD].iov_base, dlen,
- check, seq);
- }
-
- return tcp_fill_headers6(c, conn, iov[TCP_IOV_TAP].iov_base,
- iov[TCP_IOV_IP].iov_base,
- iov[TCP_IOV_PAYLOAD].iov_base, dlen,
- seq);
-}
-
/**
* tcp_update_seqack_wnd() - Update ACK sequence and window to guest/tap
* @c: Execution context
@@ -1646,8 +1615,17 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
th->syn = !!(flags & SYN);
th->fin = !!(flags & FIN);
- l4len = tcp_l2_buf_fill_headers(c, conn, iov, optlen, NULL,
- conn->seq_to_tap);
+ if (CONN_V4(conn)) {
+ l4len = tcp_fill_headers4(c, conn, iov[TCP_IOV_TAP].iov_base,
+ iov[TCP_IOV_IP].iov_base,
+ iov[TCP_IOV_PAYLOAD].iov_base, optlen,
+ NULL, conn->seq_to_tap);
+ } else {
+ l4len = tcp_fill_headers6(c, conn, iov[TCP_IOV_TAP].iov_base,
+ iov[TCP_IOV_IP].iov_base,
+ iov[TCP_IOV_PAYLOAD].iov_base, optlen,
+ conn->seq_to_tap);
+ }
iov[TCP_IOV_PAYLOAD].iov_len = l4len;
if (th->ack) {
@@ -2146,7 +2124,10 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
tcp4_seq_update[tcp4_payload_used].len = dlen;
iov = tcp4_l2_iov[tcp4_payload_used++];
- l4len = tcp_l2_buf_fill_headers(c, conn, iov, dlen, check, seq);
+ l4len = tcp_fill_headers4(c, conn, iov[TCP_IOV_TAP].iov_base,
+ iov[TCP_IOV_IP].iov_base,
+ iov[TCP_IOV_PAYLOAD].iov_base, dlen,
+ check, seq);
iov[TCP_IOV_PAYLOAD].iov_len = l4len;
if (tcp4_payload_used > TCP_FRAMES_MEM - 1)
tcp_payload_flush(c);
@@ -2155,7 +2136,10 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
tcp6_seq_update[tcp6_payload_used].len = dlen;
iov = tcp6_l2_iov[tcp6_payload_used++];
- l4len = tcp_l2_buf_fill_headers(c, conn, iov, dlen, NULL, seq);
+ l4len = tcp_fill_headers6(c, conn, iov[TCP_IOV_TAP].iov_base,
+ iov[TCP_IOV_IP].iov_base,
+ iov[TCP_IOV_PAYLOAD].iov_base, dlen,
+ seq);
iov[TCP_IOV_PAYLOAD].iov_len = l4len;
if (tcp6_payload_used > TCP_FRAMES_MEM - 1)
tcp_payload_flush(c);
--
@@ -1405,37 +1405,6 @@ static size_t tcp_fill_headers6(const struct ctx *c,
return l4len;
}
-/**
- * tcp_l2_buf_fill_headers() - Fill 802.3, IP, TCP headers in pre-cooked buffers
- * @c: Execution context
- * @conn: Connection pointer
- * @iov: Pointer to an array of iovec of TCP pre-cooked buffers
- * @dlen: TCP payload length
- * @check: Checksum, if already known
- * @seq: Sequence number for this segment
- *
- * Return: IP payload length, host order
- */
-static size_t tcp_l2_buf_fill_headers(const struct ctx *c,
- const struct tcp_tap_conn *conn,
- struct iovec *iov, size_t dlen,
- const uint16_t *check, uint32_t seq)
-{
- const struct in_addr *a4 = inany_v4(&conn->faddr);
-
- if (a4) {
- return tcp_fill_headers4(c, conn, iov[TCP_IOV_TAP].iov_base,
- iov[TCP_IOV_IP].iov_base,
- iov[TCP_IOV_PAYLOAD].iov_base, dlen,
- check, seq);
- }
-
- return tcp_fill_headers6(c, conn, iov[TCP_IOV_TAP].iov_base,
- iov[TCP_IOV_IP].iov_base,
- iov[TCP_IOV_PAYLOAD].iov_base, dlen,
- seq);
-}
-
/**
* tcp_update_seqack_wnd() - Update ACK sequence and window to guest/tap
* @c: Execution context
@@ -1646,8 +1615,17 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
th->syn = !!(flags & SYN);
th->fin = !!(flags & FIN);
- l4len = tcp_l2_buf_fill_headers(c, conn, iov, optlen, NULL,
- conn->seq_to_tap);
+ if (CONN_V4(conn)) {
+ l4len = tcp_fill_headers4(c, conn, iov[TCP_IOV_TAP].iov_base,
+ iov[TCP_IOV_IP].iov_base,
+ iov[TCP_IOV_PAYLOAD].iov_base, optlen,
+ NULL, conn->seq_to_tap);
+ } else {
+ l4len = tcp_fill_headers6(c, conn, iov[TCP_IOV_TAP].iov_base,
+ iov[TCP_IOV_IP].iov_base,
+ iov[TCP_IOV_PAYLOAD].iov_base, optlen,
+ conn->seq_to_tap);
+ }
iov[TCP_IOV_PAYLOAD].iov_len = l4len;
if (th->ack) {
@@ -2146,7 +2124,10 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
tcp4_seq_update[tcp4_payload_used].len = dlen;
iov = tcp4_l2_iov[tcp4_payload_used++];
- l4len = tcp_l2_buf_fill_headers(c, conn, iov, dlen, check, seq);
+ l4len = tcp_fill_headers4(c, conn, iov[TCP_IOV_TAP].iov_base,
+ iov[TCP_IOV_IP].iov_base,
+ iov[TCP_IOV_PAYLOAD].iov_base, dlen,
+ check, seq);
iov[TCP_IOV_PAYLOAD].iov_len = l4len;
if (tcp4_payload_used > TCP_FRAMES_MEM - 1)
tcp_payload_flush(c);
@@ -2155,7 +2136,10 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
tcp6_seq_update[tcp6_payload_used].len = dlen;
iov = tcp6_l2_iov[tcp6_payload_used++];
- l4len = tcp_l2_buf_fill_headers(c, conn, iov, dlen, NULL, seq);
+ l4len = tcp_fill_headers6(c, conn, iov[TCP_IOV_TAP].iov_base,
+ iov[TCP_IOV_IP].iov_base,
+ iov[TCP_IOV_PAYLOAD].iov_base, dlen,
+ seq);
iov[TCP_IOV_PAYLOAD].iov_len = l4len;
if (tcp6_payload_used > TCP_FRAMES_MEM - 1)
tcp_payload_flush(c);
--
2.44.0
^ permalink raw reply related [flat|nested] 9+ messages in thread
* [PATCH 2/8] tcp: extract buffer management from tcp_send_flag()
2024-05-27 9:10 [PATCH 0/8] Add vhost-user support to passt (part 2) Laurent Vivier
2024-05-27 9:10 ` [PATCH 1/8] tcp: inline tcp_l2_buf_fill_headers() Laurent Vivier
@ 2024-05-27 9:10 ` Laurent Vivier
2024-05-27 9:10 ` [PATCH 3/8] tcp: extract buffer management from tcp_conn_tap_mss() Laurent Vivier
` (5 subsequent siblings)
7 siblings, 0 replies; 9+ messages in thread
From: Laurent Vivier @ 2024-05-27 9:10 UTC (permalink / raw)
To: passt-dev; +Cc: Laurent Vivier
Signed-off-by: Laurent Vivier <lvivier@redhat.com>
---
tcp.c | 87 ++++++++++++++++++++++++++++++++++++-----------------------
1 file changed, 54 insertions(+), 33 deletions(-)
diff --git a/tcp.c b/tcp.c
index e948014c611d..b42cef69e8db 100644
--- a/tcp.c
+++ b/tcp.c
@@ -1522,24 +1522,25 @@ static void tcp_update_seqack_from_tap(const struct ctx *c,
}
/**
- * tcp_send_flag() - Send segment with flags to tap (no payload)
+ * tcp_fill_flag_header() - Prepare header for flags-only segment (no payload)
* @c: Execution context
* @conn: Connection pointer
* @flags: TCP flags: if not set, send segment only if ACK is due
+ * @th: TCP header to update
+ * @data: buffer to store TCP option
+ * @optlen: size of the TCP option buffer
*
- * Return: negative error code on connection reset, 0 otherwise
+ * Return: < 0 error code on connection reset,
+ * 0 if there is no flag to send
+ * 1 otherwise
*/
-static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
+static int tcp_fill_flag_header(struct ctx *c, struct tcp_tap_conn *conn,
+ int flags, struct tcphdr *th, char *data,
+ size_t *optlen)
{
- struct tcp_flags_t *payload;
struct tcp_info tinfo = { 0 };
socklen_t sl = sizeof(tinfo);
int s = conn->sock;
- size_t optlen = 0;
- struct tcphdr *th;
- struct iovec *iov;
- size_t l4len;
- char *data;
if (SEQ_GE(conn->seq_ack_to_tap, conn->seq_from_tap) &&
!flags && conn->wnd_to_tap)
@@ -1561,20 +1562,11 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
if (!tcp_update_seqack_wnd(c, conn, flags, &tinfo) && !flags)
return 0;
- if (CONN_V4(conn))
- iov = tcp4_l2_flags_iov[tcp4_flags_used++];
- else
- iov = tcp6_l2_flags_iov[tcp6_flags_used++];
-
- payload = iov[TCP_IOV_PAYLOAD].iov_base;
- th = &payload->th;
- data = payload->opts;
-
if (flags & SYN) {
int mss;
/* Options: MSS, NOP and window scale (8 bytes) */
- optlen = OPT_MSS_LEN + 1 + OPT_WS_LEN;
+ *optlen = OPT_MSS_LEN + 1 + OPT_WS_LEN;
*data++ = OPT_MSS;
*data++ = OPT_MSS_LEN;
@@ -1608,26 +1600,13 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
flags |= ACK;
}
- th->doff = (sizeof(*th) + optlen) / 4;
+ th->doff = (sizeof(*th) + *optlen) / 4;
th->ack = !!(flags & ACK);
th->rst = !!(flags & RST);
th->syn = !!(flags & SYN);
th->fin = !!(flags & FIN);
- if (CONN_V4(conn)) {
- l4len = tcp_fill_headers4(c, conn, iov[TCP_IOV_TAP].iov_base,
- iov[TCP_IOV_IP].iov_base,
- iov[TCP_IOV_PAYLOAD].iov_base, optlen,
- NULL, conn->seq_to_tap);
- } else {
- l4len = tcp_fill_headers6(c, conn, iov[TCP_IOV_TAP].iov_base,
- iov[TCP_IOV_IP].iov_base,
- iov[TCP_IOV_PAYLOAD].iov_base, optlen,
- conn->seq_to_tap);
- }
- iov[TCP_IOV_PAYLOAD].iov_len = l4len;
-
if (th->ack) {
if (SEQ_GE(conn->seq_ack_to_tap, conn->seq_from_tap))
conn_flag(c, conn, ~ACK_TO_TAP_DUE);
@@ -1642,6 +1621,48 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
if (th->fin || th->syn)
conn->seq_to_tap++;
+ return 1;
+}
+
+static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
+{
+ struct tcp_flags_t *payload;
+ size_t optlen = 0;
+ struct iovec *iov;
+ size_t l4len;
+ int ret;
+
+ if (CONN_V4(conn)) {
+ iov = tcp4_l2_flags_iov[tcp4_flags_used++];
+
+ payload = iov[TCP_IOV_PAYLOAD].iov_base;
+
+ ret = tcp_fill_flag_header(c, conn, flags, &payload->th,
+ payload->opts, &optlen);
+ if (ret <= 0)
+ return ret;
+
+ l4len = tcp_fill_headers4(c, conn, iov[TCP_IOV_TAP].iov_base,
+ iov[TCP_IOV_IP].iov_base,
+ iov[TCP_IOV_PAYLOAD].iov_base, optlen,
+ NULL, conn->seq_to_tap);
+ } else {
+ iov = tcp6_l2_flags_iov[tcp6_flags_used++];
+
+ payload = iov[TCP_IOV_PAYLOAD].iov_base;
+
+ ret = tcp_fill_flag_header(c, conn, flags, &payload->th,
+ payload->opts, &optlen);
+ if (ret <= 0)
+ return ret;
+
+ l4len = tcp_fill_headers6(c, conn, iov[TCP_IOV_TAP].iov_base,
+ iov[TCP_IOV_IP].iov_base,
+ iov[TCP_IOV_PAYLOAD].iov_base, optlen,
+ conn->seq_to_tap);
+ }
+ iov[TCP_IOV_PAYLOAD].iov_len = l4len;
+
if (flags & DUP_ACK) {
struct iovec *dup_iov;
int i;
--
@@ -1522,24 +1522,25 @@ static void tcp_update_seqack_from_tap(const struct ctx *c,
}
/**
- * tcp_send_flag() - Send segment with flags to tap (no payload)
+ * tcp_fill_flag_header() - Prepare header for flags-only segment (no payload)
* @c: Execution context
* @conn: Connection pointer
* @flags: TCP flags: if not set, send segment only if ACK is due
+ * @th: TCP header to update
+ * @data: buffer to store TCP option
+ * @optlen: size of the TCP option buffer
*
- * Return: negative error code on connection reset, 0 otherwise
+ * Return: < 0 error code on connection reset,
+ * 0 if there is no flag to send
+ * 1 otherwise
*/
-static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
+static int tcp_fill_flag_header(struct ctx *c, struct tcp_tap_conn *conn,
+ int flags, struct tcphdr *th, char *data,
+ size_t *optlen)
{
- struct tcp_flags_t *payload;
struct tcp_info tinfo = { 0 };
socklen_t sl = sizeof(tinfo);
int s = conn->sock;
- size_t optlen = 0;
- struct tcphdr *th;
- struct iovec *iov;
- size_t l4len;
- char *data;
if (SEQ_GE(conn->seq_ack_to_tap, conn->seq_from_tap) &&
!flags && conn->wnd_to_tap)
@@ -1561,20 +1562,11 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
if (!tcp_update_seqack_wnd(c, conn, flags, &tinfo) && !flags)
return 0;
- if (CONN_V4(conn))
- iov = tcp4_l2_flags_iov[tcp4_flags_used++];
- else
- iov = tcp6_l2_flags_iov[tcp6_flags_used++];
-
- payload = iov[TCP_IOV_PAYLOAD].iov_base;
- th = &payload->th;
- data = payload->opts;
-
if (flags & SYN) {
int mss;
/* Options: MSS, NOP and window scale (8 bytes) */
- optlen = OPT_MSS_LEN + 1 + OPT_WS_LEN;
+ *optlen = OPT_MSS_LEN + 1 + OPT_WS_LEN;
*data++ = OPT_MSS;
*data++ = OPT_MSS_LEN;
@@ -1608,26 +1600,13 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
flags |= ACK;
}
- th->doff = (sizeof(*th) + optlen) / 4;
+ th->doff = (sizeof(*th) + *optlen) / 4;
th->ack = !!(flags & ACK);
th->rst = !!(flags & RST);
th->syn = !!(flags & SYN);
th->fin = !!(flags & FIN);
- if (CONN_V4(conn)) {
- l4len = tcp_fill_headers4(c, conn, iov[TCP_IOV_TAP].iov_base,
- iov[TCP_IOV_IP].iov_base,
- iov[TCP_IOV_PAYLOAD].iov_base, optlen,
- NULL, conn->seq_to_tap);
- } else {
- l4len = tcp_fill_headers6(c, conn, iov[TCP_IOV_TAP].iov_base,
- iov[TCP_IOV_IP].iov_base,
- iov[TCP_IOV_PAYLOAD].iov_base, optlen,
- conn->seq_to_tap);
- }
- iov[TCP_IOV_PAYLOAD].iov_len = l4len;
-
if (th->ack) {
if (SEQ_GE(conn->seq_ack_to_tap, conn->seq_from_tap))
conn_flag(c, conn, ~ACK_TO_TAP_DUE);
@@ -1642,6 +1621,48 @@ static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
if (th->fin || th->syn)
conn->seq_to_tap++;
+ return 1;
+}
+
+static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
+{
+ struct tcp_flags_t *payload;
+ size_t optlen = 0;
+ struct iovec *iov;
+ size_t l4len;
+ int ret;
+
+ if (CONN_V4(conn)) {
+ iov = tcp4_l2_flags_iov[tcp4_flags_used++];
+
+ payload = iov[TCP_IOV_PAYLOAD].iov_base;
+
+ ret = tcp_fill_flag_header(c, conn, flags, &payload->th,
+ payload->opts, &optlen);
+ if (ret <= 0)
+ return ret;
+
+ l4len = tcp_fill_headers4(c, conn, iov[TCP_IOV_TAP].iov_base,
+ iov[TCP_IOV_IP].iov_base,
+ iov[TCP_IOV_PAYLOAD].iov_base, optlen,
+ NULL, conn->seq_to_tap);
+ } else {
+ iov = tcp6_l2_flags_iov[tcp6_flags_used++];
+
+ payload = iov[TCP_IOV_PAYLOAD].iov_base;
+
+ ret = tcp_fill_flag_header(c, conn, flags, &payload->th,
+ payload->opts, &optlen);
+ if (ret <= 0)
+ return ret;
+
+ l4len = tcp_fill_headers6(c, conn, iov[TCP_IOV_TAP].iov_base,
+ iov[TCP_IOV_IP].iov_base,
+ iov[TCP_IOV_PAYLOAD].iov_base, optlen,
+ conn->seq_to_tap);
+ }
+ iov[TCP_IOV_PAYLOAD].iov_len = l4len;
+
if (flags & DUP_ACK) {
struct iovec *dup_iov;
int i;
--
2.44.0
^ permalink raw reply related [flat|nested] 9+ messages in thread
* [PATCH 3/8] tcp: extract buffer management from tcp_conn_tap_mss()
2024-05-27 9:10 [PATCH 0/8] Add vhost-user support to passt (part 2) Laurent Vivier
2024-05-27 9:10 ` [PATCH 1/8] tcp: inline tcp_l2_buf_fill_headers() Laurent Vivier
2024-05-27 9:10 ` [PATCH 2/8] tcp: extract buffer management from tcp_send_flag() Laurent Vivier
@ 2024-05-27 9:10 ` Laurent Vivier
2024-05-27 9:10 ` [PATCH 4/8] tcp: move buffers management functions to their own file Laurent Vivier
` (4 subsequent siblings)
7 siblings, 0 replies; 9+ messages in thread
From: Laurent Vivier @ 2024-05-27 9:10 UTC (permalink / raw)
To: passt-dev; +Cc: Laurent Vivier
Signed-off-by: Laurent Vivier <lvivier@redhat.com>
---
tcp.c | 13 +++++++++----
1 file changed, 9 insertions(+), 4 deletions(-)
diff --git a/tcp.c b/tcp.c
index b42cef69e8db..85a11e7e05dc 100644
--- a/tcp.c
+++ b/tcp.c
@@ -1808,6 +1808,14 @@ static int tcp_conn_new_sock(const struct ctx *c, sa_family_t af)
return s;
}
+static uint16_t tcp_buf_conn_tap_mss(const struct tcp_tap_conn *conn)
+{
+ if (CONN_V4(conn))
+ return MSS4;
+
+ return MSS6;
+}
+
/**
* tcp_conn_sock() - Obtain a connectable socket in the host/init namespace
* @c: Execution context
@@ -1853,10 +1861,7 @@ static uint16_t tcp_conn_tap_mss(const struct tcp_tap_conn *conn,
else
mss = ret;
- if (CONN_V4(conn))
- mss = MIN(MSS4, mss);
- else
- mss = MIN(MSS6, mss);
+ mss = MIN(tcp_buf_conn_tap_mss(conn), mss);
return MIN(mss, USHRT_MAX);
}
--
@@ -1808,6 +1808,14 @@ static int tcp_conn_new_sock(const struct ctx *c, sa_family_t af)
return s;
}
+static uint16_t tcp_buf_conn_tap_mss(const struct tcp_tap_conn *conn)
+{
+ if (CONN_V4(conn))
+ return MSS4;
+
+ return MSS6;
+}
+
/**
* tcp_conn_sock() - Obtain a connectable socket in the host/init namespace
* @c: Execution context
@@ -1853,10 +1861,7 @@ static uint16_t tcp_conn_tap_mss(const struct tcp_tap_conn *conn,
else
mss = ret;
- if (CONN_V4(conn))
- mss = MIN(MSS4, mss);
- else
- mss = MIN(MSS6, mss);
+ mss = MIN(tcp_buf_conn_tap_mss(conn), mss);
return MIN(mss, USHRT_MAX);
}
--
2.44.0
^ permalink raw reply related [flat|nested] 9+ messages in thread
* [PATCH 4/8] tcp: move buffers management functions to their own file
2024-05-27 9:10 [PATCH 0/8] Add vhost-user support to passt (part 2) Laurent Vivier
` (2 preceding siblings ...)
2024-05-27 9:10 ` [PATCH 3/8] tcp: extract buffer management from tcp_conn_tap_mss() Laurent Vivier
@ 2024-05-27 9:10 ` Laurent Vivier
2024-05-27 9:10 ` [PATCH 5/8] tap: export pool_flush()/tapX_handler()/packet_add() Laurent Vivier
` (3 subsequent siblings)
7 siblings, 0 replies; 9+ messages in thread
From: Laurent Vivier @ 2024-05-27 9:10 UTC (permalink / raw)
To: passt-dev; +Cc: Laurent Vivier
Signed-off-by: Laurent Vivier <lvivier@redhat.com>
---
Makefile | 5 +-
tcp.c | 587 ++-----------------------------------------------
tcp_buf.c | 543 +++++++++++++++++++++++++++++++++++++++++++++
tcp_buf.h | 17 ++
tcp_internal.h | 79 +++++++
5 files changed, 659 insertions(+), 572 deletions(-)
create mode 100644 tcp_buf.c
create mode 100644 tcp_buf.h
create mode 100644 tcp_internal.h
diff --git a/Makefile b/Makefile
index 8ea175762e36..1ac2e5e0053f 100644
--- a/Makefile
+++ b/Makefile
@@ -47,7 +47,7 @@ FLAGS += -DDUAL_STACK_SOCKETS=$(DUAL_STACK_SOCKETS)
PASST_SRCS = arch.c arp.c checksum.c conf.c dhcp.c dhcpv6.c flow.c fwd.c \
icmp.c igmp.c inany.c iov.c ip.c isolation.c lineread.c log.c mld.c \
ndp.c netlink.c packet.c passt.c pasta.c pcap.c pif.c tap.c tcp.c \
- tcp_splice.c udp.c util.c
+ tcp_buf.c tcp_splice.c udp.c util.c
QRAP_SRCS = qrap.c
SRCS = $(PASST_SRCS) $(QRAP_SRCS)
@@ -56,7 +56,8 @@ MANPAGES = passt.1 pasta.1 qrap.1
PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h flow.h fwd.h \
flow_table.h icmp.h icmp_flow.h inany.h iov.h ip.h isolation.h \
lineread.h log.h ndp.h netlink.h packet.h passt.h pasta.h pcap.h pif.h \
- siphash.h tap.h tcp.h tcp_conn.h tcp_splice.h udp.h util.h
+ siphash.h tap.h tcp.h tcp_buf.h tcp_conn.h tcp_internal.h tcp_splice.h \
+ udp.h util.h
HEADERS = $(PASST_HEADERS) seccomp.h
C := \#include <linux/tcp.h>\nstruct tcp_info x = { .tcpi_snd_wnd = 0 };
diff --git a/tcp.c b/tcp.c
index 85a11e7e05dc..e538c4bf683b 100644
--- a/tcp.c
+++ b/tcp.c
@@ -302,32 +302,18 @@
#include "flow.h"
#include "flow_table.h"
+#include "tcp_internal.h"
+#include "tcp_buf.h"
/* Sides of a flow as we use them in "tap" connections */
#define SOCKSIDE 0
#define TAPSIDE 1
-#define TCP_FRAMES_MEM 128
-#define TCP_FRAMES \
- (c->mode == MODE_PASST ? TCP_FRAMES_MEM : 1)
-
#define TCP_HASH_TABLE_LOAD 70 /* % */
#define TCP_HASH_TABLE_SIZE (FLOW_MAX * 100 / TCP_HASH_TABLE_LOAD)
-#define MAX_WS 8
-#define MAX_WINDOW (1 << (16 + (MAX_WS)))
-
/* MSS rounding: see SET_MSS() */
#define MSS_DEFAULT 536
-#define MSS4 ROUND_DOWN(IP_MAX_MTU - \
- sizeof(struct tcphdr) - \
- sizeof(struct iphdr), \
- sizeof(uint32_t))
-#define MSS6 ROUND_DOWN(IP_MAX_MTU - \
- sizeof(struct tcphdr) - \
- sizeof(struct ipv6hdr), \
- sizeof(uint32_t))
-
#define WINDOW_DEFAULT 14600 /* RFC 6928 */
#ifdef HAS_SND_WND
# define KERNEL_REPORTS_SND_WND(c) (c->tcp.kernel_snd_wnd)
@@ -349,31 +335,8 @@
*/
#define SOL_TCP IPPROTO_TCP
-#define SEQ_LE(a, b) ((b) - (a) < MAX_WINDOW)
-#define SEQ_LT(a, b) ((b) - (a) - 1 < MAX_WINDOW)
-#define SEQ_GE(a, b) ((a) - (b) < MAX_WINDOW)
-#define SEQ_GT(a, b) ((a) - (b) - 1 < MAX_WINDOW)
-
-#define FIN (1 << 0)
-#define SYN (1 << 1)
-#define RST (1 << 2)
-#define ACK (1 << 4)
-/* Flags for internal usage */
-#define DUP_ACK (1 << 5)
#define ACK_IF_NEEDED 0 /* See tcp_send_flag() */
-#define OPT_EOL 0
-#define OPT_NOP 1
-#define OPT_MSS 2
-#define OPT_MSS_LEN 4
-#define OPT_WS 3
-#define OPT_WS_LEN 3
-#define OPT_SACKP 4
-#define OPT_SACK 5
-#define OPT_TS 8
-
-#define CONN_V4(conn) (!!inany_v4(&(conn)->faddr))
-#define CONN_V6(conn) (!CONN_V4(conn))
#define CONN_IS_CLOSING(conn) \
((conn->events & ESTABLISHED) && \
(conn->events & (SOCK_FIN_RCVD | TAP_FIN_RCVD)))
@@ -410,114 +373,7 @@ static int tcp_sock_ns [NUM_PORTS][IP_VERSIONS];
*/
static union inany_addr low_rtt_dst[LOW_RTT_TABLE_SIZE];
-/**
- * tcp_buf_seq_update - Sequences to update with length of frames once sent
- * @seq: Pointer to sequence number sent to tap-side, to be updated
- * @len: TCP payload length
- */
-struct tcp_buf_seq_update {
- uint32_t *seq;
- uint16_t len;
-};
-
-/* Static buffers */
-/**
- * struct tcp_payload_t - TCP header and data to send segments with payload
- * @th: TCP header
- * @data: TCP data
- */
-struct tcp_payload_t {
- struct tcphdr th;
- uint8_t data[IP_MAX_MTU - sizeof(struct tcphdr)];
-#ifdef __AVX2__
-} __attribute__ ((packed, aligned(32))); /* For AVX2 checksum routines */
-#else
-} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
-#endif
-
-/**
- * struct tcp_flags_t - TCP header and data to send zero-length
- * segments (flags)
- * @th: TCP header
- * @opts TCP options
- */
-struct tcp_flags_t {
- struct tcphdr th;
- char opts[OPT_MSS_LEN + OPT_WS_LEN + 1];
-#ifdef __AVX2__
-} __attribute__ ((packed, aligned(32)));
-#else
-} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
-#endif
-
-/* Ethernet header for IPv4 frames */
-static struct ethhdr tcp4_eth_src;
-
-static struct tap_hdr tcp4_payload_tap_hdr[TCP_FRAMES_MEM];
-/* IPv4 headers */
-static struct iphdr tcp4_payload_ip[TCP_FRAMES_MEM];
-/* TCP segments with payload for IPv4 frames */
-static struct tcp_payload_t tcp4_payload[TCP_FRAMES_MEM];
-
-static_assert(MSS4 <= sizeof(tcp4_payload[0].data), "MSS4 is greater than 65516");
-
-static struct tcp_buf_seq_update tcp4_seq_update[TCP_FRAMES_MEM];
-static unsigned int tcp4_payload_used;
-
-static struct tap_hdr tcp4_flags_tap_hdr[TCP_FRAMES_MEM];
-/* IPv4 headers for TCP segment without payload */
-static struct iphdr tcp4_flags_ip[TCP_FRAMES_MEM];
-/* TCP segments without payload for IPv4 frames */
-static struct tcp_flags_t tcp4_flags[TCP_FRAMES_MEM];
-
-static unsigned int tcp4_flags_used;
-
-/* Ethernet header for IPv6 frames */
-static struct ethhdr tcp6_eth_src;
-
-static struct tap_hdr tcp6_payload_tap_hdr[TCP_FRAMES_MEM];
-/* IPv6 headers */
-static struct ipv6hdr tcp6_payload_ip[TCP_FRAMES_MEM];
-/* TCP headers and data for IPv6 frames */
-static struct tcp_payload_t tcp6_payload[TCP_FRAMES_MEM];
-
-static_assert(MSS6 <= sizeof(tcp6_payload[0].data), "MSS6 is greater than 65516");
-
-static struct tcp_buf_seq_update tcp6_seq_update[TCP_FRAMES_MEM];
-static unsigned int tcp6_payload_used;
-
-static struct tap_hdr tcp6_flags_tap_hdr[TCP_FRAMES_MEM];
-/* IPv6 headers for TCP segment without payload */
-static struct ipv6hdr tcp6_flags_ip[TCP_FRAMES_MEM];
-/* TCP segment without payload for IPv6 frames */
-static struct tcp_flags_t tcp6_flags[TCP_FRAMES_MEM];
-
-static unsigned int tcp6_flags_used;
-
-/* recvmsg()/sendmsg() data for tap */
-static char tcp_buf_discard [MAX_WINDOW];
-static struct iovec iov_sock [TCP_FRAMES_MEM + 1];
-
-/*
- * enum tcp_iov_parts - I/O vector parts for one TCP frame
- * @TCP_IOV_TAP tap backend specific header
- * @TCP_IOV_ETH Ethernet header
- * @TCP_IOV_IP IP (v4/v6) header
- * @TCP_IOV_PAYLOAD IP payload (TCP header + data)
- * @TCP_NUM_IOVS the number of entries in the iovec array
- */
-enum tcp_iov_parts {
- TCP_IOV_TAP = 0,
- TCP_IOV_ETH = 1,
- TCP_IOV_IP = 2,
- TCP_IOV_PAYLOAD = 3,
- TCP_NUM_IOVS
-};
-
-static struct iovec tcp4_l2_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS];
-static struct iovec tcp6_l2_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS];
-static struct iovec tcp4_l2_flags_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS];
-static struct iovec tcp6_l2_flags_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS];
+char tcp_buf_discard [MAX_WINDOW];
/* sendmsg() to socket */
static struct iovec tcp_iov [UIO_MAXIOV];
@@ -562,14 +418,6 @@ static uint32_t tcp_conn_epoll_events(uint8_t events, uint8_t conn_flags)
return EPOLLRDHUP;
}
-static void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn,
- unsigned long flag);
-#define conn_flag(c, conn, flag) \
- do { \
- flow_trace(conn, "flag at %s:%i", __func__, __LINE__); \
- conn_flag_do(c, conn, flag); \
- } while (0)
-
/**
* tcp_epoll_ctl() - Add/modify/delete epoll state from connection events
* @c: Execution context
@@ -681,8 +529,8 @@ static void tcp_timer_ctl(const struct ctx *c, struct tcp_tap_conn *conn)
* @conn: Connection pointer
* @flag: Flag to set, or ~flag to unset
*/
-static void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn,
- unsigned long flag)
+void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn,
+ unsigned long flag)
{
if (flag & (flag - 1)) {
int flag_index = fls(~flag);
@@ -732,8 +580,8 @@ static void tcp_hash_remove(const struct ctx *c,
* @conn: Connection pointer
* @event: Connection event
*/
-static void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn,
- unsigned long event)
+void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn,
+ unsigned long event)
{
int prev, new, num = fls(event);
@@ -781,12 +629,6 @@ static void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn,
tcp_timer_ctl(c, conn);
}
-#define conn_event(c, conn, event) \
- do { \
- flow_trace(conn, "event at %s:%i", __func__, __LINE__); \
- conn_event_do(c, conn, event); \
- } while (0)
-
/**
* tcp_rtt_dst_low() - Check if low RTT was seen for connection endpoint
* @conn: Connection pointer
@@ -916,104 +758,6 @@ static void tcp_update_check_tcp6(struct ipv6hdr *ip6h, struct tcphdr *th)
th->check = csum(th, l4len, sum);
}
-/**
- * tcp_update_l2_buf() - Update Ethernet header buffers with addresses
- * @eth_d: Ethernet destination address, NULL if unchanged
- * @eth_s: Ethernet source address, NULL if unchanged
- */
-void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s)
-{
- eth_update_mac(&tcp4_eth_src, eth_d, eth_s);
- eth_update_mac(&tcp6_eth_src, eth_d, eth_s);
-}
-
-/**
- * tcp_sock4_iov_init() - Initialise scatter-gather L2 buffers for IPv4 sockets
- * @c: Execution context
- */
-static void tcp_sock4_iov_init(const struct ctx *c)
-{
- struct iphdr iph = L2_BUF_IP4_INIT(IPPROTO_TCP);
- struct iovec *iov;
- int i;
-
- tcp4_eth_src.h_proto = htons_constant(ETH_P_IP);
-
- for (i = 0; i < ARRAY_SIZE(tcp4_payload); i++) {
- tcp4_payload_ip[i] = iph;
- tcp4_payload[i].th.doff = sizeof(struct tcphdr) / 4;
- tcp4_payload[i].th.ack = 1;
- }
-
- for (i = 0; i < ARRAY_SIZE(tcp4_flags); i++) {
- tcp4_flags_ip[i] = iph;
- tcp4_flags[i].th.doff = sizeof(struct tcphdr) / 4;
- tcp4_flags[i].th.ack = 1;
- }
-
- for (i = 0; i < TCP_FRAMES_MEM; i++) {
- iov = tcp4_l2_iov[i];
-
- iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp4_payload_tap_hdr[i]);
- iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp4_eth_src);
- iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_payload_ip[i]);
- iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_payload[i];
- }
-
- for (i = 0; i < TCP_FRAMES_MEM; i++) {
- iov = tcp4_l2_flags_iov[i];
-
- iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp4_flags_tap_hdr[i]);
- iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src;
- iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp4_eth_src);
- iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_flags_ip[i]);
- iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_flags[i];
- }
-}
-
-/**
- * tcp_sock6_iov_init() - Initialise scatter-gather L2 buffers for IPv6 sockets
- * @c: Execution context
- */
-static void tcp_sock6_iov_init(const struct ctx *c)
-{
- struct ipv6hdr ip6 = L2_BUF_IP6_INIT(IPPROTO_TCP);
- struct iovec *iov;
- int i;
-
- tcp6_eth_src.h_proto = htons_constant(ETH_P_IPV6);
-
- for (i = 0; i < ARRAY_SIZE(tcp6_payload); i++) {
- tcp6_payload_ip[i] = ip6;
- tcp6_payload[i].th.doff = sizeof(struct tcphdr) / 4;
- tcp6_payload[i].th.ack = 1;
- }
-
- for (i = 0; i < ARRAY_SIZE(tcp6_flags); i++) {
- tcp6_flags_ip[i] = ip6;
- tcp6_flags[i].th.doff = sizeof(struct tcphdr) / 4;
- tcp6_flags[i].th .ack = 1;
- }
-
- for (i = 0; i < TCP_FRAMES_MEM; i++) {
- iov = tcp6_l2_iov[i];
-
- iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp6_payload_tap_hdr[i]);
- iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp6_eth_src);
- iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_payload_ip[i]);
- iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_payload[i];
- }
-
- for (i = 0; i < TCP_FRAMES_MEM; i++) {
- iov = tcp6_l2_flags_iov[i];
-
- iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp6_flags_tap_hdr[i]);
- iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp6_eth_src);
- iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_flags_ip[i]);
- iov[TCP_IOV_PAYLOAD].iov_base = &tcp6_flags[i];
- }
-}
-
/**
* tcp_opt_get() - Get option, and value if any, from TCP header
* @opts: Pointer to start of TCP options in header
@@ -1239,50 +983,6 @@ bool tcp_flow_defer(union flow *flow)
return true;
}
-static void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn);
-#define tcp_rst(c, conn) \
- do { \
- flow_dbg((conn), "TCP reset at %s:%i", __func__, __LINE__); \
- tcp_rst_do(c, conn); \
- } while (0)
-
-/**
- * tcp_flags_flush() - Send out buffers for segments with no data (flags)
- * @c: Execution context
- */
-static void tcp_flags_flush(const struct ctx *c)
-{
- tap_send_frames(c, &tcp6_l2_flags_iov[0][0], TCP_NUM_IOVS,
- tcp6_flags_used);
- tcp6_flags_used = 0;
-
- tap_send_frames(c, &tcp4_l2_flags_iov[0][0], TCP_NUM_IOVS,
- tcp4_flags_used);
- tcp4_flags_used = 0;
-}
-
-/**
- * tcp_payload_flush() - Send out buffers for segments with data
- * @c: Execution context
- */
-static void tcp_payload_flush(const struct ctx *c)
-{
- unsigned i;
- size_t m;
-
- m = tap_send_frames(c, &tcp6_l2_iov[0][0], TCP_NUM_IOVS,
- tcp6_payload_used);
- for (i = 0; i < m; i++)
- *tcp6_seq_update[i].seq += tcp6_seq_update[i].len;
- tcp6_payload_used = 0;
-
- m = tap_send_frames(c, &tcp4_l2_iov[0][0], TCP_NUM_IOVS,
- tcp4_payload_used);
- for (i = 0; i < m; i++)
- *tcp4_seq_update[i].seq += tcp4_seq_update[i].len;
- tcp4_payload_used = 0;
-}
-
/**
* tcp_defer_handler() - Handler for TCP deferred tasks
* @c: Execution context
@@ -1330,7 +1030,7 @@ static void tcp_fill_header(struct tcphdr *th,
*
* Return: The IPv4 payload length, host order
*/
-static size_t tcp_fill_headers4(const struct ctx *c,
+size_t tcp_fill_headers4(const struct ctx *c,
const struct tcp_tap_conn *conn,
struct tap_hdr *taph,
struct iphdr *iph, struct tcphdr *th,
@@ -1373,11 +1073,11 @@ static size_t tcp_fill_headers4(const struct ctx *c,
*
* Return: The IPv6 payload length, host order
*/
-static size_t tcp_fill_headers6(const struct ctx *c,
- const struct tcp_tap_conn *conn,
- struct tap_hdr *taph,
- struct ipv6hdr *ip6h, struct tcphdr *th,
- size_t dlen, uint32_t seq)
+size_t tcp_fill_headers6(const struct ctx *c,
+ const struct tcp_tap_conn *conn,
+ struct tap_hdr *taph,
+ struct ipv6hdr *ip6h, struct tcphdr *th,
+ size_t dlen, uint32_t seq)
{
size_t l4len = dlen + sizeof(*th);
@@ -1414,8 +1114,8 @@ static size_t tcp_fill_headers6(const struct ctx *c,
*
* Return: 1 if sequence or window were updated, 0 otherwise
*/
-static int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
- int force_seq, struct tcp_info *tinfo)
+int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
+ int force_seq, struct tcp_info *tinfo)
{
uint32_t prev_wnd_to_tap = conn->wnd_to_tap << conn->ws_to_tap;
uint32_t prev_ack_to_tap = conn->seq_ack_to_tap;
@@ -1534,7 +1234,7 @@ static void tcp_update_seqack_from_tap(const struct ctx *c,
* 0 if there is no flag to send
* 1 otherwise
*/
-static int tcp_fill_flag_header(struct ctx *c, struct tcp_tap_conn *conn,
+int tcp_fill_flag_header(struct ctx *c, struct tcp_tap_conn *conn,
int flags, struct tcphdr *th, char *data,
size_t *optlen)
{
@@ -1624,77 +1324,12 @@ static int tcp_fill_flag_header(struct ctx *c, struct tcp_tap_conn *conn,
return 1;
}
-static int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
-{
- struct tcp_flags_t *payload;
- size_t optlen = 0;
- struct iovec *iov;
- size_t l4len;
- int ret;
-
- if (CONN_V4(conn)) {
- iov = tcp4_l2_flags_iov[tcp4_flags_used++];
-
- payload = iov[TCP_IOV_PAYLOAD].iov_base;
-
- ret = tcp_fill_flag_header(c, conn, flags, &payload->th,
- payload->opts, &optlen);
- if (ret <= 0)
- return ret;
-
- l4len = tcp_fill_headers4(c, conn, iov[TCP_IOV_TAP].iov_base,
- iov[TCP_IOV_IP].iov_base,
- iov[TCP_IOV_PAYLOAD].iov_base, optlen,
- NULL, conn->seq_to_tap);
- } else {
- iov = tcp6_l2_flags_iov[tcp6_flags_used++];
-
- payload = iov[TCP_IOV_PAYLOAD].iov_base;
-
- ret = tcp_fill_flag_header(c, conn, flags, &payload->th,
- payload->opts, &optlen);
- if (ret <= 0)
- return ret;
-
- l4len = tcp_fill_headers6(c, conn, iov[TCP_IOV_TAP].iov_base,
- iov[TCP_IOV_IP].iov_base,
- iov[TCP_IOV_PAYLOAD].iov_base, optlen,
- conn->seq_to_tap);
- }
- iov[TCP_IOV_PAYLOAD].iov_len = l4len;
-
- if (flags & DUP_ACK) {
- struct iovec *dup_iov;
- int i;
-
- if (CONN_V4(conn))
- dup_iov = tcp4_l2_flags_iov[tcp4_flags_used++];
- else
- dup_iov = tcp6_l2_flags_iov[tcp6_flags_used++];
-
- for (i = 0; i < TCP_NUM_IOVS; i++)
- memcpy(dup_iov[i].iov_base, iov[i].iov_base,
- iov[i].iov_len);
- dup_iov[TCP_IOV_PAYLOAD].iov_len = iov[TCP_IOV_PAYLOAD].iov_len;
- }
-
- if (CONN_V4(conn)) {
- if (tcp4_flags_used > TCP_FRAMES_MEM - 2)
- tcp_flags_flush(c);
- } else {
- if (tcp6_flags_used > TCP_FRAMES_MEM - 2)
- tcp_flags_flush(c);
- }
-
- return 0;
-}
-
/**
* tcp_rst_do() - Reset a tap connection: send RST segment to tap, close socket
* @c: Execution context
* @conn: Connection pointer
*/
-static void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn)
+void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn)
{
if (conn->events == CLOSED)
return;
@@ -1808,14 +1443,6 @@ static int tcp_conn_new_sock(const struct ctx *c, sa_family_t af)
return s;
}
-static uint16_t tcp_buf_conn_tap_mss(const struct tcp_tap_conn *conn)
-{
- if (CONN_V4(conn))
- return MSS4;
-
- return MSS6;
-}
-
/**
* tcp_conn_sock() - Obtain a connectable socket in the host/init namespace
* @c: Execution context
@@ -2122,186 +1749,6 @@ static int tcp_sock_consume(const struct tcp_tap_conn *conn, uint32_t ack_seq)
return 0;
}
-/**
- * tcp_data_to_tap() - Finalise (queue) highest-numbered scatter-gather buffer
- * @c: Execution context
- * @conn: Connection pointer
- * @dlen: TCP payload length
- * @no_csum: Don't compute IPv4 checksum, use the one from previous buffer
- * @seq: Sequence number to be sent
- */
-static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
- ssize_t dlen, int no_csum, uint32_t seq)
-{
- uint32_t *seq_update = &conn->seq_to_tap;
- struct iovec *iov;
- size_t l4len;
-
- if (CONN_V4(conn)) {
- struct iovec *iov_prev = tcp4_l2_iov[tcp4_payload_used - 1];
- const uint16_t *check = NULL;
-
- if (no_csum) {
- struct iphdr *iph = iov_prev[TCP_IOV_IP].iov_base;
- check = &iph->check;
- }
-
- tcp4_seq_update[tcp4_payload_used].seq = seq_update;
- tcp4_seq_update[tcp4_payload_used].len = dlen;
-
- iov = tcp4_l2_iov[tcp4_payload_used++];
- l4len = tcp_fill_headers4(c, conn, iov[TCP_IOV_TAP].iov_base,
- iov[TCP_IOV_IP].iov_base,
- iov[TCP_IOV_PAYLOAD].iov_base, dlen,
- check, seq);
- iov[TCP_IOV_PAYLOAD].iov_len = l4len;
- if (tcp4_payload_used > TCP_FRAMES_MEM - 1)
- tcp_payload_flush(c);
- } else if (CONN_V6(conn)) {
- tcp6_seq_update[tcp6_payload_used].seq = seq_update;
- tcp6_seq_update[tcp6_payload_used].len = dlen;
-
- iov = tcp6_l2_iov[tcp6_payload_used++];
- l4len = tcp_fill_headers6(c, conn, iov[TCP_IOV_TAP].iov_base,
- iov[TCP_IOV_IP].iov_base,
- iov[TCP_IOV_PAYLOAD].iov_base, dlen,
- seq);
- iov[TCP_IOV_PAYLOAD].iov_len = l4len;
- if (tcp6_payload_used > TCP_FRAMES_MEM - 1)
- tcp_payload_flush(c);
- }
-}
-
-/**
- * tcp_data_from_sock() - Handle new data from socket, queue to tap, in window
- * @c: Execution context
- * @conn: Connection pointer
- *
- * Return: negative on connection reset, 0 otherwise
- *
- * #syscalls recvmsg
- */
-static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
-{
- uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap;
- int fill_bufs, send_bufs = 0, last_len, iov_rem = 0;
- int sendlen, len, dlen, v4 = CONN_V4(conn);
- int s = conn->sock, i, ret = 0;
- struct msghdr mh_sock = { 0 };
- uint16_t mss = MSS_GET(conn);
- uint32_t already_sent, seq;
- struct iovec *iov;
-
- already_sent = conn->seq_to_tap - conn->seq_ack_from_tap;
-
- if (SEQ_LT(already_sent, 0)) {
- /* RFC 761, section 2.1. */
- flow_trace(conn, "ACK sequence gap: ACK for %u, sent: %u",
- conn->seq_ack_from_tap, conn->seq_to_tap);
- conn->seq_to_tap = conn->seq_ack_from_tap;
- already_sent = 0;
- }
-
- if (!wnd_scaled || already_sent >= wnd_scaled) {
- conn_flag(c, conn, STALLED);
- conn_flag(c, conn, ACK_FROM_TAP_DUE);
- return 0;
- }
-
- /* Set up buffer descriptors we'll fill completely and partially. */
- fill_bufs = DIV_ROUND_UP(wnd_scaled - already_sent, mss);
- if (fill_bufs > TCP_FRAMES) {
- fill_bufs = TCP_FRAMES;
- iov_rem = 0;
- } else {
- iov_rem = (wnd_scaled - already_sent) % mss;
- }
-
- mh_sock.msg_iov = iov_sock;
- mh_sock.msg_iovlen = fill_bufs + 1;
-
- iov_sock[0].iov_base = tcp_buf_discard;
- iov_sock[0].iov_len = already_sent;
-
- if (( v4 && tcp4_payload_used + fill_bufs > TCP_FRAMES_MEM) ||
- (!v4 && tcp6_payload_used + fill_bufs > TCP_FRAMES_MEM)) {
- tcp_payload_flush(c);
-
- /* Silence Coverity CWE-125 false positive */
- tcp4_payload_used = tcp6_payload_used = 0;
- }
-
- for (i = 0, iov = iov_sock + 1; i < fill_bufs; i++, iov++) {
- if (v4)
- iov->iov_base = &tcp4_payload[tcp4_payload_used + i].data;
- else
- iov->iov_base = &tcp6_payload[tcp6_payload_used + i].data;
- iov->iov_len = mss;
- }
- if (iov_rem)
- iov_sock[fill_bufs].iov_len = iov_rem;
-
- /* Receive into buffers, don't dequeue until acknowledged by guest. */
- do
- len = recvmsg(s, &mh_sock, MSG_PEEK);
- while (len < 0 && errno == EINTR);
-
- if (len < 0)
- goto err;
-
- if (!len) {
- if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) == SOCK_FIN_RCVD) {
- if ((ret = tcp_send_flag(c, conn, FIN | ACK))) {
- tcp_rst(c, conn);
- return ret;
- }
-
- conn_event(c, conn, TAP_FIN_SENT);
- }
-
- return 0;
- }
-
- sendlen = len - already_sent;
- if (sendlen <= 0) {
- conn_flag(c, conn, STALLED);
- return 0;
- }
-
- conn_flag(c, conn, ~STALLED);
-
- send_bufs = DIV_ROUND_UP(sendlen, mss);
- last_len = sendlen - (send_bufs - 1) * mss;
-
- /* Likely, some new data was acked too. */
- tcp_update_seqack_wnd(c, conn, 0, NULL);
-
- /* Finally, queue to tap */
- dlen = mss;
- seq = conn->seq_to_tap;
- for (i = 0; i < send_bufs; i++) {
- int no_csum = i && i != send_bufs - 1 && tcp4_payload_used;
-
- if (i == send_bufs - 1)
- dlen = last_len;
-
- tcp_data_to_tap(c, conn, dlen, no_csum, seq);
- seq += dlen;
- }
-
- conn_flag(c, conn, ACK_FROM_TAP_DUE);
-
- return 0;
-
-err:
- if (errno != EAGAIN && errno != EWOULDBLOCK) {
- ret = -errno;
- tcp_rst(c, conn);
- }
-
- return ret;
-}
-
/**
* tcp_data_from_tap() - tap/guest data for established connection
* @c: Execution context
diff --git a/tcp_buf.c b/tcp_buf.c
new file mode 100644
index 000000000000..ea1e72875ec5
--- /dev/null
+++ b/tcp_buf.c
@@ -0,0 +1,543 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+/* PASST - Plug A Simple Socket Transport
+ * for qemu/UNIX domain socket mode
+ *
+ * PASTA - Pack A Subtle Tap Abstraction
+ * for network namespace/tap device mode
+ *
+ * tcp_buf.c - TCP L2-L4 translation state machine
+ *
+ * Copyright (c) 2020-2022 Red Hat GmbH
+ * Author: Stefano Brivio <sbrivio@redhat.com>
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+#include <limits.h>
+#include <string.h>
+#include <errno.h>
+
+#include <netinet/ip.h>
+
+#include <linux/tcp.h>
+
+#include "util.h"
+#include "ip.h"
+#include "iov.h"
+#include "passt.h"
+#include "tap.h"
+#include "siphash.h"
+#include "inany.h"
+#include "tcp_conn.h"
+#include "tcp_internal.h"
+#include "tcp_buf.h"
+
+#define TCP_FRAMES_MEM 128
+#define TCP_FRAMES \
+ (c->mode == MODE_PASST ? TCP_FRAMES_MEM : 1)
+
+#define MSS4 ROUND_DOWN(IP_MAX_MTU - \
+ sizeof(struct tcphdr) - \
+ sizeof(struct iphdr), \
+ sizeof(uint32_t))
+#define MSS6 ROUND_DOWN(IP_MAX_MTU - \
+ sizeof(struct tcphdr) - \
+ sizeof(struct ipv6hdr), \
+ sizeof(uint32_t))
+
+/**
+ * struct tcp_buf_seq_update - Sequences to update with length of frames once sent
+ * @seq: Pointer to sequence number sent to tap-side, to be updated
+ * @len: TCP payload length
+ */
+struct tcp_buf_seq_update {
+ uint32_t *seq;
+ uint16_t len;
+};
+
+/* Static buffers */
+/**
+ * struct tcp_payload_t - TCP header and data to send segments with payload
+ * @th: TCP header
+ * @data: TCP data
+ */
+struct tcp_payload_t {
+ struct tcphdr th;
+ uint8_t data[IP_MAX_MTU - sizeof(struct tcphdr)];
+#ifdef __AVX2__
+} __attribute__ ((packed, aligned(32))); /* For AVX2 checksum routines */
+#else
+} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
+#endif
+
+/**
+ * struct tcp_flags_t - TCP header and data to send zero-length
+ * segments (flags)
+ * @th: TCP header
+ * @opts: TCP options
+ */
+struct tcp_flags_t {
+ struct tcphdr th;
+ char opts[OPT_MSS_LEN + OPT_WS_LEN + 1];
+#ifdef __AVX2__
+} __attribute__ ((packed, aligned(32)));
+#else
+} __attribute__ ((packed, aligned(__alignof__(unsigned int))));
+#endif
+
+/* Ethernet header for IPv4 frames */
+static struct ethhdr tcp4_eth_src;
+
+static struct tap_hdr tcp4_payload_tap_hdr[TCP_FRAMES_MEM];
+/* IPv4 headers */
+static struct iphdr tcp4_payload_ip[TCP_FRAMES_MEM];
+/* TCP segments with payload for IPv4 frames */
+static struct tcp_payload_t tcp4_payload[TCP_FRAMES_MEM];
+
+static_assert(MSS4 <= sizeof(tcp4_payload[0].data), "MSS4 is greater than 65516");
+
+static struct tcp_buf_seq_update tcp4_seq_update[TCP_FRAMES_MEM];
+static unsigned int tcp4_payload_used;
+
+static struct tap_hdr tcp4_flags_tap_hdr[TCP_FRAMES_MEM];
+/* IPv4 headers for TCP segment without payload */
+static struct iphdr tcp4_flags_ip[TCP_FRAMES_MEM];
+/* TCP segments without payload for IPv4 frames */
+static struct tcp_flags_t tcp4_flags[TCP_FRAMES_MEM];
+
+static unsigned int tcp4_flags_used;
+
+/* Ethernet header for IPv6 frames */
+static struct ethhdr tcp6_eth_src;
+
+static struct tap_hdr tcp6_payload_tap_hdr[TCP_FRAMES_MEM];
+/* IPv6 headers */
+static struct ipv6hdr tcp6_payload_ip[TCP_FRAMES_MEM];
+/* TCP headers and data for IPv6 frames */
+static struct tcp_payload_t tcp6_payload[TCP_FRAMES_MEM];
+
+static_assert(MSS6 <= sizeof(tcp6_payload[0].data), "MSS6 is greater than 65516");
+
+static struct tcp_buf_seq_update tcp6_seq_update[TCP_FRAMES_MEM];
+static unsigned int tcp6_payload_used;
+
+static struct tap_hdr tcp6_flags_tap_hdr[TCP_FRAMES_MEM];
+/* IPv6 headers for TCP segment without payload */
+static struct ipv6hdr tcp6_flags_ip[TCP_FRAMES_MEM];
+/* TCP segment without payload for IPv6 frames */
+static struct tcp_flags_t tcp6_flags[TCP_FRAMES_MEM];
+
+static unsigned int tcp6_flags_used;
+
+/* recvmsg()/sendmsg() data for tap */
+static struct iovec iov_sock [TCP_FRAMES_MEM + 1];
+
+/**
+ * enum tcp_iov_parts - I/O vector parts for one TCP frame
+ * @TCP_IOV_TAP: tap backend specific header
+ * @TCP_IOV_ETH: Ethernet header
+ * @TCP_IOV_IP: IP (v4/v6) header
+ * @TCP_IOV_PAYLOAD: IP payload (TCP header + data)
+ * @TCP_NUM_IOVS: the number of entries in the iovec array
+ */
+enum tcp_iov_parts {
+ TCP_IOV_TAP = 0,
+ TCP_IOV_ETH = 1,
+ TCP_IOV_IP = 2,
+ TCP_IOV_PAYLOAD = 3,
+ TCP_NUM_IOVS
+};
+
+static struct iovec tcp4_l2_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS];
+static struct iovec tcp6_l2_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS];
+static struct iovec tcp4_l2_flags_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS];
+static struct iovec tcp6_l2_flags_iov [TCP_FRAMES_MEM][TCP_NUM_IOVS];
+
+/**
+ * tcp_update_l2_buf() - Update Ethernet header buffers with addresses
+ * @eth_d: Ethernet destination address, NULL if unchanged
+ * @eth_s: Ethernet source address, NULL if unchanged
+ *
+ * Both the IPv4 and the IPv6 Ethernet headers are updated: each of them is
+ * referenced by the TCP_IOV_ETH part of every frame of that IP version, so
+ * there is nothing to update per frame.
+ */
+void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s)
+{
+ eth_update_mac(&tcp4_eth_src, eth_d, eth_s);
+ eth_update_mac(&tcp6_eth_src, eth_d, eth_s);
+}
+
+/**
+ * tcp_sock4_iov_init() - Initialise scatter-gather L2 buffers for IPv4 sockets
+ * @c:		Execution context
+ *
+ * Fill in the IPv4 and TCP header templates, then point the I/O vectors of
+ * frames with data (tcp4_l2_iov) and of frames without data
+ * (tcp4_l2_flags_iov) to their buffers. Only the base of the TCP_IOV_PAYLOAD
+ * part is set here: its length is set for each frame when it is queued.
+ */
+void tcp_sock4_iov_init(const struct ctx *c)
+{
+	struct iphdr iph = L2_BUF_IP4_INIT(IPPROTO_TCP);
+	struct iovec *iov;
+	size_t i;
+
+	tcp4_eth_src.h_proto = htons_constant(ETH_P_IP);
+
+	for (i = 0; i < ARRAY_SIZE(tcp4_payload); i++) {
+		tcp4_payload_ip[i] = iph;
+		tcp4_payload[i].th.doff = sizeof(struct tcphdr) / 4;
+		tcp4_payload[i].th.ack = 1;
+	}
+
+	for (i = 0; i < ARRAY_SIZE(tcp4_flags); i++) {
+		tcp4_flags_ip[i] = iph;
+		tcp4_flags[i].th.doff = sizeof(struct tcphdr) / 4;
+		tcp4_flags[i].th.ack = 1;
+	}
+
+	for (i = 0; i < TCP_FRAMES_MEM; i++) {
+		iov = tcp4_l2_iov[i];
+
+		iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp4_payload_tap_hdr[i]);
+		iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp4_eth_src);
+		iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_payload_ip[i]);
+		iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_payload[i];
+	}
+
+	for (i = 0; i < TCP_FRAMES_MEM; i++) {
+		iov = tcp4_l2_flags_iov[i];
+
+		/* The Ethernet header is shared by all the frames */
+		iov[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp4_flags_tap_hdr[i]);
+		iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp4_eth_src);
+		iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_flags_ip[i]);
+		iov[TCP_IOV_PAYLOAD].iov_base = &tcp4_flags[i];
+	}
+}
+
+/**
+ * tcp_sock6_iov_init() - Initialise scatter-gather L2 buffers for IPv6 sockets
+ * @c:		Execution context
+ *
+ * Prepare IPv6 and TCP header templates for frames with and without data,
+ * then set up the I/O vectors (tap header, Ethernet header, IPv6 header, IP
+ * payload) describing each of those frames.
+ */
+void tcp_sock6_iov_init(const struct ctx *c)
+{
+	struct ipv6hdr ip6h_init = L2_BUF_IP6_INIT(IPPROTO_TCP);
+	int n;
+
+	tcp6_eth_src.h_proto = htons_constant(ETH_P_IPV6);
+
+	/* Header templates: frames with data */
+	for (n = 0; n < ARRAY_SIZE(tcp6_payload); n++) {
+		struct tcphdr *th = &tcp6_payload[n].th;
+
+		tcp6_payload_ip[n] = ip6h_init;
+		th->doff = sizeof(*th) / 4;
+		th->ack = 1;
+	}
+
+	/* Header templates: frames without data */
+	for (n = 0; n < ARRAY_SIZE(tcp6_flags); n++) {
+		struct tcphdr *th = &tcp6_flags[n].th;
+
+		tcp6_flags_ip[n] = ip6h_init;
+		th->doff = sizeof(*th) / 4;
+		th->ack = 1;
+	}
+
+	/* I/O vectors: frames with data */
+	for (n = 0; n < TCP_FRAMES_MEM; n++) {
+		struct iovec *frame = tcp6_l2_iov[n];
+
+		frame[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp6_payload_tap_hdr[n]);
+		frame[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp6_eth_src);
+		frame[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_payload_ip[n]);
+		frame[TCP_IOV_PAYLOAD].iov_base = &tcp6_payload[n];
+	}
+
+	/* I/O vectors: frames without data */
+	for (n = 0; n < TCP_FRAMES_MEM; n++) {
+		struct iovec *frame = tcp6_l2_flags_iov[n];
+
+		frame[TCP_IOV_TAP] = tap_hdr_iov(c, &tcp6_flags_tap_hdr[n]);
+		frame[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp6_eth_src);
+		frame[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_flags_ip[n]);
+		frame[TCP_IOV_PAYLOAD].iov_base = &tcp6_flags[n];
+	}
+}
+
+/**
+ * tcp_flags_flush() - Send out buffers for segments with no data (flags)
+ * @c: Execution context
+ *
+ * Queued IPv6 frames are passed to tap_send_frames() first, then IPv4 ones.
+ * Both counters of used buffers are reset, and the return value of
+ * tap_send_frames() is not checked here.
+ */
+void tcp_flags_flush(const struct ctx *c)
+{
+ tap_send_frames(c, &tcp6_l2_flags_iov[0][0], TCP_NUM_IOVS,
+ tcp6_flags_used);
+ tcp6_flags_used = 0;
+
+ tap_send_frames(c, &tcp4_l2_flags_iov[0][0], TCP_NUM_IOVS,
+ tcp4_flags_used);
+ tcp4_flags_used = 0;
+}
+
+/**
+ * tcp_payload_flush() - Send out buffers for segments with data
+ * @c: Execution context
+ *
+ * For each IP version, the value returned by tap_send_frames() (presumably
+ * the number of frames actually sent, to be confirmed against tap.c) is the
+ * number of tcp{4,6}_seq_update entries that are applied: the sequence
+ * numbers they point to are advanced by the recorded payload length, and
+ * remaining entries are dropped. The counter of used buffers is reset in any
+ * case.
+ */
+void tcp_payload_flush(const struct ctx *c)
+{
+ unsigned i;
+ size_t m;
+
+ m = tap_send_frames(c, &tcp6_l2_iov[0][0], TCP_NUM_IOVS,
+ tcp6_payload_used);
+ for (i = 0; i < m; i++)
+ *tcp6_seq_update[i].seq += tcp6_seq_update[i].len;
+ tcp6_payload_used = 0;
+
+ m = tap_send_frames(c, &tcp4_l2_iov[0][0], TCP_NUM_IOVS,
+ tcp4_payload_used);
+ for (i = 0; i < m; i++)
+ *tcp4_seq_update[i].seq += tcp4_seq_update[i].len;
+ tcp4_payload_used = 0;
+}
+
+/**
+ * tcp_send_flag() - Queue a segment without data (flags) to be sent to the tap
+ * @c:		Execution context
+ * @conn:	Connection pointer
+ * @flags:	TCP flags: if DUP_ACK is set, the same frame is queued twice
+ *
+ * Return: value returned by tcp_fill_flag_header() if zero or negative (no
+ *	   frame is queued in that case), 0 once the frame is queued
+ */
+int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags)
+{
+	struct iovec (*frames)[TCP_NUM_IOVS];
+	struct tcp_flags_t *payload;
+	unsigned int *used;
+	size_t optlen = 0;
+	struct iovec *iov;
+	size_t l4len;
+	int ret;
+
+	if (CONN_V4(conn)) {
+		frames = tcp4_l2_flags_iov;
+		used = &tcp4_flags_used;
+	} else {
+		frames = tcp6_l2_flags_iov;
+		used = &tcp6_flags_used;
+	}
+
+	iov = frames[(*used)++];
+	payload = iov[TCP_IOV_PAYLOAD].iov_base;
+
+	ret = tcp_fill_flag_header(c, conn, flags, &payload->th,
+				   payload->opts, &optlen);
+	if (ret <= 0) {
+		/* Nothing to send: give the buffer back, otherwise the next
+		 * flush would send a frame with stale contents
+		 */
+		(*used)--;
+		return ret;
+	}
+
+	if (CONN_V4(conn)) {
+		l4len = tcp_fill_headers4(c, conn, iov[TCP_IOV_TAP].iov_base,
+					  iov[TCP_IOV_IP].iov_base,
+					  iov[TCP_IOV_PAYLOAD].iov_base, optlen,
+					  NULL, conn->seq_to_tap);
+	} else {
+		l4len = tcp_fill_headers6(c, conn, iov[TCP_IOV_TAP].iov_base,
+					  iov[TCP_IOV_IP].iov_base,
+					  iov[TCP_IOV_PAYLOAD].iov_base, optlen,
+					  conn->seq_to_tap);
+	}
+	iov[TCP_IOV_PAYLOAD].iov_len = l4len;
+
+	if (flags & DUP_ACK) {
+		struct iovec *dup_iov = frames[(*used)++];
+		int i;
+
+		for (i = 0; i < TCP_NUM_IOVS; i++) {
+			/* Parts shared between frames (the Ethernet header)
+			 * don't need a copy, and memcpy() on overlapping
+			 * objects is undefined behaviour
+			 */
+			if (dup_iov[i].iov_base == iov[i].iov_base)
+				continue;
+
+			memcpy(dup_iov[i].iov_base, iov[i].iov_base,
+			       iov[i].iov_len);
+		}
+		dup_iov[TCP_IOV_PAYLOAD].iov_len = iov[TCP_IOV_PAYLOAD].iov_len;
+	}
+
+	/* Keep room for two frames (flags and duplicate) for the next call */
+	if (*used > TCP_FRAMES_MEM - 2)
+		tcp_flags_flush(c);
+
+	return 0;
+}
+
+/**
+ * tcp_buf_conn_tap_mss() - Maximum segment size for frames built from buffers
+ * @conn:	Connection pointer
+ *
+ * Return: MSS4 for IPv4 connections, MSS6 otherwise
+ */
+uint16_t tcp_buf_conn_tap_mss(const struct tcp_tap_conn *conn)
+{
+	return CONN_V4(conn) ? MSS4 : MSS6;
+}
+
+/**
+ * tcp_data_to_tap() - Finalise (queue) highest-numbered scatter-gather buffer
+ * @c:		Execution context
+ * @conn:	Connection pointer
+ * @dlen:	TCP payload length
+ * @no_csum:	Don't compute IPv4 checksum, use the one from previous buffer:
+ *		only valid if at least one IPv4 frame is already queued
+ * @seq:	Sequence number to be sent
+ *
+ * The sequence to update and the payload length are recorded for this frame,
+ * so that tcp_payload_flush() can advance the sequence once the frame is sent.
+ * Buffers are flushed right away once all the frames are in use.
+ */
+void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
+		     ssize_t dlen, int no_csum, uint32_t seq)
+{
+	uint32_t *seq_update = &conn->seq_to_tap;
+	struct iovec *iov;
+	size_t l4len;
+
+	if (CONN_V4(conn)) {
+		const uint16_t *check = NULL;
+
+		if (no_csum) {
+			/* Look up the previous frame only if we need it: with
+			 * no frames queued, the index would wrap around
+			 */
+			struct iovec *iov_prev;
+			struct iphdr *iph;
+
+			iov_prev = tcp4_l2_iov[tcp4_payload_used - 1];
+			iph = iov_prev[TCP_IOV_IP].iov_base;
+			check = &iph->check;
+		}
+
+		tcp4_seq_update[tcp4_payload_used].seq = seq_update;
+		tcp4_seq_update[tcp4_payload_used].len = dlen;
+
+		iov = tcp4_l2_iov[tcp4_payload_used++];
+		l4len = tcp_fill_headers4(c, conn, iov[TCP_IOV_TAP].iov_base,
+					  iov[TCP_IOV_IP].iov_base,
+					  iov[TCP_IOV_PAYLOAD].iov_base, dlen,
+					  check, seq);
+		iov[TCP_IOV_PAYLOAD].iov_len = l4len;
+		if (tcp4_payload_used > TCP_FRAMES_MEM - 1)
+			tcp_payload_flush(c);
+	} else if (CONN_V6(conn)) {
+		tcp6_seq_update[tcp6_payload_used].seq = seq_update;
+		tcp6_seq_update[tcp6_payload_used].len = dlen;
+
+		iov = tcp6_l2_iov[tcp6_payload_used++];
+		l4len = tcp_fill_headers6(c, conn, iov[TCP_IOV_TAP].iov_base,
+					  iov[TCP_IOV_IP].iov_base,
+					  iov[TCP_IOV_PAYLOAD].iov_base, dlen,
+					  seq);
+		iov[TCP_IOV_PAYLOAD].iov_len = l4len;
+		if (tcp6_payload_used > TCP_FRAMES_MEM - 1)
+			tcp_payload_flush(c);
+	}
+}
+
+/**
+ * tcp_data_from_sock() - Handle new data from socket, queue to tap, in window
+ * @c: Execution context
+ * @conn: Connection pointer
+ *
+ * Data is peeked (MSG_PEEK) rather than dequeued from the socket: the first
+ * receive vector points to a discard buffer as long as the data already sent
+ * and not acknowledged yet, the following ones to per-frame payload buffers
+ * of at most one MSS each. If the window from the tap side is zero or already
+ * filled, the connection is flagged as STALLED and nothing is read. If the
+ * socket returns no data, a FIN segment is sent if SOCK_FIN_RCVD is set and
+ * TAP_FIN_SENT isn't. Errors other than EAGAIN and EWOULDBLOCK reset the
+ * connection.
+ *
+ * Return: negative on connection reset, 0 otherwise
+ *
+ * #syscalls recvmsg
+ */
+int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn)
+{
+ uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap;
+ int fill_bufs, send_bufs = 0, last_len, iov_rem = 0;
+ int sendlen, len, dlen, v4 = CONN_V4(conn);
+ int s = conn->sock, i, ret = 0;
+ struct msghdr mh_sock = { 0 };
+ uint16_t mss = MSS_GET(conn);
+ uint32_t already_sent, seq;
+ struct iovec *iov;
+
+ already_sent = conn->seq_to_tap - conn->seq_ack_from_tap;
+
+ if (SEQ_LT(already_sent, 0)) {
+ /* RFC 761, section 2.1. */
+ flow_trace(conn, "ACK sequence gap: ACK for %u, sent: %u",
+ conn->seq_ack_from_tap, conn->seq_to_tap);
+ conn->seq_to_tap = conn->seq_ack_from_tap;
+ already_sent = 0;
+ }
+
+ if (!wnd_scaled || already_sent >= wnd_scaled) {
+ conn_flag(c, conn, STALLED);
+ conn_flag(c, conn, ACK_FROM_TAP_DUE);
+ return 0;
+ }
+
+ /* Set up buffer descriptors we'll fill completely and partially. */
+ fill_bufs = DIV_ROUND_UP(wnd_scaled - already_sent, mss);
+ if (fill_bufs > TCP_FRAMES) {
+ fill_bufs = TCP_FRAMES;
+ iov_rem = 0;
+ } else {
+ iov_rem = (wnd_scaled - already_sent) % mss;
+ }
+
+ mh_sock.msg_iov = iov_sock;
+ mh_sock.msg_iovlen = fill_bufs + 1;
+
+ iov_sock[0].iov_base = tcp_buf_discard;
+ iov_sock[0].iov_len = already_sent;
+
+ if (( v4 && tcp4_payload_used + fill_bufs > TCP_FRAMES_MEM) ||
+ (!v4 && tcp6_payload_used + fill_bufs > TCP_FRAMES_MEM)) {
+ tcp_payload_flush(c);
+
+ /* Silence Coverity CWE-125 false positive */
+ tcp4_payload_used = tcp6_payload_used = 0;
+ }
+
+ for (i = 0, iov = iov_sock + 1; i < fill_bufs; i++, iov++) {
+ if (v4)
+ iov->iov_base = &tcp4_payload[tcp4_payload_used + i].data;
+ else
+ iov->iov_base = &tcp6_payload[tcp6_payload_used + i].data;
+ iov->iov_len = mss;
+ }
+ if (iov_rem)
+ iov_sock[fill_bufs].iov_len = iov_rem;
+
+ /* Receive into buffers, don't dequeue until acknowledged by guest. */
+ do
+ len = recvmsg(s, &mh_sock, MSG_PEEK);
+ while (len < 0 && errno == EINTR);
+
+ if (len < 0)
+ goto err;
+
+ if (!len) {
+ if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) == SOCK_FIN_RCVD) {
+ if ((ret = tcp_send_flag(c, conn, FIN | ACK))) {
+ tcp_rst(c, conn);
+ return ret;
+ }
+
+ conn_event(c, conn, TAP_FIN_SENT);
+ }
+
+ return 0;
+ }
+
+ sendlen = len - already_sent;
+ if (sendlen <= 0) {
+ conn_flag(c, conn, STALLED);
+ return 0;
+ }
+
+ conn_flag(c, conn, ~STALLED);
+
+ send_bufs = DIV_ROUND_UP(sendlen, mss);
+ last_len = sendlen - (send_bufs - 1) * mss;
+
+ /* Likely, some new data was acked too. */
+ tcp_update_seqack_wnd(c, conn, 0, NULL);
+
+ /* Finally, queue to tap */
+ dlen = mss;
+ seq = conn->seq_to_tap;
+ for (i = 0; i < send_bufs; i++) {
+ int no_csum = i && i != send_bufs - 1 && tcp4_payload_used;
+
+ if (i == send_bufs - 1)
+ dlen = last_len;
+
+ tcp_data_to_tap(c, conn, dlen, no_csum, seq);
+ seq += dlen;
+ }
+
+ conn_flag(c, conn, ACK_FROM_TAP_DUE);
+
+ return 0;
+
+err:
+ if (errno != EAGAIN && errno != EWOULDBLOCK) {
+ ret = -errno;
+ tcp_rst(c, conn);
+ }
+
+ return ret;
+}
diff --git a/tcp_buf.h b/tcp_buf.h
new file mode 100644
index 000000000000..57ad4a2a289c
--- /dev/null
+++ b/tcp_buf.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later
+ * Copyright (c) 2021 Red Hat GmbH
+ * Author: Stefano Brivio <sbrivio@redhat.com>
+ */
+
+#ifndef TCP_BUF_H
+#define TCP_BUF_H
+
+void tcp_sock4_iov_init(const struct ctx *c);
+void tcp_sock6_iov_init(const struct ctx *c);
+void tcp_flags_flush(const struct ctx *c);
+void tcp_payload_flush(const struct ctx *c);
+uint16_t tcp_buf_conn_tap_mss(const struct tcp_tap_conn *conn);
+int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn);
+int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags);
+
+#endif /* TCP_BUF_H */
diff --git a/tcp_internal.h b/tcp_internal.h
new file mode 100644
index 000000000000..6155c7c23afb
--- /dev/null
+++ b/tcp_internal.h
@@ -0,0 +1,79 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later
+ * Copyright (c) 2021 Red Hat GmbH
+ * Author: Stefano Brivio <sbrivio@redhat.com>
+ */
+
+#ifndef TCP_INTERNAL_H
+#define TCP_INTERNAL_H
+
+#define MAX_WS 8
+#define MAX_WINDOW (1 << (16 + (MAX_WS)))
+
+#define SEQ_LE(a, b) ((b) - (a) < MAX_WINDOW)
+#define SEQ_LT(a, b) ((b) - (a) - 1 < MAX_WINDOW)
+#define SEQ_GE(a, b) ((a) - (b) < MAX_WINDOW)
+#define SEQ_GT(a, b) ((a) - (b) - 1 < MAX_WINDOW)
+
+#define FIN (1 << 0)
+#define SYN (1 << 1)
+#define RST (1 << 2)
+#define ACK (1 << 4)
+
+/* Flags for internal usage */
+#define DUP_ACK (1 << 5)
+#define OPT_EOL 0
+#define OPT_NOP 1
+#define OPT_MSS 2
+#define OPT_MSS_LEN 4
+#define OPT_WS 3
+#define OPT_WS_LEN 3
+#define OPT_SACKP 4
+#define OPT_SACK 5
+#define OPT_TS 8
+
+#define CONN_V4(conn) (!!inany_v4(&(conn)->faddr))
+#define CONN_V6(conn) (!CONN_V4(conn))
+
+extern char tcp_buf_discard [MAX_WINDOW];
+
+void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn,
+ unsigned long flag);
+#define conn_flag(c, conn, flag) \
+ do { \
+ flow_trace(conn, "flag at %s:%i", __func__, __LINE__); \
+ conn_flag_do(c, conn, flag); \
+ } while (0)
+
+
+void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn,
+ unsigned long event);
+#define conn_event(c, conn, event) \
+ do { \
+ flow_trace(conn, "event at %s:%i", __func__, __LINE__); \
+ conn_event_do(c, conn, event); \
+ } while (0)
+
+void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn);
+#define tcp_rst(c, conn) \
+ do { \
+ flow_dbg((conn), "TCP reset at %s:%i", __func__, __LINE__); \
+ tcp_rst_do(c, conn); \
+ } while (0)
+
+size_t tcp_fill_headers4(const struct ctx *c,
+ const struct tcp_tap_conn *conn,
+ struct tap_hdr *taph,
+ struct iphdr *iph, struct tcphdr *th,
+ size_t dlen, const uint16_t *check,
+ uint32_t seq);
+size_t tcp_fill_headers6(const struct ctx *c,
+ const struct tcp_tap_conn *conn,
+ struct tap_hdr *taph,
+ struct ipv6hdr *ip6h, struct tcphdr *th,
+ size_t dlen, uint32_t seq);
+int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
+ int force_seq, struct tcp_info *tinfo);
+int tcp_fill_flag_header(struct ctx *c, struct tcp_tap_conn *conn, int flags,
+ struct tcphdr *th, char *data, size_t *optlen);
+
+#endif /* TCP_INTERNAL_H */
--
@@ -0,0 +1,79 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later
+ * Copyright (c) 2021 Red Hat GmbH
+ * Author: Stefano Brivio <sbrivio@redhat.com>
+ */
+
+#ifndef TCP_INTERNAL_H
+#define TCP_INTERNAL_H
+
+#define MAX_WS 8
+#define MAX_WINDOW (1 << (16 + (MAX_WS)))
+
+#define SEQ_LE(a, b) ((b) - (a) < MAX_WINDOW)
+#define SEQ_LT(a, b) ((b) - (a) - 1 < MAX_WINDOW)
+#define SEQ_GE(a, b) ((a) - (b) < MAX_WINDOW)
+#define SEQ_GT(a, b) ((a) - (b) - 1 < MAX_WINDOW)
+
+#define FIN (1 << 0)
+#define SYN (1 << 1)
+#define RST (1 << 2)
+#define ACK (1 << 4)
+
+/* Flags for internal usage */
+#define DUP_ACK (1 << 5)
+#define OPT_EOL 0
+#define OPT_NOP 1
+#define OPT_MSS 2
+#define OPT_MSS_LEN 4
+#define OPT_WS 3
+#define OPT_WS_LEN 3
+#define OPT_SACKP 4
+#define OPT_SACK 5
+#define OPT_TS 8
+
+#define CONN_V4(conn) (!!inany_v4(&(conn)->faddr))
+#define CONN_V6(conn) (!CONN_V4(conn))
+
+extern char tcp_buf_discard [MAX_WINDOW];
+
+void conn_flag_do(const struct ctx *c, struct tcp_tap_conn *conn,
+ unsigned long flag);
+#define conn_flag(c, conn, flag) \
+ do { \
+ flow_trace(conn, "flag at %s:%i", __func__, __LINE__); \
+ conn_flag_do(c, conn, flag); \
+ } while (0)
+
+
+void conn_event_do(const struct ctx *c, struct tcp_tap_conn *conn,
+ unsigned long event);
+#define conn_event(c, conn, event) \
+ do { \
+ flow_trace(conn, "event at %s:%i", __func__, __LINE__); \
+ conn_event_do(c, conn, event); \
+ } while (0)
+
+void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn);
+#define tcp_rst(c, conn) \
+ do { \
+ flow_dbg((conn), "TCP reset at %s:%i", __func__, __LINE__); \
+ tcp_rst_do(c, conn); \
+ } while (0)
+
+size_t tcp_fill_headers4(const struct ctx *c,
+ const struct tcp_tap_conn *conn,
+ struct tap_hdr *taph,
+ struct iphdr *iph, struct tcphdr *th,
+ size_t dlen, const uint16_t *check,
+ uint32_t seq);
+size_t tcp_fill_headers6(const struct ctx *c,
+ const struct tcp_tap_conn *conn,
+ struct tap_hdr *taph,
+ struct ipv6hdr *ip6h, struct tcphdr *th,
+ size_t dlen, uint32_t seq);
+int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn,
+ int force_seq, struct tcp_info *tinfo);
+int tcp_fill_flag_header(struct ctx *c, struct tcp_tap_conn *conn, int flags,
+ struct tcphdr *th, char *data, size_t *optlen);
+
+#endif /* TCP_INTERNAL_H */
--
2.44.0
^ permalink raw reply related [flat|nested] 9+ messages in thread
* [PATCH 5/8] tap: export pool_flush()/tapX_handler()/packet_add()
2024-05-27 9:10 [PATCH 0/8] Add vhost-user support to passt (part 2) Laurent Vivier
` (3 preceding siblings ...)
2024-05-27 9:10 ` [PATCH 4/8] tcp: move buffers management functions to their own file Laurent Vivier
@ 2024-05-27 9:10 ` Laurent Vivier
2024-05-27 9:10 ` [PATCH 6/8] udp: move udpX_l2_buf_t and udpX_l2_mh_sock out of udp_update_hdrX() Laurent Vivier
` (2 subsequent siblings)
7 siblings, 0 replies; 9+ messages in thread
From: Laurent Vivier @ 2024-05-27 9:10 UTC (permalink / raw)
To: passt-dev; +Cc: Laurent Vivier
Signed-off-by: Laurent Vivier <lvivier@redhat.com>
---
tap.c | 97 +++++++++++++++++++++++++++++------------------------------
tap.h | 7 +++++
2 files changed, 55 insertions(+), 49 deletions(-)
diff --git a/tap.c b/tap.c
index 91fd2e2ba785..027fb28abd68 100644
--- a/tap.c
+++ b/tap.c
@@ -920,6 +920,45 @@ append:
return in->count;
}
+/**
+ * pool_flush_all() - Flush both the IPv4 and the IPv6 tap packet pools
+ */
+void pool_flush_all(void)
+{
+	pool_flush(pool_tap4);
+	pool_flush(pool_tap6);
+}
+
+/**
+ * tap_handler_all() - Run the IPv4 handler, then the IPv6 one, on tap pools
+ * @c:		Execution context
+ * @now:	Current timestamp, passed on to both handlers
+ */
+void tap_handler_all(struct ctx *c, const struct timespec *now)
+{
+	tap4_handler(c, pool_tap4, now);
+	tap6_handler(c, pool_tap6, now);
+}
+
+/**
+ * packet_add_all_do() - Add an Ethernet frame to the pool matching its protocol
+ * @c:		Execution context
+ * @l2len:	Length of the Ethernet frame
+ * @p:		Start of the Ethernet frame
+ * @func:	Name of the calling function, passed on to packet_add_do()
+ * @line:	Line of the call, passed on to packet_add_do()
+ *
+ * The frame is handed to pcap(), the guest MAC address is updated if the
+ * source address of the frame differs from the stored one, then ARP and IPv4
+ * frames are added to pool_tap4, IPv6 frames to pool_tap6. Frames with any
+ * other protocol are not added to a pool.
+ *
+ * The Ethernet header is read without checking @l2len: both callers in this
+ * file discard frames shorter than an Ethernet header before calling this.
+ */
+void packet_add_all_do(struct ctx *c, ssize_t l2len, char *p,
+		       const char *func, int line)
+{
+	const struct ethhdr *eh = (struct ethhdr *)p;
+	uint16_t proto;
+
+	pcap(p, l2len);
+
+	if (memcmp(c->mac_guest, eh->h_source, ETH_ALEN)) {
+		memcpy(c->mac_guest, eh->h_source, ETH_ALEN);
+		proto_update_l2_buf(c->mac_guest, NULL);
+	}
+
+	proto = ntohs(eh->h_proto);
+	if (proto == ETH_P_ARP || proto == ETH_P_IP)
+		packet_add_do(pool_tap4, l2len, p, func, line);
+	else if (proto == ETH_P_IPV6)
+		packet_add_do(pool_tap6, l2len, p, func, line);
+}
+
/**
* tap_sock_reset() - Handle closing or failure of connect AF_UNIX socket
* @c: Execution context
@@ -946,7 +985,6 @@ static void tap_sock_reset(struct ctx *c)
void tap_handler_passt(struct ctx *c, uint32_t events,
const struct timespec *now)
{
- const struct ethhdr *eh;
ssize_t n, rem;
char *p;
@@ -959,8 +997,7 @@ redo:
p = pkt_buf;
rem = 0;
- pool_flush(pool_tap4);
- pool_flush(pool_tap6);
+ pool_flush_all();
n = recv(c->fd_tap, p, TAP_BUF_FILL, MSG_DONTWAIT);
if (n < 0) {
@@ -987,38 +1024,18 @@ redo:
/* Complete the partial read above before discarding a malformed
* frame, otherwise the stream will be inconsistent.
*/
- if (l2len < (ssize_t)sizeof(*eh) ||
+ if (l2len < (ssize_t)sizeof(struct ethhdr) ||
l2len > (ssize_t)ETH_MAX_MTU)
goto next;
- pcap(p, l2len);
-
- eh = (struct ethhdr *)p;
-
- if (memcmp(c->mac_guest, eh->h_source, ETH_ALEN)) {
- memcpy(c->mac_guest, eh->h_source, ETH_ALEN);
- proto_update_l2_buf(c->mac_guest, NULL);
- }
-
- switch (ntohs(eh->h_proto)) {
- case ETH_P_ARP:
- case ETH_P_IP:
- packet_add(pool_tap4, l2len, p);
- break;
- case ETH_P_IPV6:
- packet_add(pool_tap6, l2len, p);
- break;
- default:
- break;
- }
+ packet_add_all(c, l2len, p);
next:
p += l2len;
n -= l2len;
}
- tap4_handler(c, pool_tap4, now);
- tap6_handler(c, pool_tap6, now);
+ tap_handler_all(c, now);
/* We can't use EPOLLET otherwise. */
if (rem)
@@ -1043,35 +1060,18 @@ void tap_handler_pasta(struct ctx *c, uint32_t events,
redo:
n = 0;
- pool_flush(pool_tap4);
- pool_flush(pool_tap6);
+ pool_flush_all();
restart:
while ((len = read(c->fd_tap, pkt_buf + n, TAP_BUF_BYTES - n)) > 0) {
- const struct ethhdr *eh = (struct ethhdr *)(pkt_buf + n);
- if (len < (ssize_t)sizeof(*eh) || len > (ssize_t)ETH_MAX_MTU) {
+ if (len < (ssize_t)sizeof(struct ethhdr) ||
+ len > (ssize_t)ETH_MAX_MTU) {
n += len;
continue;
}
- pcap(pkt_buf + n, len);
- if (memcmp(c->mac_guest, eh->h_source, ETH_ALEN)) {
- memcpy(c->mac_guest, eh->h_source, ETH_ALEN);
- proto_update_l2_buf(c->mac_guest, NULL);
- }
-
- switch (ntohs(eh->h_proto)) {
- case ETH_P_ARP:
- case ETH_P_IP:
- packet_add(pool_tap4, len, pkt_buf + n);
- break;
- case ETH_P_IPV6:
- packet_add(pool_tap6, len, pkt_buf + n);
- break;
- default:
- break;
- }
+ packet_add_all(c, len, pkt_buf + n);
if ((n += len) == TAP_BUF_BYTES)
break;
@@ -1082,8 +1082,7 @@ restart:
ret = errno;
- tap4_handler(c, pool_tap4, now);
- tap6_handler(c, pool_tap6, now);
+ tap_handler_all(c, now);
if (len > 0 || ret == EAGAIN)
return;
diff --git a/tap.h b/tap.h
index d146d2f115f2..7bff945483de 100644
--- a/tap.h
+++ b/tap.h
@@ -69,5 +69,12 @@ void tap_handler_pasta(struct ctx *c, uint32_t events,
void tap_handler_passt(struct ctx *c, uint32_t events,
const struct timespec *now);
void tap_sock_init(struct ctx *c);
+void pool_flush_all(void);
+void tap_handler_all(struct ctx *c, const struct timespec *now);
+
+void packet_add_all_do(struct ctx *c, ssize_t l2len, char *p,
+ const char *func, int line);
+#define packet_add_all(p, l2len, start) \
+ packet_add_all_do(p, l2len, start, __func__, __LINE__)
#endif /* TAP_H */
--
@@ -69,5 +69,12 @@ void tap_handler_pasta(struct ctx *c, uint32_t events,
void tap_handler_passt(struct ctx *c, uint32_t events,
const struct timespec *now);
void tap_sock_init(struct ctx *c);
+void pool_flush_all(void);
+void tap_handler_all(struct ctx *c, const struct timespec *now);
+
+void packet_add_all_do(struct ctx *c, ssize_t l2len, char *p,
+ const char *func, int line);
+#define packet_add_all(p, l2len, start) \
+ packet_add_all_do(p, l2len, start, __func__, __LINE__)
#endif /* TAP_H */
--
2.44.0
^ permalink raw reply related [flat|nested] 9+ messages in thread
* [PATCH 6/8] udp: move udpX_l2_buf_t and udpX_l2_mh_sock out of udp_update_hdrX()
2024-05-27 9:10 [PATCH 0/8] Add vhost-user support to passt (part 2) Laurent Vivier
` (4 preceding siblings ...)
2024-05-27 9:10 ` [PATCH 5/8] tap: export pool_flush()/tapX_handler()/packet_add() Laurent Vivier
@ 2024-05-27 9:10 ` Laurent Vivier
2024-05-27 9:10 ` [PATCH 7/8] udp: rename udp_sock_handler() to udp_buf_sock_handler() Laurent Vivier
2024-05-27 9:10 ` [PATCH 8/8] vhost-user: compare mode MODE_PASTA and not MODE_PASST Laurent Vivier
7 siblings, 0 replies; 9+ messages in thread
From: Laurent Vivier @ 2024-05-27 9:10 UTC (permalink / raw)
To: passt-dev; +Cc: Laurent Vivier
Signed-off-by: Laurent Vivier <lvivier@redhat.com>
---
udp.c | 64 +++++++++++++++++++++++++++++++++--------------------------
1 file changed, 36 insertions(+), 28 deletions(-)
diff --git a/udp.c b/udp.c
index 3abafc994537..39c7c4e0a584 100644
--- a/udp.c
+++ b/udp.c
@@ -556,7 +556,9 @@ static void udp_splice_sendfrom(const struct ctx *c, unsigned start, unsigned n,
/**
* udp_update_hdr4() - Update headers for one IPv4 datagram
* @c: Execution context
- * @bm: Pointer to udp_meta_t to update
+ * @taph: Tap backend specific header
+ * @ip4h: Pre-filled IPv4 header (except for tot_len and saddr)
+ * @s_in: Source socket address, filled in by recvmmsg()
* @bp: Pointer to udp_payload_t to update
* @dstport: Destination port number
* @dlen: Length of UDP payload
@@ -564,16 +566,17 @@ static void udp_splice_sendfrom(const struct ctx *c, unsigned start, unsigned n,
*
* Return: size of IPv4 payload (UDP header + data)
*/
-static size_t udp_update_hdr4(const struct ctx *c,
- struct udp_meta_t *bm, struct udp_payload_t *bp,
+static size_t udp_update_hdr4(const struct ctx *c, struct tap_hdr *taph,
+ struct iphdr *ip4h, const struct sockaddr_in *s_in,
+ struct udp_payload_t *bp,
in_port_t dstport, size_t dlen,
const struct timespec *now)
{
- in_port_t srcport = ntohs(bm->s_in.sa4.sin_port);
+ in_port_t srcport = ntohs(s_in->sin_port);
const struct in_addr dst = c->ip4.addr_seen;
- struct in_addr src = bm->s_in.sa4.sin_addr;
+ struct in_addr src = s_in->sin_addr;
size_t l4len = dlen + sizeof(bp->uh);
- size_t l3len = l4len + sizeof(bm->ip4h);
+ size_t l3len = l4len + sizeof(*ip4h);
if (!IN4_IS_ADDR_UNSPECIFIED(&c->ip4.dns_match) &&
IN4_ARE_ADDR_EQUAL(&src, &c->ip4.dns_host) && srcport == 53 &&
@@ -594,24 +597,26 @@ static size_t udp_update_hdr4(const struct ctx *c,
src = c->ip4.gw;
}
- bm->ip4h.tot_len = htons(l3len);
- bm->ip4h.daddr = dst.s_addr;
- bm->ip4h.saddr = src.s_addr;
- bm->ip4h.check = csum_ip4_header(l3len, IPPROTO_UDP, src, dst);
+ ip4h->tot_len = htons(l3len);
+ ip4h->daddr = dst.s_addr;
+ ip4h->saddr = src.s_addr;
+ ip4h->check = csum_ip4_header(l3len, IPPROTO_UDP, src, dst);
- bp->uh.source = bm->s_in.sa4.sin_port;
+ bp->uh.source = s_in->sin_port;
bp->uh.dest = htons(dstport);
bp->uh.len = htons(l4len);
csum_udp4(&bp->uh, src, dst, bp->data, dlen);
- tap_hdr_update(&bm->taph, l3len + sizeof(udp4_eth_hdr));
+ tap_hdr_update(taph, l3len + sizeof(udp4_eth_hdr));
return l4len;
}
/**
* udp_update_hdr6() - Update headers for one IPv6 datagram
* @c: Execution context
- * @bm: Pointer to udp_meta_t to update
+ * @taph: Tap backend specific header
+ * @ip6h: Pre-filled IPv6 header (except for payload_len and addresses)
+ * @s_in6: Source socket address, filled in by recvmmsg()
* @bp: Pointer to udp_payload_t to update
* @dstport: Destination port number
* @dlen: Length of UDP payload
@@ -619,14 +624,15 @@ static size_t udp_update_hdr4(const struct ctx *c,
*
* Return: size of IPv6 payload (UDP header + data)
*/
-static size_t udp_update_hdr6(const struct ctx *c,
- struct udp_meta_t *bm, struct udp_payload_t *bp,
+static size_t udp_update_hdr6(const struct ctx *c, struct tap_hdr *taph,
+ struct ipv6hdr *ip6h, struct sockaddr_in6 *s_in6,
+ struct udp_payload_t *bp,
in_port_t dstport, size_t dlen,
const struct timespec *now)
{
- const struct in6_addr *src = &bm->s_in.sa6.sin6_addr;
+ const struct in6_addr *src = &s_in6->sin6_addr;
const struct in6_addr *dst = &c->ip6.addr_seen;
- in_port_t srcport = ntohs(bm->s_in.sa6.sin6_port);
+ in_port_t srcport = ntohs(s_in6->sin6_port);
uint16_t l4len = dlen + sizeof(bp->uh);
if (IN6_IS_ADDR_LINKLOCAL(src)) {
@@ -663,19 +669,19 @@ static size_t udp_update_hdr6(const struct ctx *c,
}
- bm->ip6h.payload_len = htons(l4len);
- bm->ip6h.daddr = *dst;
- bm->ip6h.saddr = *src;
- bm->ip6h.version = 6;
- bm->ip6h.nexthdr = IPPROTO_UDP;
- bm->ip6h.hop_limit = 255;
+ ip6h->payload_len = htons(l4len);
+ ip6h->daddr = *dst;
+ ip6h->saddr = *src;
+ ip6h->version = 6;
+ ip6h->nexthdr = IPPROTO_UDP;
+ ip6h->hop_limit = 255;
- bp->uh.source = bm->s_in.sa6.sin6_port;
+ bp->uh.source = s_in6->sin6_port;
bp->uh.dest = htons(dstport);
- bp->uh.len = bm->ip6h.payload_len;
+ bp->uh.len = ip6h->payload_len;
csum_udp6(&bp->uh, src, dst, bp->data, dlen);
- tap_hdr_update(&bm->taph, l4len + sizeof(bm->ip6h) + sizeof(udp6_eth_hdr));
+ tap_hdr_update(taph, l4len + sizeof(*ip6h) + sizeof(udp6_eth_hdr));
return l4len;
}
@@ -708,10 +714,12 @@ static void udp_tap_send(const struct ctx *c,
size_t l4len;
if (v6) {
- l4len = udp_update_hdr6(c, bm, bp, dstport,
+ l4len = udp_update_hdr6(c, &bm->taph, &bm->ip6h,
+ &bm->s_in.sa6, bp, dstport,
udp6_l2_mh_sock[i].msg_len, now);
} else {
- l4len = udp_update_hdr4(c, bm, bp, dstport,
+ l4len = udp_update_hdr4(c, &bm->taph, &bm->ip4h,
+ &bm->s_in.sa4, bp, dstport,
udp4_l2_mh_sock[i].msg_len, now);
}
tap_iov[i][UDP_IOV_PAYLOAD].iov_len = l4len;
--
@@ -556,7 +556,9 @@ static void udp_splice_sendfrom(const struct ctx *c, unsigned start, unsigned n,
/**
* udp_update_hdr4() - Update headers for one IPv4 datagram
* @c: Execution context
- * @bm: Pointer to udp_meta_t to update
+ * @taph: Tap backend specific header
+ * @ip4h: Pre-filled IPv4 header (except for tot_len and saddr)
+ * @s_in: Source socket address, filled in by recvmmsg()
* @bp: Pointer to udp_payload_t to update
* @dstport: Destination port number
* @dlen: Length of UDP payload
@@ -564,16 +566,17 @@ static void udp_splice_sendfrom(const struct ctx *c, unsigned start, unsigned n,
*
* Return: size of IPv4 payload (UDP header + data)
*/
-static size_t udp_update_hdr4(const struct ctx *c,
- struct udp_meta_t *bm, struct udp_payload_t *bp,
+static size_t udp_update_hdr4(const struct ctx *c, struct tap_hdr *taph,
+ struct iphdr *ip4h, const struct sockaddr_in *s_in,
+ struct udp_payload_t *bp,
in_port_t dstport, size_t dlen,
const struct timespec *now)
{
- in_port_t srcport = ntohs(bm->s_in.sa4.sin_port);
+ in_port_t srcport = ntohs(s_in->sin_port);
const struct in_addr dst = c->ip4.addr_seen;
- struct in_addr src = bm->s_in.sa4.sin_addr;
+ struct in_addr src = s_in->sin_addr;
size_t l4len = dlen + sizeof(bp->uh);
- size_t l3len = l4len + sizeof(bm->ip4h);
+ size_t l3len = l4len + sizeof(*ip4h);
if (!IN4_IS_ADDR_UNSPECIFIED(&c->ip4.dns_match) &&
IN4_ARE_ADDR_EQUAL(&src, &c->ip4.dns_host) && srcport == 53 &&
@@ -594,24 +597,26 @@ static size_t udp_update_hdr4(const struct ctx *c,
src = c->ip4.gw;
}
- bm->ip4h.tot_len = htons(l3len);
- bm->ip4h.daddr = dst.s_addr;
- bm->ip4h.saddr = src.s_addr;
- bm->ip4h.check = csum_ip4_header(l3len, IPPROTO_UDP, src, dst);
+ ip4h->tot_len = htons(l3len);
+ ip4h->daddr = dst.s_addr;
+ ip4h->saddr = src.s_addr;
+ ip4h->check = csum_ip4_header(l3len, IPPROTO_UDP, src, dst);
- bp->uh.source = bm->s_in.sa4.sin_port;
+ bp->uh.source = s_in->sin_port;
bp->uh.dest = htons(dstport);
bp->uh.len = htons(l4len);
csum_udp4(&bp->uh, src, dst, bp->data, dlen);
- tap_hdr_update(&bm->taph, l3len + sizeof(udp4_eth_hdr));
+ tap_hdr_update(taph, l3len + sizeof(udp4_eth_hdr));
return l4len;
}
/**
* udp_update_hdr6() - Update headers for one IPv6 datagram
* @c: Execution context
- * @bm: Pointer to udp_meta_t to update
+ * @taph: Tap backend specific header
+ * @ip6h: Pre-filled IPv6 header (except for payload_len and addresses)
+ * @s_in6: Source socket address, filled in by recvmmsg()
* @bp: Pointer to udp_payload_t to update
* @dstport: Destination port number
* @dlen: Length of UDP payload
@@ -619,14 +624,15 @@ static size_t udp_update_hdr4(const struct ctx *c,
*
* Return: size of IPv6 payload (UDP header + data)
*/
-static size_t udp_update_hdr6(const struct ctx *c,
- struct udp_meta_t *bm, struct udp_payload_t *bp,
+static size_t udp_update_hdr6(const struct ctx *c, struct tap_hdr *taph,
+ struct ipv6hdr *ip6h, struct sockaddr_in6 *s_in6,
+ struct udp_payload_t *bp,
in_port_t dstport, size_t dlen,
const struct timespec *now)
{
- const struct in6_addr *src = &bm->s_in.sa6.sin6_addr;
+ const struct in6_addr *src = &s_in6->sin6_addr;
const struct in6_addr *dst = &c->ip6.addr_seen;
- in_port_t srcport = ntohs(bm->s_in.sa6.sin6_port);
+ in_port_t srcport = ntohs(s_in6->sin6_port);
uint16_t l4len = dlen + sizeof(bp->uh);
if (IN6_IS_ADDR_LINKLOCAL(src)) {
@@ -663,19 +669,19 @@ static size_t udp_update_hdr6(const struct ctx *c,
}
- bm->ip6h.payload_len = htons(l4len);
- bm->ip6h.daddr = *dst;
- bm->ip6h.saddr = *src;
- bm->ip6h.version = 6;
- bm->ip6h.nexthdr = IPPROTO_UDP;
- bm->ip6h.hop_limit = 255;
+ ip6h->payload_len = htons(l4len);
+ ip6h->daddr = *dst;
+ ip6h->saddr = *src;
+ ip6h->version = 6;
+ ip6h->nexthdr = IPPROTO_UDP;
+ ip6h->hop_limit = 255;
- bp->uh.source = bm->s_in.sa6.sin6_port;
+ bp->uh.source = s_in6->sin6_port;
bp->uh.dest = htons(dstport);
- bp->uh.len = bm->ip6h.payload_len;
+ bp->uh.len = ip6h->payload_len;
csum_udp6(&bp->uh, src, dst, bp->data, dlen);
- tap_hdr_update(&bm->taph, l4len + sizeof(bm->ip6h) + sizeof(udp6_eth_hdr));
+ tap_hdr_update(taph, l4len + sizeof(*ip6h) + sizeof(udp6_eth_hdr));
return l4len;
}
@@ -708,10 +714,12 @@ static void udp_tap_send(const struct ctx *c,
size_t l4len;
if (v6) {
- l4len = udp_update_hdr6(c, bm, bp, dstport,
+ l4len = udp_update_hdr6(c, &bm->taph, &bm->ip6h,
+ &bm->s_in.sa6, bp, dstport,
udp6_l2_mh_sock[i].msg_len, now);
} else {
- l4len = udp_update_hdr4(c, bm, bp, dstport,
+ l4len = udp_update_hdr4(c, &bm->taph, &bm->ip4h,
+ &bm->s_in.sa4, bp, dstport,
udp4_l2_mh_sock[i].msg_len, now);
}
tap_iov[i][UDP_IOV_PAYLOAD].iov_len = l4len;
--
2.44.0
^ permalink raw reply related [flat|nested] 9+ messages in thread
* [PATCH 7/8] udp: rename udp_sock_handler() to udp_buf_sock_handler()
2024-05-27 9:10 [PATCH 0/8] Add vhost-user support to passt (part 2) Laurent Vivier
` (5 preceding siblings ...)
2024-05-27 9:10 ` [PATCH 6/8] udp: move udpX_l2_buf_t and udpX_l2_mh_sock out of udp_update_hdrX() Laurent Vivier
@ 2024-05-27 9:10 ` Laurent Vivier
2024-05-27 9:10 ` [PATCH 8/8] vhost-user: compare mode MODE_PASTA and not MODE_PASST Laurent Vivier
7 siblings, 0 replies; 9+ messages in thread
From: Laurent Vivier @ 2024-05-27 9:10 UTC (permalink / raw)
To: passt-dev; +Cc: Laurent Vivier
We are going to introduce a variant of the function to use
vhost-user buffers rather than passt internal buffers.
Signed-off-by: Laurent Vivier <lvivier@redhat.com>
---
passt.c | 2 +-
udp.c | 6 +++---
udp.h | 2 +-
3 files changed, 5 insertions(+), 5 deletions(-)
diff --git a/passt.c b/passt.c
index 771b8a74e90a..cae8df74a529 100644
--- a/passt.c
+++ b/passt.c
@@ -374,7 +374,7 @@ loop:
tcp_timer_handler(&c, ref);
break;
case EPOLL_TYPE_UDP:
- udp_sock_handler(&c, ref, eventmask, &now);
+ udp_buf_sock_handler(&c, ref, eventmask, &now);
break;
case EPOLL_TYPE_PING:
icmp_sock_handler(&c, ref);
diff --git a/udp.c b/udp.c
index 39c7c4e0a584..d6a545a5fca3 100644
--- a/udp.c
+++ b/udp.c
@@ -729,7 +729,7 @@ static void udp_tap_send(const struct ctx *c,
}
/**
- * udp_sock_handler() - Handle new data from socket
+ * udp_buf_sock_handler() - Handle new data from socket
* @c: Execution context
* @ref: epoll reference
* @events: epoll events bitmap
@@ -737,8 +737,8 @@ static void udp_tap_send(const struct ctx *c,
*
* #syscalls recvmmsg
*/
-void udp_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events,
- const struct timespec *now)
+void udp_buf_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events,
+ const struct timespec *now)
{
/* For not entirely clear reasons (data locality?) pasta gets
* better throughput if we receive tap datagrams one at a
diff --git a/udp.h b/udp.h
index 9976b6231f1c..5865def20856 100644
--- a/udp.h
+++ b/udp.h
@@ -9,7 +9,7 @@
#define UDP_TIMER_INTERVAL 1000 /* ms */
void udp_portmap_clear(void);
-void udp_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events,
+void udp_buf_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events,
const struct timespec *now);
int udp_tap_handler(struct ctx *c, uint8_t pif, sa_family_t af,
const void *saddr, const void *daddr,
--
@@ -9,7 +9,7 @@
#define UDP_TIMER_INTERVAL 1000 /* ms */
void udp_portmap_clear(void);
-void udp_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events,
+void udp_buf_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t events,
const struct timespec *now);
int udp_tap_handler(struct ctx *c, uint8_t pif, sa_family_t af,
const void *saddr, const void *daddr,
--
2.44.0
^ permalink raw reply related [flat|nested] 9+ messages in thread
* [PATCH 8/8] vhost-user: compare mode MODE_PASTA and not MODE_PASST
2024-05-27 9:10 [PATCH 0/8] Add vhost-user support to passt (part 2) Laurent Vivier
` (6 preceding siblings ...)
2024-05-27 9:10 ` [PATCH 7/8] udp: rename udp_sock_handler() to udp_buf_sock_handler() Laurent Vivier
@ 2024-05-27 9:10 ` Laurent Vivier
7 siblings, 0 replies; 9+ messages in thread
From: Laurent Vivier @ 2024-05-27 9:10 UTC (permalink / raw)
To: passt-dev; +Cc: Laurent Vivier
As we are going to introduce MODE_VU, which will act like
MODE_PASST, compare against MODE_PASTA rather than adding a
comparison with MODE_VU wherever we check for MODE_PASST.
Signed-off-by: Laurent Vivier <lvivier@redhat.com>
---
conf.c | 12 ++++++------
isolation.c | 10 +++++-----
passt.c | 2 +-
tap.c | 12 ++++++------
tcp_buf.c | 2 +-
udp.c | 2 +-
6 files changed, 20 insertions(+), 20 deletions(-)
diff --git a/conf.c b/conf.c
index 21d46fe74d22..a0ee651e7027 100644
--- a/conf.c
+++ b/conf.c
@@ -146,7 +146,7 @@ static void conf_ports(const struct ctx *c, char optname, const char *optarg,
if (fwd->mode)
goto mode_conflict;
- if (c->mode != MODE_PASST)
+ if (c->mode == MODE_PASTA)
die("'all' port forwarding is only allowed for passt");
fwd->mode = FWD_ALL;
@@ -1248,7 +1248,7 @@ void conf(struct ctx *c, int argc, char **argv)
c->no_dhcp_dns = 0;
break;
case 6:
- if (c->mode != MODE_PASST)
+ if (c->mode == MODE_PASTA)
die("--no-dhcp-dns is for passt mode only");
c->no_dhcp_dns = 1;
@@ -1260,7 +1260,7 @@ void conf(struct ctx *c, int argc, char **argv)
c->no_dhcp_dns_search = 0;
break;
case 8:
- if (c->mode != MODE_PASST)
+ if (c->mode == MODE_PASTA)
die("--no-dhcp-search is for passt mode only");
c->no_dhcp_dns_search = 1;
@@ -1315,7 +1315,7 @@ void conf(struct ctx *c, int argc, char **argv)
break;
case 14:
fprintf(stdout,
- c->mode == MODE_PASST ? "passt " : "pasta ");
+ c->mode == MODE_PASTA ? "pasta " : "passt ");
fprintf(stdout, VERSION_BLOB);
exit(EXIT_SUCCESS);
case 15:
@@ -1618,7 +1618,7 @@ void conf(struct ctx *c, int argc, char **argv)
v6_only = true;
break;
case '1':
- if (c->mode != MODE_PASST)
+ if (c->mode == MODE_PASTA)
die("--one-off is for passt mode only");
if (c->one_off)
@@ -1665,7 +1665,7 @@ void conf(struct ctx *c, int argc, char **argv)
conf_ugid(runas, &uid, &gid);
if (logfile) {
- logfile_init(c->mode == MODE_PASST ? "passt" : "pasta",
+ logfile_init(c->mode == MODE_PASTA ? "pasta" : "passt",
logfile, logsize);
}
diff --git a/isolation.c b/isolation.c
index f394e93b8526..ca2c68b52ec7 100644
--- a/isolation.c
+++ b/isolation.c
@@ -312,7 +312,7 @@ int isolate_prefork(const struct ctx *c)
* PID namespace. For passt, use CLONE_NEWPID anyway, in case somebody
* ever gets around seccomp profiles -- there's no harm in passing it.
*/
- if (!c->foreground || c->mode == MODE_PASST)
+ if (!c->foreground || c->mode != MODE_PASTA)
flags |= CLONE_NEWPID;
if (unshare(flags)) {
@@ -379,12 +379,12 @@ void isolate_postfork(const struct ctx *c)
prctl(PR_SET_DUMPABLE, 0);
- if (c->mode == MODE_PASST) {
- prog.len = (unsigned short)ARRAY_SIZE(filter_passt);
- prog.filter = filter_passt;
- } else {
+ if (c->mode == MODE_PASTA) {
prog.len = (unsigned short)ARRAY_SIZE(filter_pasta);
prog.filter = filter_pasta;
+ } else {
+ prog.len = (unsigned short)ARRAY_SIZE(filter_passt);
+ prog.filter = filter_passt;
}
if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) ||
diff --git a/passt.c b/passt.c
index cae8df74a529..42343db471b8 100644
--- a/passt.c
+++ b/passt.c
@@ -342,7 +342,7 @@ loop:
uint32_t eventmask = events[i].events;
trace("%s: epoll event on %s %i (events: 0x%08x)",
- c.mode == MODE_PASST ? "passt" : "pasta",
+ c.mode == MODE_PASTA ? "pasta" : "passt",
EPOLL_TYPE_STR(ref.type), ref.fd, eventmask);
switch (ref.type) {
diff --git a/tap.c b/tap.c
index 027fb28abd68..a0f3201f3f38 100644
--- a/tap.c
+++ b/tap.c
@@ -416,10 +416,10 @@ size_t tap_send_frames(const struct ctx *c, const struct iovec *iov,
if (!nframes)
return 0;
- if (c->mode == MODE_PASST)
- m = tap_send_frames_passt(c, iov, bufs_per_frame, nframes);
- else
+ if (c->mode == MODE_PASTA)
m = tap_send_frames_pasta(c, iov, bufs_per_frame, nframes);
+ else
+ m = tap_send_frames_passt(c, iov, bufs_per_frame, nframes);
if (m < nframes)
debug("tap: failed to send %zu frames of %zu",
@@ -1308,10 +1308,10 @@ void tap_sock_init(struct ctx *c)
return;
}
- if (c->mode == MODE_PASST) {
+ if (c->mode == MODE_PASTA) {
+ tap_sock_tun_init(c);
+ } else {
if (c->fd_tap_listen == -1)
tap_sock_unix_init(c);
- } else {
- tap_sock_tun_init(c);
}
}
diff --git a/tcp_buf.c b/tcp_buf.c
index ea1e72875ec5..8a783c6fd81a 100644
--- a/tcp_buf.c
+++ b/tcp_buf.c
@@ -35,7 +35,7 @@
#define TCP_FRAMES_MEM 128
#define TCP_FRAMES \
- (c->mode == MODE_PASST ? TCP_FRAMES_MEM : 1)
+ (c->mode == MODE_PASTA ? 1 : TCP_FRAMES_MEM)
#define MSS4 ROUND_DOWN(IP_MAX_MTU - \
sizeof(struct tcphdr) - \
diff --git a/udp.c b/udp.c
index d6a545a5fca3..a767dd3c6de0 100644
--- a/udp.c
+++ b/udp.c
@@ -748,7 +748,7 @@ void udp_buf_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t eve
* whether we'll use tap or splice, always go one at a time
* for pasta mode.
*/
- ssize_t n = (c->mode == MODE_PASST ? UDP_MAX_FRAMES : 1);
+ ssize_t n = (c->mode == MODE_PASTA ? 1 : UDP_MAX_FRAMES);
in_port_t dstport = ref.udp.port;
bool v6 = ref.udp.v6;
struct mmsghdr *mmh_recv;
--
@@ -748,7 +748,7 @@ void udp_buf_sock_handler(const struct ctx *c, union epoll_ref ref, uint32_t eve
* whether we'll use tap or splice, always go one at a time
* for pasta mode.
*/
- ssize_t n = (c->mode == MODE_PASST ? UDP_MAX_FRAMES : 1);
+ ssize_t n = (c->mode == MODE_PASTA ? 1 : UDP_MAX_FRAMES);
in_port_t dstport = ref.udp.port;
bool v6 = ref.udp.v6;
struct mmsghdr *mmh_recv;
--
2.44.0
^ permalink raw reply related [flat|nested] 9+ messages in thread