public inbox for passt-dev@passt.top
 help / color / mirror / code / Atom feed
From: David Gibson <david@gibson.dropbear.id.au>
To: Stefano Brivio <sbrivio@redhat.com>, passt-dev@passt.top
Cc: David Gibson <david@gibson.dropbear.id.au>
Subject: [PATCH 1/3] icmp: Store ping socket information in flow table
Date: Thu, 29 Feb 2024 15:15:32 +1100	[thread overview]
Message-ID: <20240229041534.2573559-2-david@gibson.dropbear.id.au> (raw)
In-Reply-To: <20240229041534.2573559-1-david@gibson.dropbear.id.au>

Currently icmp_id_map[][] stores information about ping sockets in a
bespoke structure.  Move the same information into new types of flow
in the flow table.  To match that change, replace the existing ICMP
timer with a flow-based timer for expiring ping sockets.  This has the
advantage that we only need to scan the active flows, not all possible
ids.

We convert icmp_id_map[][] to point to the flow table entries, rather
than containing its own information.  We do still use that array for
locating the right ping flows, rather than using a "flow native" form
of lookup for the time being.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
 Makefile     |   6 +--
 flow.c       |   9 ++++
 flow.h       |   4 ++
 flow_table.h |   2 +
 icmp.c       | 143 +++++++++++++++++++++++----------------------------
 icmp.h       |   2 +-
 icmp_flow.h  |  31 +++++++++++
 passt.c      |   5 --
 8 files changed, 115 insertions(+), 87 deletions(-)
 create mode 100644 icmp_flow.h

diff --git a/Makefile b/Makefile
index 2d6a5155..47fc5448 100644
--- a/Makefile
+++ b/Makefile
@@ -54,9 +54,9 @@ SRCS = $(PASST_SRCS) $(QRAP_SRCS)
 MANPAGES = passt.1 pasta.1 qrap.1
 
 PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h flow.h fwd.h \
-	flow_table.h icmp.h inany.h iov.h isolation.h lineread.h log.h ndp.h \
-	netlink.h packet.h passt.h pasta.h pcap.h pif.h siphash.h tap.h tcp.h \
-	tcp_conn.h tcp_splice.h udp.h util.h
+	flow_table.h icmp.h icmp_flow.h inany.h iov.h isolation.h lineread.h \
+	log.h ndp.h netlink.h packet.h passt.h pasta.h pcap.h pif.h siphash.h \
+	tap.h tcp.h tcp_conn.h tcp_splice.h udp.h util.h
 HEADERS = $(PASST_HEADERS) seccomp.h
 
 C := \#include <linux/tcp.h>\nstruct tcp_info x = { .tcpi_snd_wnd = 0 };
diff --git a/flow.c b/flow.c
index d7974d59..5835d6c0 100644
--- a/flow.c
+++ b/flow.c
@@ -21,6 +21,8 @@ const char *flow_type_str[] = {
 	[FLOW_TYPE_NONE]	= "<none>",
 	[FLOW_TCP]		= "TCP connection",
 	[FLOW_TCP_SPLICE]	= "TCP connection (spliced)",
+	[FLOW_PING4]		= "ICMP ping sequence",
+	[FLOW_PING6]		= "ICMPv6 ping sequence",
 };
 static_assert(ARRAY_SIZE(flow_type_str) == FLOW_NUM_TYPES,
 	      "flow_type_str[] doesn't match enum flow_type");
@@ -28,6 +30,8 @@ static_assert(ARRAY_SIZE(flow_type_str) == FLOW_NUM_TYPES,
 const uint8_t flow_proto[] = {
 	[FLOW_TCP]		= IPPROTO_TCP,
 	[FLOW_TCP_SPLICE]	= IPPROTO_TCP,
+	[FLOW_PING4]		= IPPROTO_ICMP,
+	[FLOW_PING6]		= IPPROTO_ICMPV6,
 };
 static_assert(ARRAY_SIZE(flow_proto) == FLOW_NUM_TYPES,
 	      "flow_proto[] doesn't match enum flow_type");
@@ -294,6 +298,11 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now)
 			if (!closed && timer)
 				tcp_splice_timer(c, flow);
 			break;
+		case FLOW_PING4:
+		case FLOW_PING6:
+			if (timer)
+				closed = icmp_ping_timer(c, flow, now);
+			break;
 		default:
 			/* Assume other flow types don't need any handling */
 			;
diff --git a/flow.h b/flow.h
index 8b66751b..c943c441 100644
--- a/flow.h
+++ b/flow.h
@@ -19,6 +19,10 @@ enum flow_type {
 	FLOW_TCP,
 	/* A TCP connection between a host socket and ns socket */
 	FLOW_TCP_SPLICE,
+	/* ICMP echo requests from guest to host and matching replies back */
+	FLOW_PING4,
+	/* ICMPv6 echo requests from guest to host and matching replies back */
+	FLOW_PING6,
 
 	FLOW_NUM_TYPES,
 };
diff --git a/flow_table.h b/flow_table.h
index eecf8844..b7e5529a 100644
--- a/flow_table.h
+++ b/flow_table.h
@@ -8,6 +8,7 @@
 #define FLOW_TABLE_H
 
 #include "tcp_conn.h"
+#include "icmp_flow.h"
 
 /**
  * struct flow_free_cluster - Information about a cluster of free entries
@@ -33,6 +34,7 @@ union flow {
 	struct flow_free_cluster free;
 	struct tcp_tap_conn tcp;
 	struct tcp_splice_conn tcp_splice;
+	struct icmp_ping_flow ping;
 };
 
 /* Global Flow Table */
diff --git a/icmp.c b/icmp.c
index fb2fcafc..1caf791d 100644
--- a/icmp.c
+++ b/icmp.c
@@ -39,24 +39,17 @@
 #include "siphash.h"
 #include "inany.h"
 #include "icmp.h"
+#include "flow_table.h"
 
 #define ICMP_ECHO_TIMEOUT	60 /* s, timeout for ICMP socket activity */
 #define ICMP_NUM_IDS		(1U << 16)
 
-/**
- * struct icmp_id_sock - Tracking information for single ICMP echo identifier
- * @sock:	Bound socket for identifier
- * @seq:	Last sequence number sent to tap, host order, -1: not sent yet
- * @ts:		Last associated activity from tap, seconds
- */
-struct icmp_id_sock {
-	int sock;
-	int seq;
-	time_t ts;
-};
+/* Sides of a flow as we use them for ping streams */
+#define	SOCKSIDE	0
+#define	TAPSIDE		1
 
 /* Indexed by ICMP echo identifier */
-static struct icmp_id_sock icmp_id_map[IP_VERSIONS][ICMP_NUM_IDS];
+static struct icmp_ping_flow *icmp_id_map[IP_VERSIONS][ICMP_NUM_IDS];
 
 /**
  * icmp_sock_handler() - Handle new data from ICMP or ICMPv6 socket
@@ -66,8 +59,8 @@ static struct icmp_id_sock icmp_id_map[IP_VERSIONS][ICMP_NUM_IDS];
  */
 void icmp_sock_handler(const struct ctx *c, sa_family_t af, union epoll_ref ref)
 {
-	struct icmp_id_sock *const id_sock = af == AF_INET
-		? &icmp_id_map[V4][ref.icmp.id] : &icmp_id_map[V6][ref.icmp.id];
+	struct icmp_ping_flow *pingf = af == AF_INET
+		? icmp_id_map[V4][ref.icmp.id] : icmp_id_map[V6][ref.icmp.id];
 	const char *const pname = af == AF_INET ? "ICMP" : "ICMPv6";
 	union sockaddr_inany sr;
 	socklen_t sl = sizeof(sr);
@@ -78,6 +71,8 @@ void icmp_sock_handler(const struct ctx *c, sa_family_t af, union epoll_ref ref)
 	if (c->no_icmp)
 		return;
 
+	ASSERT(pingf);
+
 	n = recvfrom(ref.fd, buf, sizeof(buf), 0, &sr.sa, &sl);
 	if (n < 0) {
 		warn("%s: recvfrom() error on ping socket: %s",
@@ -112,10 +107,10 @@ void icmp_sock_handler(const struct ctx *c, sa_family_t af, union epoll_ref ref)
 
 	/* In PASTA mode, we'll get any reply we send, discard them. */
 	if (c->mode == MODE_PASTA) {
-		if (id_sock->seq == seq)
+		if (pingf->seq == seq)
 			return;
 
-		id_sock->seq = seq;
+		pingf->seq = seq;
 	}
 
 	debug("%s: echo reply to tap, ID: %"PRIu16", seq: %"PRIu16, pname,
@@ -132,16 +127,22 @@ unexpected:
 }
 
 /**
- * icmp_ping_close() - Close and clean up a ping socket
+ * icmp_ping_close() - Close and clean up a ping flow
  * @c:		Execution context
- * @id_sock:	Socket number and other info
+ * @pingf:	ping flow entry to close
  */
-static void icmp_ping_close(const struct ctx *c, struct icmp_id_sock *id_sock)
+static void icmp_ping_close(const struct ctx *c,
+			    const struct icmp_ping_flow *pingf)
 {
-	epoll_ctl(c->epollfd, EPOLL_CTL_DEL, id_sock->sock, NULL);
-	close(id_sock->sock);
-	id_sock->sock = -1;
-	id_sock->seq = -1;
+	uint16_t id = pingf->id;
+
+	epoll_ctl(c->epollfd, EPOLL_CTL_DEL, pingf->sock, NULL);
+	close(pingf->sock);
+
+	if (pingf->f.type == FLOW_PING4)
+		icmp_id_map[V4][id] = NULL;
+	else
+		icmp_id_map[V6][id] = NULL;
 }
 
 /**
@@ -151,17 +152,27 @@ static void icmp_ping_close(const struct ctx *c, struct icmp_id_sock *id_sock)
  * @af:		Address family, AF_INET or AF_INET6
  * @id:		ICMP id for the new socket
  *
- * Return: Newly opened ping socket fd, or -1 on failure
+ * Return: Newly opened ping flow, or NULL on failure
  */
-static int icmp_ping_new(const struct ctx *c, struct icmp_id_sock *id_sock,
-			 sa_family_t af, uint16_t id)
+static struct icmp_ping_flow *icmp_ping_new(const struct ctx *c,
+					    struct icmp_ping_flow **id_sock,
+					    sa_family_t af, uint16_t id)
 {
-	uint8_t proto = af == AF_INET ? IPPROTO_ICMP : IPPROTO_ICMPV6;
 	const char *const pname = af == AF_INET ? "ICMP" : "ICMPv6";
+	uint8_t flowtype = af == AF_INET ? FLOW_PING4 : FLOW_PING6;
 	union icmp_epoll_ref iref = { .id = id };
+	union flow *flow = flow_alloc();
+	struct icmp_ping_flow *pingf;
 	const void *bind_addr;
 	const char *bind_if;
-	int s;
+
+	if (!flow)
+		return NULL;
+
+	pingf = FLOW_START(flow, flowtype, ping, TAPSIDE);
+
+	pingf->seq = -1;
+	pingf->id = id;
 
 	if (af == AF_INET) {
 		bind_addr = &c->ip4.addr_out;
@@ -171,28 +182,28 @@ static int icmp_ping_new(const struct ctx *c, struct icmp_id_sock *id_sock,
 		bind_if = c->ip6.ifname_out;
 	}
 
-	s = sock_l4(c, af, proto, bind_addr, bind_if, 0, iref.u32);
+	pingf->sock = sock_l4(c, af, flow_proto[flowtype], bind_addr, bind_if,
+			      0, iref.u32);
 
-	if (s < 0) {
+	if (pingf->sock < 0) {
 		warn("Cannot open \"ping\" socket. You might need to:");
 		warn("  sysctl -w net.ipv4.ping_group_range=\"0 2147483647\"");
 		warn("...echo requests/replies will fail.");
 		goto cancel;
 	}
 
-	if (s > FD_REF_MAX)
+	if (pingf->sock > FD_REF_MAX)
 		goto cancel;
 
-	id_sock->sock = s;
+	*id_sock = pingf;
 
-	debug("%s: new socket %i for echo ID %"PRIu16, pname, s, id);
+	debug("%s: new socket %i for echo ID %"PRIu16, pname, pingf->sock, id);
 
-	return s;
+	return pingf;
 
 cancel:
-	if (s >= 0)
-		close(s);
-	return -1;
+	flow_alloc_cancel(flow);
+	return NULL;
 }
 
 /**
@@ -214,14 +225,13 @@ int icmp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
 	const char *const pname = af == AF_INET ? "ICMP" : "ICMPv6";
 	union sockaddr_inany sa = { .sa_family = af };
 	const socklen_t sl = af == AF_INET ? sizeof(sa.sa4) : sizeof(sa.sa6);
-	struct icmp_id_sock *id_sock;
+	struct icmp_ping_flow *pingf, **id_sock;
 	uint16_t id, seq;
 	size_t plen;
 	void *pkt;
-	int s;
 
 	(void)saddr;
-	(void)pif;
+	ASSERT(pif == PIF_TAP);
 
 	if (af == AF_INET) {
 		const struct icmphdr *ih;
@@ -260,13 +270,13 @@ int icmp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
 		ASSERT(0);
 	}
 
-	if ((s = id_sock->sock) < 0)
-		if ((s = icmp_ping_new(c, id_sock, af, id)) < 0)
+	if (!(pingf = *id_sock))
+		if (!(pingf = icmp_ping_new(c, id_sock, af, id)))
 			return 1;
 
-	id_sock->ts = now->tv_sec;
+	pingf->ts = now->tv_sec;
 
-	if (sendto(s, pkt, plen, MSG_NOSIGNAL, &sa.sa, sl) < 0) {
+	if (sendto(pingf->sock, pkt, plen, MSG_NOSIGNAL, &sa.sa, sl) < 0) {
 		debug("%s: failed to relay request to socket: %s",
 		      pname, strerror(errno));
 	} else {
@@ -278,44 +288,21 @@ int icmp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
 }
 
 /**
- * icmp_timer_one() - Handler for timed events related to a given identifier
- * @c:		Execution context
- * @id_sock:	Socket fd and activity timestamp
- * @now:	Current timestamp
- */
-static void icmp_timer_one(const struct ctx *c, struct icmp_id_sock *id_sock,
-			   const struct timespec *now)
-{
-	if (id_sock->sock < 0 || now->tv_sec - id_sock->ts <= ICMP_ECHO_TIMEOUT)
-		return;
-
-	icmp_ping_close(c, id_sock);
-}
-
-/**
- * icmp_timer() - Scan activity bitmap for identifiers with timed events
+ * icmp_ping_timer() - Handler for timed events related to a given flow
  * @c:		Execution context
+ * @flow:	flow table entry to check for timeout
  * @now:	Current timestamp
+ *
+ * Return: true if the flow is ready to free, false otherwise
  */
-void icmp_timer(const struct ctx *c, const struct timespec *now)
+bool icmp_ping_timer(const struct ctx *c, union flow *flow,
+		     const struct timespec *now)
 {
-	unsigned int i;
-
-	for (i = 0; i < ICMP_NUM_IDS; i++) {
-		icmp_timer_one(c, &icmp_id_map[V4][i], now);
-		icmp_timer_one(c, &icmp_id_map[V6][i], now);
-	}
-}
+	const struct icmp_ping_flow *pingf = &flow->ping;
 
-/**
- * icmp_init() - Initialise sequences in ID map to -1 (no sequence sent yet)
- */
-void icmp_init(void)
-{
-	unsigned i;
+	if (now->tv_sec - pingf->ts <= ICMP_ECHO_TIMEOUT)
+		return false;
 
-	for (i = 0; i < ICMP_NUM_IDS; i++) {
-		icmp_id_map[V4][i].seq = icmp_id_map[V6][i].seq = -1;
-		icmp_id_map[V4][i].sock = icmp_id_map[V6][i].sock = -1;
-	}
+	icmp_ping_close(c, pingf);
+	return true;
 }
diff --git a/icmp.h b/icmp.h
index 263101db..4f695da1 100644
--- a/icmp.h
+++ b/icmp.h
@@ -9,12 +9,12 @@
 #define ICMP_TIMER_INTERVAL		10000 /* ms */
 
 struct ctx;
+struct icmp_ping_flow;
 
 void icmp_sock_handler(const struct ctx *c, sa_family_t af, union epoll_ref ref);
 int icmp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af,
 		     const void *saddr, const void *daddr,
 		     const struct pool *p, const struct timespec *now);
-void icmp_timer(const struct ctx *c, const struct timespec *now);
 void icmp_init(void);
 
 /**
diff --git a/icmp_flow.h b/icmp_flow.h
new file mode 100644
index 00000000..5a2eed9c
--- /dev/null
+++ b/icmp_flow.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later
+ * Copyright Red Hat
+ * Author: David Gibson <david@gibson.dropbear.id.au>
+ *
+ * ICMP flow tracking data structures
+ */
+#ifndef ICMP_FLOW_H
+#define ICMP_FLOW_H
+
+/**
+ * struct icmp_ping_flow - Descriptor for a flow of ping requests/replies
+ * @f:		Generic flow information
+ * @seq:	Last sequence number sent to tap, host order, -1: not sent yet
+ * @sock:	"ping" socket
+ * @ts:		Last associated activity from tap, seconds
+ * @id:		ICMP id for the flow as seen by the guest
+ */
+struct icmp_ping_flow {
+	/* Must be first element */
+	struct flow_common f;
+
+	int seq;
+	int sock;
+	time_t ts;
+	uint16_t id;
+};
+
+bool icmp_ping_timer(const struct ctx *c, union flow *flow,
+		     const struct timespec *now);
+
+#endif /* ICMP_FLOW_H */
diff --git a/passt.c b/passt.c
index a061f2ba..0f794927 100644
--- a/passt.c
+++ b/passt.c
@@ -106,8 +106,6 @@ static void post_handler(struct ctx *c, const struct timespec *now)
 	CALL_PROTO_HANDLER(c, now, tcp, TCP);
 	/* NOLINTNEXTLINE(bugprone-branch-clone): intervals can be the same */
 	CALL_PROTO_HANDLER(c, now, udp, UDP);
-	/* NOLINTNEXTLINE(bugprone-branch-clone): intervals can be the same */
-	CALL_PROTO_HANDLER(c, now, icmp, ICMP);
 
 	flow_defer_handler(c, now);
 #undef CALL_PROTO_HANDLER
@@ -288,9 +286,6 @@ int main(int argc, char **argv)
 	if ((!c.no_udp && udp_init(&c)) || (!c.no_tcp && tcp_init(&c)))
 		exit(EXIT_FAILURE);
 
-	if (!c.no_icmp)
-		icmp_init();
-
 	proto_update_l2_buf(c.mac_guest, c.mac);
 
 	if (c.ifi4 && !c.no_dhcp)
-- 
@@ -106,8 +106,6 @@ static void post_handler(struct ctx *c, const struct timespec *now)
 	CALL_PROTO_HANDLER(c, now, tcp, TCP);
 	/* NOLINTNEXTLINE(bugprone-branch-clone): intervals can be the same */
 	CALL_PROTO_HANDLER(c, now, udp, UDP);
-	/* NOLINTNEXTLINE(bugprone-branch-clone): intervals can be the same */
-	CALL_PROTO_HANDLER(c, now, icmp, ICMP);
 
 	flow_defer_handler(c, now);
 #undef CALL_PROTO_HANDLER
@@ -288,9 +286,6 @@ int main(int argc, char **argv)
 	if ((!c.no_udp && udp_init(&c)) || (!c.no_tcp && tcp_init(&c)))
 		exit(EXIT_FAILURE);
 
-	if (!c.no_icmp)
-		icmp_init();
-
 	proto_update_l2_buf(c.mac_guest, c.mac);
 
 	if (c.ifi4 && !c.no_dhcp)
-- 
2.44.0


  reply	other threads:[~2024-02-29  4:15 UTC|newest]

Thread overview: 12+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-02-29  4:15 [PATCH 0/3] Basic integration of ICMP with flow table David Gibson
2024-02-29  4:15 ` David Gibson [this message]
2024-03-07  8:53   ` [PATCH 1/3] icmp: Store ping socket information in " Stefano Brivio
2024-03-07 10:24     ` David Gibson
2024-03-07 23:25       ` Stefano Brivio
2024-03-08  0:48         ` David Gibson
2024-02-29  4:15 ` [PATCH 2/3] icmp: Flow based error reporting David Gibson
2024-03-07 23:26   ` Stefano Brivio
2024-03-08  0:49     ` David Gibson
2024-03-08  6:06       ` Stefano Brivio
2024-02-29  4:15 ` [PATCH 3/3] icmp: Use 'flowside' epoll references for ping sockets David Gibson
2024-03-13 10:08 ` [PATCH 0/3] Basic integration of ICMP with flow table Stefano Brivio

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20240229041534.2573559-2-david@gibson.dropbear.id.au \
    --to=david@gibson.dropbear.id.au \
    --cc=passt-dev@passt.top \
    --cc=sbrivio@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this public inbox

	https://passt.top/passt

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for IMAP folder(s).