public inbox for passt-dev@passt.top
 help / color / mirror / code / Atom feed
* [PATCH 00/10] RFC: Preliminaries for using share IPv4 & IPv6 sockets
@ 2022-11-04  8:43 David Gibson
  2022-11-04  8:43 ` [PATCH 01/10] tcp: no v6 flag in ref David Gibson
                   ` (10 more replies)
  0 siblings, 11 replies; 22+ messages in thread
From: David Gibson @ 2022-11-04  8:43 UTC (permalink / raw)
  To: passt-dev, Stefano Brivio; +Cc: David Gibson

We want to use the same sockets to listen on both IPv4 and IPv6
addresses (at least on Linux which allows this).  This isn't too
complicated conceptually, but to make it work we need to unify
handling of v4 and v6 connections more than we currently have.

This isn't really ready to go, but I'm posting for reference.  It's
divided into rather a lot of steps, because I kept hitting problems
and not being sure exactly what change had broken things.

Based on my earlier series making handling of IPv4 addresses endian
safer.

David Gibson (10):
  tcp: no v6 flag in ref
  tcp: Helper to encode IPv4-mapped IPv6 addresses
  tcp: Partially unify IPv4 and IPv6 paths in tcp_hash_match()
  tcp: Hash IPv4 and IPv4-mapped-IPv6 addresses the same
  tcp: Take tcp_hash_insert() address from struct tcp_conn
  tcp:  Unify IPv4 and IPv6 paths for hashing and matching
  tcp: Remove ugly address union from struct tcp_conn
  tcp: Unify initial sequence numbers for IPv4 and IPv6
  tcp: Have tcp_seq_init() take its parameters from struct tcp_conn
  tcp: Fix small error in tcp_seq_init() time handling

 siphash.c    |   2 +
 tcp.c        | 202 ++++++++++++++++++---------------------------------
 tcp.h        |   1 -
 tcp_splice.c |  13 ++--
 util.c       |  33 +++++++++
 util.h       |   2 +
 6 files changed, 116 insertions(+), 137 deletions(-)

-- 
2.38.1


^ permalink raw reply	[flat|nested] 22+ messages in thread

* [PATCH 01/10] tcp: no v6 flag in ref
  2022-11-04  8:43 [PATCH 00/10] RFC: Preliminaries for using share IPv4 & IPv6 sockets David Gibson
@ 2022-11-04  8:43 ` David Gibson
  2022-11-07 18:07   ` Stefano Brivio
  2022-11-04  8:43 ` [PATCH 02/10] tcp: Helper to encode IPv4-mapped IPv6 addresses David Gibson
                   ` (9 subsequent siblings)
  10 siblings, 1 reply; 22+ messages in thread
From: David Gibson @ 2022-11-04  8:43 UTC (permalink / raw)
  To: passt-dev, Stefano Brivio; +Cc: David Gibson

---
 tcp.c        |  9 ++++-----
 tcp.h        |  1 -
 tcp_splice.c | 13 +++++++------
 3 files changed, 11 insertions(+), 12 deletions(-)

diff --git a/tcp.c b/tcp.c
index 713248f..3d48d6e 100644
--- a/tcp.c
+++ b/tcp.c
@@ -761,8 +761,7 @@ static int tcp_epoll_ctl(const struct ctx *c, struct tcp_conn *conn)
 {
 	int m = (conn->flags & IN_EPOLL) ? EPOLL_CTL_MOD : EPOLL_CTL_ADD;
 	union epoll_ref ref = { .r.proto = IPPROTO_TCP, .r.s = conn->sock,
-				.r.p.tcp.tcp.index = conn - tc,
-				.r.p.tcp.tcp.v6 = CONN_V6(conn) };
+				.r.p.tcp.tcp.index = conn - tc, };
 	struct epoll_event ev = { .data.u64 = ref.u64 };
 
 	if (conn->events == CLOSED) {
@@ -2857,6 +2856,7 @@ static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref,
 	if (c->tcp.conn_count >= TCP_MAX_CONNS)
 		return;
 
+	sa.ss_family = AF_UNSPEC; /* FIXME: stop clang-tidy complaining */
 	sl = sizeof(sa);
 	s = accept4(ref.r.s, (struct sockaddr *)&sa, &sl, SOCK_NONBLOCK);
 	if (s < 0)
@@ -2868,7 +2868,7 @@ static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref,
 	conn->ws_to_tap = conn->ws_from_tap = 0;
 	conn_event(c, conn, SOCK_ACCEPTED);
 
-	if (ref.r.p.tcp.tcp.v6) {
+	if (sa.ss_family == AF_INET6) {
 		struct sockaddr_in6 sa6;
 
 		memcpy(&sa6, &sa, sizeof(sa6));
@@ -3147,8 +3147,7 @@ static void tcp_sock_init6(const struct ctx *c, int ns,
 			   const struct in6_addr *addr, const char *ifname,
 			   in_port_t port)
 {
-	union tcp_epoll_ref tref = { .tcp.listen = 1, .tcp.outbound = ns,
-				     .tcp.v6 = 1 };
+	union tcp_epoll_ref tref = { .tcp.listen = 1, .tcp.outbound = ns, };
 	bool spliced = false, tap = true;
 	int s;
 
diff --git a/tcp.h b/tcp.h
index 3fabb5a..85ac750 100644
--- a/tcp.h
+++ b/tcp.h
@@ -45,7 +45,6 @@ union tcp_epoll_ref {
 		uint32_t	listen:1,
 				splice:1,
 				outbound:1,
-				v6:1,
 				timer:1,
 				index:20;
 	} tcp;
diff --git a/tcp_splice.c b/tcp_splice.c
index 99c5fa7..1f26ff4 100644
--- a/tcp_splice.c
+++ b/tcp_splice.c
@@ -211,12 +211,10 @@ static int tcp_splice_epoll_ctl(const struct ctx *c,
 	int m = (conn->flags & IN_EPOLL) ? EPOLL_CTL_MOD : EPOLL_CTL_ADD;
 	union epoll_ref ref_a = { .r.proto = IPPROTO_TCP, .r.s = conn->a,
 				  .r.p.tcp.tcp.splice = 1,
-				  .r.p.tcp.tcp.index = conn - tc,
-				  .r.p.tcp.tcp.v6 = CONN_V6(conn) };
+				  .r.p.tcp.tcp.index = conn - tc, };
 	union epoll_ref ref_b = { .r.proto = IPPROTO_TCP, .r.s = conn->b,
 				  .r.p.tcp.tcp.splice = 1,
-				  .r.p.tcp.tcp.index = conn - tc,
-				  .r.p.tcp.tcp.v6 = CONN_V6(conn) };
+				  .r.p.tcp.tcp.index = conn - tc, };
 	struct epoll_event ev_a = { .data.u64 = ref_a.u64 };
 	struct epoll_event ev_b = { .data.u64 = ref_b.u64 };
 	uint32_t events_a, events_b;
@@ -579,12 +577,15 @@ void tcp_sock_handler_splice(struct ctx *c, union epoll_ref ref,
 	struct tcp_splice_conn *conn;
 
 	if (ref.r.p.tcp.tcp.listen) {
+		struct sockaddr sa;
+		socklen_t sl = sizeof(sa);
 		int s;
 
 		if (c->tcp.splice_conn_count >= TCP_SPLICE_MAX_CONNS)
 			return;
 
-		if ((s = accept4(ref.r.s, NULL, NULL, SOCK_NONBLOCK)) < 0)
+		sa.sa_family = AF_UNSPEC; /* FIXME: stop clang-tidy complaining */
+		if ((s = accept4(ref.r.s, &sa, &sl, SOCK_NONBLOCK)) < 0)
 			return;
 
 		if (setsockopt(s, SOL_TCP, TCP_QUICKACK, &((int){ 1 }),
@@ -595,7 +596,7 @@ void tcp_sock_handler_splice(struct ctx *c, union epoll_ref ref,
 
 		conn = CONN(c->tcp.splice_conn_count++);
 		conn->a = s;
-		conn->flags = ref.r.p.tcp.tcp.v6 ? SOCK_V6 : 0;
+		conn->flags = (sa.sa_family == AF_INET6) ? SOCK_V6 : 0;
 
 		if (tcp_splice_new(c, conn, ref.r.p.tcp.tcp.index,
 				   ref.r.p.tcp.tcp.outbound))
-- 
@@ -211,12 +211,10 @@ static int tcp_splice_epoll_ctl(const struct ctx *c,
 	int m = (conn->flags & IN_EPOLL) ? EPOLL_CTL_MOD : EPOLL_CTL_ADD;
 	union epoll_ref ref_a = { .r.proto = IPPROTO_TCP, .r.s = conn->a,
 				  .r.p.tcp.tcp.splice = 1,
-				  .r.p.tcp.tcp.index = conn - tc,
-				  .r.p.tcp.tcp.v6 = CONN_V6(conn) };
+				  .r.p.tcp.tcp.index = conn - tc, };
 	union epoll_ref ref_b = { .r.proto = IPPROTO_TCP, .r.s = conn->b,
 				  .r.p.tcp.tcp.splice = 1,
-				  .r.p.tcp.tcp.index = conn - tc,
-				  .r.p.tcp.tcp.v6 = CONN_V6(conn) };
+				  .r.p.tcp.tcp.index = conn - tc, };
 	struct epoll_event ev_a = { .data.u64 = ref_a.u64 };
 	struct epoll_event ev_b = { .data.u64 = ref_b.u64 };
 	uint32_t events_a, events_b;
@@ -579,12 +577,15 @@ void tcp_sock_handler_splice(struct ctx *c, union epoll_ref ref,
 	struct tcp_splice_conn *conn;
 
 	if (ref.r.p.tcp.tcp.listen) {
+		struct sockaddr sa;
+		socklen_t sl = sizeof(sa);
 		int s;
 
 		if (c->tcp.splice_conn_count >= TCP_SPLICE_MAX_CONNS)
 			return;
 
-		if ((s = accept4(ref.r.s, NULL, NULL, SOCK_NONBLOCK)) < 0)
+		sa.sa_family = AF_UNSPEC; /* FIXME: stop clang-tidy complaining */
+		if ((s = accept4(ref.r.s, &sa, &sl, SOCK_NONBLOCK)) < 0)
 			return;
 
 		if (setsockopt(s, SOL_TCP, TCP_QUICKACK, &((int){ 1 }),
@@ -595,7 +596,7 @@ void tcp_sock_handler_splice(struct ctx *c, union epoll_ref ref,
 
 		conn = CONN(c->tcp.splice_conn_count++);
 		conn->a = s;
-		conn->flags = ref.r.p.tcp.tcp.v6 ? SOCK_V6 : 0;
+		conn->flags = (sa.sa_family == AF_INET6) ? SOCK_V6 : 0;
 
 		if (tcp_splice_new(c, conn, ref.r.p.tcp.tcp.index,
 				   ref.r.p.tcp.tcp.outbound))
-- 
2.38.1


^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [PATCH 02/10] tcp: Helper to encode IPv4-mapped IPv6 addresses
  2022-11-04  8:43 [PATCH 00/10] RFC: Preliminaries for using share IPv4 & IPv6 sockets David Gibson
  2022-11-04  8:43 ` [PATCH 01/10] tcp: no v6 flag in ref David Gibson
@ 2022-11-04  8:43 ` David Gibson
  2022-11-07 18:08   ` Stefano Brivio
  2022-11-04  8:43 ` [PATCH 03/10] tcp: Partially unify IPv4 and IPv6 paths in tcp_hash_match() David Gibson
                   ` (8 subsequent siblings)
  10 siblings, 1 reply; 22+ messages in thread
From: David Gibson @ 2022-11-04  8:43 UTC (permalink / raw)
  To: passt-dev, Stefano Brivio; +Cc: David Gibson

In the tcp_conn structure we have space for an IPv6 address, and use
IPv4-mapped IPv6 addresses to describe IPv4 connections.  We open code
the construction of those IPv4-mapped address in two places.

Avoid the duplication with a helper function.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
 tcp.c  |  9 ++-------
 util.c | 20 ++++++++++++++++++++
 util.h |  1 +
 3 files changed, 23 insertions(+), 7 deletions(-)

diff --git a/tcp.c b/tcp.c
index 3d48d6e..26dd268 100644
--- a/tcp.c
+++ b/tcp.c
@@ -2217,9 +2217,7 @@ static void tcp_conn_from_tap(struct ctx *c, int af, const void *addr,
 		sa = (struct sockaddr *)&addr4;
 		sl = sizeof(addr4);
 
-		memset(&conn->a.a4.zero, 0,    sizeof(conn->a.a4.zero));
-		memset(&conn->a.a4.one,  0xff, sizeof(conn->a.a4.one));
-		memcpy(&conn->a.a4.a,    addr, sizeof(conn->a.a4.a));
+		encode_ip4mapped_ip6(&conn->a.a6, addr);
 	} else {
 		sa = (struct sockaddr *)&addr6;
 		sl = sizeof(addr6);
@@ -2902,15 +2900,12 @@ static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref,
 
 		memcpy(&sa4, &sa, sizeof(sa4));
 
-		memset(&conn->a.a4.zero,   0, sizeof(conn->a.a4.zero));
-		memset(&conn->a.a4.one, 0xff, sizeof(conn->a.a4.one));
-
 		if (IN4_IS_ADDR_LOOPBACK(&sa4.sin_addr) ||
 		    IN4_IS_ADDR_UNSPECIFIED(&sa4.sin_addr) ||
 		    IN4_ARE_ADDR_EQUAL(&sa4.sin_addr, &c->ip4.addr_seen))
 			sa4.sin_addr = c->ip4.gw;
 
-		conn->a.a4.a = sa4.sin_addr;
+		encode_ip4mapped_ip6(&conn->a.a6, &sa4.sin_addr);
 
 		conn->sock_port = ntohs(sa4.sin_port);
 		conn->tap_port = ref.r.p.tcp.tcp.index;
diff --git a/util.c b/util.c
index 514bd44..257d0b6 100644
--- a/util.c
+++ b/util.c
@@ -482,3 +482,23 @@ int write_file(const char *path, const char *buf)
 	close(fd);
 	return len == 0 ? 0 : -1;
 }
+
+struct ip4mapped_ip6 {
+	uint8_t zero[10];
+	uint8_t one[2];
+	struct in_addr a4;
+};
+
+/**
+ * encode_ip4mapped_ip6() - Convert an IPv4 address to an IPv4-mapped IPv6 address
+ * @ip6:	Buffer to store the IPv4-mapped IPv6 address
+ * @ip4:	IPv4 address, network order
+ */
+void encode_ip4mapped_ip6(struct in6_addr *ip6, const void *ip4)
+{
+	struct ip4mapped_ip6 *a = (struct ip4mapped_ip6 *)ip6;
+
+	memset(&a->zero, 0, sizeof(a->zero));
+	memset(&a->one, 0xff, sizeof(a->one));
+	memcpy(&a->a4, ip4, sizeof(a->a4));
+}
diff --git a/util.h b/util.h
index 2d4e1ff..f7d6a6f 100644
--- a/util.h
+++ b/util.h
@@ -209,5 +209,6 @@ void write_pidfile(int fd, pid_t pid);
 int __daemon(int pidfile_fd, int devnull_fd);
 int fls(unsigned long x);
 int write_file(const char *path, const char *buf);
+void encode_ip4mapped_ip6(struct in6_addr *ip6, const void *ip4);
 
 #endif /* UTIL_H */
-- 
@@ -209,5 +209,6 @@ void write_pidfile(int fd, pid_t pid);
 int __daemon(int pidfile_fd, int devnull_fd);
 int fls(unsigned long x);
 int write_file(const char *path, const char *buf);
+void encode_ip4mapped_ip6(struct in6_addr *ip6, const void *ip4);
 
 #endif /* UTIL_H */
-- 
2.38.1


^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [PATCH 03/10] tcp: Partially unify IPv4 and IPv6 paths in tcp_hash_match()
  2022-11-04  8:43 [PATCH 00/10] RFC: Preliminaries for using share IPv4 & IPv6 sockets David Gibson
  2022-11-04  8:43 ` [PATCH 01/10] tcp: no v6 flag in ref David Gibson
  2022-11-04  8:43 ` [PATCH 02/10] tcp: Helper to encode IPv4-mapped IPv6 addresses David Gibson
@ 2022-11-04  8:43 ` David Gibson
  2022-11-07 18:08   ` Stefano Brivio
  2022-11-04  8:43 ` [PATCH 04/10] tcp: Hash IPv4 and IPv4-mapped-IPv6 addresses the same David Gibson
                   ` (7 subsequent siblings)
  10 siblings, 1 reply; 22+ messages in thread
From: David Gibson @ 2022-11-04  8:43 UTC (permalink / raw)
  To: passt-dev, Stefano Brivio; +Cc: David Gibson

When given an IPv4 address tcp_hash_match() checks if the connection
stores an IPv4-mapped IPv6 address, and if so compares the mapped part of
that address to the given address.  This is equivalent to converting a
given IPv4 address to an IPv4-mapped IPv6 address then comparing it to the
connection address using the existing IPv6 logic.

Convert to this slightly simpler form, which will also allow some further
simplifications in future.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
 tcp.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/tcp.c b/tcp.c
index 26dd268..508d6e9 100644
--- a/tcp.c
+++ b/tcp.c
@@ -1285,13 +1285,14 @@ static int tcp_opt_get(const char *opts, size_t len, uint8_t type_find,
 static int tcp_hash_match(const struct tcp_conn *conn, int af, const void *addr,
 			  in_port_t tap_port, in_port_t sock_port)
 {
-	if (af == AF_INET && CONN_V4(conn)			&&
-	    !memcmp(&conn->a.a4.a, addr, sizeof(conn->a.a4.a))	&&
-	    conn->tap_port == tap_port && conn->sock_port == sock_port)
-		return 1;
+	struct in6_addr v4mapped;
+
+	if (af == AF_INET) {
+		encode_ip4mapped_ip6(&v4mapped, addr);
+		addr = &v4mapped;
+	}
 
-	if (af == AF_INET6					&&
-	    IN6_ARE_ADDR_EQUAL(&conn->a.a6, addr)		&&
+	if (IN6_ARE_ADDR_EQUAL(&conn->a.a6, addr)		&&
 	    conn->tap_port == tap_port && conn->sock_port == sock_port)
 		return 1;
 
-- 
@@ -1285,13 +1285,14 @@ static int tcp_opt_get(const char *opts, size_t len, uint8_t type_find,
 static int tcp_hash_match(const struct tcp_conn *conn, int af, const void *addr,
 			  in_port_t tap_port, in_port_t sock_port)
 {
-	if (af == AF_INET && CONN_V4(conn)			&&
-	    !memcmp(&conn->a.a4.a, addr, sizeof(conn->a.a4.a))	&&
-	    conn->tap_port == tap_port && conn->sock_port == sock_port)
-		return 1;
+	struct in6_addr v4mapped;
+
+	if (af == AF_INET) {
+		encode_ip4mapped_ip6(&v4mapped, addr);
+		addr = &v4mapped;
+	}
 
-	if (af == AF_INET6					&&
-	    IN6_ARE_ADDR_EQUAL(&conn->a.a6, addr)		&&
+	if (IN6_ARE_ADDR_EQUAL(&conn->a.a6, addr)		&&
 	    conn->tap_port == tap_port && conn->sock_port == sock_port)
 		return 1;
 
-- 
2.38.1


^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [PATCH 04/10] tcp: Hash IPv4 and IPv4-mapped-IPv6 addresses the same
  2022-11-04  8:43 [PATCH 00/10] RFC: Preliminaries for using share IPv4 & IPv6 sockets David Gibson
                   ` (2 preceding siblings ...)
  2022-11-04  8:43 ` [PATCH 03/10] tcp: Partially unify IPv4 and IPv6 paths in tcp_hash_match() David Gibson
@ 2022-11-04  8:43 ` David Gibson
  2022-11-04  8:43 ` [PATCH 05/10] tcp: Take tcp_hash_insert() address from struct tcp_conn David Gibson
                   ` (6 subsequent siblings)
  10 siblings, 0 replies; 22+ messages in thread
From: David Gibson @ 2022-11-04  8:43 UTC (permalink / raw)
  To: passt-dev, Stefano Brivio; +Cc: David Gibson

In the tcp_conn structure, we represent IPv4 connections with IPv4-mapped
IPv6 addresses in the single address field.  However, we have different
paths which will calculate different hashes for IPv4 and equivalent
IPv4-mapped IPv6 addresses.  This will cause problems for some future
changes.

Make the hash function work the same for these two cases, by always mapping
IPv4 addresses into IPv4-mapped addresses before hashing.  This involves
some ugly temporaries, but later changes should improve this again.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
 siphash.c |  1 +
 tcp.c     | 33 +++++++++++++++------------------
 2 files changed, 16 insertions(+), 18 deletions(-)

diff --git a/siphash.c b/siphash.c
index 37a6d73..516a508 100644
--- a/siphash.c
+++ b/siphash.c
@@ -104,6 +104,7 @@
  *
  * Return: the 64-bit hash output
  */
+/* cppcheck-suppress unusedFunction */
 uint64_t siphash_8b(const uint8_t *in, const uint64_t *k)
 {
 	PREAMBLE(8);
diff --git a/tcp.c b/tcp.c
index 508d6e9..4645004 100644
--- a/tcp.c
+++ b/tcp.c
@@ -1315,30 +1315,27 @@ __attribute__((__noinline__))	/* See comment in Makefile */
 static unsigned int tcp_hash(const struct ctx *c, int af, const void *addr,
 			     in_port_t tap_port, in_port_t sock_port)
 {
+	struct {
+		struct in6_addr addr;
+		in_port_t tap_port;
+		in_port_t sock_port;
+	} __attribute__((__packed__)) in = {
+		.tap_port = tap_port,
+		.sock_port = sock_port,
+	};
 	uint64_t b = 0;
 
 	if (af == AF_INET) {
-		struct {
-			struct in_addr addr;
-			in_port_t tap_port;
-			in_port_t sock_port;
-		} __attribute__((__packed__)) in = {
-			*(struct in_addr *)addr, tap_port, sock_port,
-		};
-
-		b = siphash_8b((uint8_t *)&in, c->tcp.hash_secret);
-	} else if (af == AF_INET6) {
-		struct {
-			struct in6_addr addr;
-			in_port_t tap_port;
-			in_port_t sock_port;
-		} __attribute__((__packed__)) in = {
-			*(struct in6_addr *)addr, tap_port, sock_port,
-		};
+		struct in6_addr v4mapped;
 
-		b = siphash_20b((uint8_t *)&in, c->tcp.hash_secret);
+		encode_ip4mapped_ip6(&v4mapped, addr);
+		in.addr = v4mapped;
+	} else {
+		in.addr = *(struct in6_addr *)addr;
 	}
 
+	b = siphash_20b((uint8_t *)&in, c->tcp.hash_secret);
+
 	return (unsigned int)(b % TCP_HASH_TABLE_SIZE);
 }
 
-- 
@@ -1315,30 +1315,27 @@ __attribute__((__noinline__))	/* See comment in Makefile */
 static unsigned int tcp_hash(const struct ctx *c, int af, const void *addr,
 			     in_port_t tap_port, in_port_t sock_port)
 {
+	struct {
+		struct in6_addr addr;
+		in_port_t tap_port;
+		in_port_t sock_port;
+	} __attribute__((__packed__)) in = {
+		.tap_port = tap_port,
+		.sock_port = sock_port,
+	};
 	uint64_t b = 0;
 
 	if (af == AF_INET) {
-		struct {
-			struct in_addr addr;
-			in_port_t tap_port;
-			in_port_t sock_port;
-		} __attribute__((__packed__)) in = {
-			*(struct in_addr *)addr, tap_port, sock_port,
-		};
-
-		b = siphash_8b((uint8_t *)&in, c->tcp.hash_secret);
-	} else if (af == AF_INET6) {
-		struct {
-			struct in6_addr addr;
-			in_port_t tap_port;
-			in_port_t sock_port;
-		} __attribute__((__packed__)) in = {
-			*(struct in6_addr *)addr, tap_port, sock_port,
-		};
+		struct in6_addr v4mapped;
 
-		b = siphash_20b((uint8_t *)&in, c->tcp.hash_secret);
+		encode_ip4mapped_ip6(&v4mapped, addr);
+		in.addr = v4mapped;
+	} else {
+		in.addr = *(struct in6_addr *)addr;
 	}
 
+	b = siphash_20b((uint8_t *)&in, c->tcp.hash_secret);
+
 	return (unsigned int)(b % TCP_HASH_TABLE_SIZE);
 }
 
-- 
2.38.1


^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [PATCH 05/10] tcp: Take tcp_hash_insert() address from struct tcp_conn
  2022-11-04  8:43 [PATCH 00/10] RFC: Preliminaries for using share IPv4 & IPv6 sockets David Gibson
                   ` (3 preceding siblings ...)
  2022-11-04  8:43 ` [PATCH 04/10] tcp: Hash IPv4 and IPv4-mapped-IPv6 addresses the same David Gibson
@ 2022-11-04  8:43 ` David Gibson
  2022-11-04  8:43 ` [PATCH 06/10] tcp: Unify IPv4 and IPv6 paths for hashing and matching David Gibson
                   ` (5 subsequent siblings)
  10 siblings, 0 replies; 22+ messages in thread
From: David Gibson @ 2022-11-04  8:43 UTC (permalink / raw)
  To: passt-dev, Stefano Brivio; +Cc: David Gibson

tcp_hash_insert() takes an address to control which hash bucket the
connection will go into.  For IPv6 connections that address is alreadys
stored in struct tcp_conn.  For IPv4 connections, an equivalent IPv4-mapped
IPv6 address is stored in struct tcp_conn.

Now that we've made the hashing of IPv4 and IPv4-mapped IPv6 addresses
equivalent, we can simplify tcp_hash_insert() to use the address in
struct tcp_conn, rather than taking it as an extra parameter.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
 tcp.c | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git a/tcp.c b/tcp.c
index 4645004..dfa73a3 100644
--- a/tcp.c
+++ b/tcp.c
@@ -1343,15 +1343,12 @@ static unsigned int tcp_hash(const struct ctx *c, int af, const void *addr,
  * tcp_hash_insert() - Insert connection into hash table, chain link
  * @c:		Execution context
  * @conn:	Connection pointer
- * @af:		Address family, AF_INET or AF_INET6
- * @addr:	Remote address, pointer to in_addr or in6_addr
  */
-static void tcp_hash_insert(const struct ctx *c, struct tcp_conn *conn,
-			    int af, const void *addr)
+static void tcp_hash_insert(const struct ctx *c, struct tcp_conn *conn)
 {
 	int b;
 
-	b = tcp_hash(c, af, addr, conn->tap_port, conn->sock_port);
+	b = tcp_hash(c, AF_INET6, &conn->a.a6, conn->tap_port, conn->sock_port);
 	conn->next_index = tc_hash[b] ? tc_hash[b] - tc : -1;
 	tc_hash[b] = conn;
 	conn->hash_bucket = b;
@@ -2233,7 +2230,7 @@ static void tcp_conn_from_tap(struct ctx *c, int af, const void *addr,
 	conn->seq_to_tap = tcp_seq_init(c, af, addr, th->dest, th->source, now);
 	conn->seq_ack_from_tap = conn->seq_to_tap + 1;
 
-	tcp_hash_insert(c, conn, af, addr);
+	tcp_hash_insert(c, conn);
 
 	if (!bind(s, sa, sl)) {
 		tcp_rst(c, conn);	/* Nobody is listening then */
@@ -2891,8 +2888,6 @@ static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref,
 						conn->sock_port,
 						conn->tap_port,
 						now);
-
-		tcp_hash_insert(c, conn, AF_INET6, &sa6.sin6_addr);
 	} else {
 		struct sockaddr_in sa4;
 
@@ -2912,10 +2907,10 @@ static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref,
 						conn->sock_port,
 						conn->tap_port,
 						now);
-
-		tcp_hash_insert(c, conn, AF_INET, &sa4.sin_addr);
 	}
 
+	tcp_hash_insert(c, conn);
+
 	conn->seq_ack_from_tap = conn->seq_to_tap + 1;
 
 	conn->wnd_from_tap = WINDOW_DEFAULT;
-- 
@@ -1343,15 +1343,12 @@ static unsigned int tcp_hash(const struct ctx *c, int af, const void *addr,
  * tcp_hash_insert() - Insert connection into hash table, chain link
  * @c:		Execution context
  * @conn:	Connection pointer
- * @af:		Address family, AF_INET or AF_INET6
- * @addr:	Remote address, pointer to in_addr or in6_addr
  */
-static void tcp_hash_insert(const struct ctx *c, struct tcp_conn *conn,
-			    int af, const void *addr)
+static void tcp_hash_insert(const struct ctx *c, struct tcp_conn *conn)
 {
 	int b;
 
-	b = tcp_hash(c, af, addr, conn->tap_port, conn->sock_port);
+	b = tcp_hash(c, AF_INET6, &conn->a.a6, conn->tap_port, conn->sock_port);
 	conn->next_index = tc_hash[b] ? tc_hash[b] - tc : -1;
 	tc_hash[b] = conn;
 	conn->hash_bucket = b;
@@ -2233,7 +2230,7 @@ static void tcp_conn_from_tap(struct ctx *c, int af, const void *addr,
 	conn->seq_to_tap = tcp_seq_init(c, af, addr, th->dest, th->source, now);
 	conn->seq_ack_from_tap = conn->seq_to_tap + 1;
 
-	tcp_hash_insert(c, conn, af, addr);
+	tcp_hash_insert(c, conn);
 
 	if (!bind(s, sa, sl)) {
 		tcp_rst(c, conn);	/* Nobody is listening then */
@@ -2891,8 +2888,6 @@ static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref,
 						conn->sock_port,
 						conn->tap_port,
 						now);
-
-		tcp_hash_insert(c, conn, AF_INET6, &sa6.sin6_addr);
 	} else {
 		struct sockaddr_in sa4;
 
@@ -2912,10 +2907,10 @@ static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref,
 						conn->sock_port,
 						conn->tap_port,
 						now);
-
-		tcp_hash_insert(c, conn, AF_INET, &sa4.sin_addr);
 	}
 
+	tcp_hash_insert(c, conn);
+
 	conn->seq_ack_from_tap = conn->seq_to_tap + 1;
 
 	conn->wnd_from_tap = WINDOW_DEFAULT;
-- 
2.38.1


^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [PATCH 06/10] tcp:  Unify IPv4 and IPv6 paths for hashing and matching
  2022-11-04  8:43 [PATCH 00/10] RFC: Preliminaries for using share IPv4 & IPv6 sockets David Gibson
                   ` (4 preceding siblings ...)
  2022-11-04  8:43 ` [PATCH 05/10] tcp: Take tcp_hash_insert() address from struct tcp_conn David Gibson
@ 2022-11-04  8:43 ` David Gibson
  2022-11-04  8:43 ` [PATCH 07/10] tcp: Remove ugly address union from struct tcp_conn David Gibson
                   ` (4 subsequent siblings)
  10 siblings, 0 replies; 22+ messages in thread
From: David Gibson @ 2022-11-04  8:43 UTC (permalink / raw)
  To: passt-dev, Stefano Brivio; +Cc: David Gibson

IPv4 are now hashed to match equivalent IPv4-mapped IPv6 addresses.  This
means we can avoid having IPv4 specific paths in the lower level hash and
match functions, instead just dealing with the equivalent IPv4-mapped IPv6
addresses.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
 tcp.c | 44 +++++++++++++++++---------------------------
 1 file changed, 17 insertions(+), 27 deletions(-)

diff --git a/tcp.c b/tcp.c
index dfa73a3..3816a1c 100644
--- a/tcp.c
+++ b/tcp.c
@@ -1275,23 +1275,16 @@ static int tcp_opt_get(const char *opts, size_t len, uint8_t type_find,
 /**
  * tcp_hash_match() - Check if a connection entry matches address and ports
  * @conn:	Connection entry to match against
- * @af:		Address family, AF_INET or AF_INET6
- * @addr:	Remote address, pointer to in_addr or in6_addr
+ * @addr:	Remote address, may be IPv4-mapped
  * @tap_port:	tap-facing port
  * @sock_port:	Socket-facing port
  *
  * Return: 1 on match, 0 otherwise
  */
-static int tcp_hash_match(const struct tcp_conn *conn, int af, const void *addr,
+static int tcp_hash_match(const struct tcp_conn *conn,
+			  const struct in6_addr *addr,
 			  in_port_t tap_port, in_port_t sock_port)
 {
-	struct in6_addr v4mapped;
-
-	if (af == AF_INET) {
-		encode_ip4mapped_ip6(&v4mapped, addr);
-		addr = &v4mapped;
-	}
-
 	if (IN6_ARE_ADDR_EQUAL(&conn->a.a6, addr)		&&
 	    conn->tap_port == tap_port && conn->sock_port == sock_port)
 		return 1;
@@ -1302,8 +1295,7 @@ static int tcp_hash_match(const struct tcp_conn *conn, int af, const void *addr,
 /**
  * tcp_hash() - Calculate hash value for connection given address and ports
  * @c:		Execution context
- * @af:		Address family, AF_INET or AF_INET6
- * @addr:	Remote address, pointer to in_addr or in6_addr
+ * @addr:	Remote address, may be IPv4-mapped
  * @tap_port:	tap-facing port
  * @sock_port:	Socket-facing port
  *
@@ -1312,7 +1304,7 @@ static int tcp_hash_match(const struct tcp_conn *conn, int af, const void *addr,
 #if TCP_HASH_NOINLINE
 __attribute__((__noinline__))	/* See comment in Makefile */
 #endif
-static unsigned int tcp_hash(const struct ctx *c, int af, const void *addr,
+static unsigned int tcp_hash(const struct ctx *c, const struct in6_addr *addr,
 			     in_port_t tap_port, in_port_t sock_port)
 {
 	struct {
@@ -1320,20 +1312,10 @@ static unsigned int tcp_hash(const struct ctx *c, int af, const void *addr,
 		in_port_t tap_port;
 		in_port_t sock_port;
 	} __attribute__((__packed__)) in = {
-		.tap_port = tap_port,
-		.sock_port = sock_port,
+		*addr, tap_port, sock_port,
 	};
 	uint64_t b = 0;
 
-	if (af == AF_INET) {
-		struct in6_addr v4mapped;
-
-		encode_ip4mapped_ip6(&v4mapped, addr);
-		in.addr = v4mapped;
-	} else {
-		in.addr = *(struct in6_addr *)addr;
-	}
-
 	b = siphash_20b((uint8_t *)&in, c->tcp.hash_secret);
 
 	return (unsigned int)(b % TCP_HASH_TABLE_SIZE);
@@ -1348,7 +1330,7 @@ static void tcp_hash_insert(const struct ctx *c, struct tcp_conn *conn)
 {
 	int b;
 
-	b = tcp_hash(c, AF_INET6, &conn->a.a6, conn->tap_port, conn->sock_port);
+	b = tcp_hash(c, &conn->a.a6, conn->tap_port, conn->sock_port);
 	conn->next_index = tc_hash[b] ? tc_hash[b] - tc : -1;
 	tc_hash[b] = conn;
 	conn->hash_bucket = b;
@@ -1422,11 +1404,19 @@ static struct tcp_conn *tcp_hash_lookup(const struct ctx *c, int af,
 					const void *addr,
 					in_port_t tap_port, in_port_t sock_port)
 {
-	int b = tcp_hash(c, af, addr, tap_port, sock_port);
+	struct in6_addr v4mapped;
 	struct tcp_conn *conn;
+	int b;
+
+	if (af == AF_INET) {
+		encode_ip4mapped_ip6(&v4mapped, addr);
+		addr = &v4mapped;
+	}
+
+	b = tcp_hash(c, addr, tap_port, sock_port);
 
 	for (conn = tc_hash[b]; conn; conn = CONN_OR_NULL(conn->next_index)) {
-		if (tcp_hash_match(conn, af, addr, tap_port, sock_port))
+		if (tcp_hash_match(conn, addr, tap_port, sock_port))
 			return conn;
 	}
 
-- 
@@ -1275,23 +1275,16 @@ static int tcp_opt_get(const char *opts, size_t len, uint8_t type_find,
 /**
  * tcp_hash_match() - Check if a connection entry matches address and ports
  * @conn:	Connection entry to match against
- * @af:		Address family, AF_INET or AF_INET6
- * @addr:	Remote address, pointer to in_addr or in6_addr
+ * @addr:	Remote address, may be IPv4-mapped
  * @tap_port:	tap-facing port
  * @sock_port:	Socket-facing port
  *
  * Return: 1 on match, 0 otherwise
  */
-static int tcp_hash_match(const struct tcp_conn *conn, int af, const void *addr,
+static int tcp_hash_match(const struct tcp_conn *conn,
+			  const struct in6_addr *addr,
 			  in_port_t tap_port, in_port_t sock_port)
 {
-	struct in6_addr v4mapped;
-
-	if (af == AF_INET) {
-		encode_ip4mapped_ip6(&v4mapped, addr);
-		addr = &v4mapped;
-	}
-
 	if (IN6_ARE_ADDR_EQUAL(&conn->a.a6, addr)		&&
 	    conn->tap_port == tap_port && conn->sock_port == sock_port)
 		return 1;
@@ -1302,8 +1295,7 @@ static int tcp_hash_match(const struct tcp_conn *conn, int af, const void *addr,
 /**
  * tcp_hash() - Calculate hash value for connection given address and ports
  * @c:		Execution context
- * @af:		Address family, AF_INET or AF_INET6
- * @addr:	Remote address, pointer to in_addr or in6_addr
+ * @addr:	Remote address, may be IPv4-mapped
  * @tap_port:	tap-facing port
  * @sock_port:	Socket-facing port
  *
@@ -1312,7 +1304,7 @@ static int tcp_hash_match(const struct tcp_conn *conn, int af, const void *addr,
 #if TCP_HASH_NOINLINE
 __attribute__((__noinline__))	/* See comment in Makefile */
 #endif
-static unsigned int tcp_hash(const struct ctx *c, int af, const void *addr,
+static unsigned int tcp_hash(const struct ctx *c, const struct in6_addr *addr,
 			     in_port_t tap_port, in_port_t sock_port)
 {
 	struct {
@@ -1320,20 +1312,10 @@ static unsigned int tcp_hash(const struct ctx *c, int af, const void *addr,
 		in_port_t tap_port;
 		in_port_t sock_port;
 	} __attribute__((__packed__)) in = {
-		.tap_port = tap_port,
-		.sock_port = sock_port,
+		*addr, tap_port, sock_port,
 	};
 	uint64_t b = 0;
 
-	if (af == AF_INET) {
-		struct in6_addr v4mapped;
-
-		encode_ip4mapped_ip6(&v4mapped, addr);
-		in.addr = v4mapped;
-	} else {
-		in.addr = *(struct in6_addr *)addr;
-	}
-
 	b = siphash_20b((uint8_t *)&in, c->tcp.hash_secret);
 
 	return (unsigned int)(b % TCP_HASH_TABLE_SIZE);
@@ -1348,7 +1330,7 @@ static void tcp_hash_insert(const struct ctx *c, struct tcp_conn *conn)
 {
 	int b;
 
-	b = tcp_hash(c, AF_INET6, &conn->a.a6, conn->tap_port, conn->sock_port);
+	b = tcp_hash(c, &conn->a.a6, conn->tap_port, conn->sock_port);
 	conn->next_index = tc_hash[b] ? tc_hash[b] - tc : -1;
 	tc_hash[b] = conn;
 	conn->hash_bucket = b;
@@ -1422,11 +1404,19 @@ static struct tcp_conn *tcp_hash_lookup(const struct ctx *c, int af,
 					const void *addr,
 					in_port_t tap_port, in_port_t sock_port)
 {
-	int b = tcp_hash(c, af, addr, tap_port, sock_port);
+	struct in6_addr v4mapped;
 	struct tcp_conn *conn;
+	int b;
+
+	if (af == AF_INET) {
+		encode_ip4mapped_ip6(&v4mapped, addr);
+		addr = &v4mapped;
+	}
+
+	b = tcp_hash(c, addr, tap_port, sock_port);
 
 	for (conn = tc_hash[b]; conn; conn = CONN_OR_NULL(conn->next_index)) {
-		if (tcp_hash_match(conn, af, addr, tap_port, sock_port))
+		if (tcp_hash_match(conn, addr, tap_port, sock_port))
 			return conn;
 	}
 
-- 
2.38.1


^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [PATCH 07/10] tcp: Remove ugly address union from struct tcp_conn
  2022-11-04  8:43 [PATCH 00/10] RFC: Preliminaries for using share IPv4 & IPv6 sockets David Gibson
                   ` (5 preceding siblings ...)
  2022-11-04  8:43 ` [PATCH 06/10] tcp: Unify IPv4 and IPv6 paths for hashing and matching David Gibson
@ 2022-11-04  8:43 ` David Gibson
  2022-11-07 18:08   ` Stefano Brivio
  2022-11-04  8:43 ` [PATCH 08/10] tcp: Unify initial sequence numbers for IPv4 and IPv6 David Gibson
                   ` (3 subsequent siblings)
  10 siblings, 1 reply; 22+ messages in thread
From: David Gibson @ 2022-11-04  8:43 UTC (permalink / raw)
  To: passt-dev, Stefano Brivio; +Cc: David Gibson

We now only extract the v4 part of the tcp_conn::a union in one place.
Make this neater by using a helper to extract the IPv4 address from the
mapped address in that place.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
 tcp.c  | 40 +++++++++++++++-------------------------
 util.c | 13 +++++++++++++
 util.h |  1 +
 3 files changed, 29 insertions(+), 25 deletions(-)

diff --git a/tcp.c b/tcp.c
index 3816a1c..6634abb 100644
--- a/tcp.c
+++ b/tcp.c
@@ -416,10 +416,7 @@ struct tcp6_l2_head {	/* For MSS6 macro: keep in sync with tcp6_l2_buf_t */
  * @ws_to_tap:		Window scaling factor advertised to tap/guest
  * @sndbuf:		Sending buffer in kernel, rounded to 2 ^ SNDBUF_BITS
  * @seq_dup_ack_approx:	Last duplicate ACK number sent to tap
- * @a.a6:		IPv6 remote address, can be IPv4-mapped
- * @a.a4.zero:		Zero prefix for IPv4-mapped, see RFC 6890, Table 20
- * @a.a4.one:		Ones prefix for IPv4-mapped
- * @a.a4.a:		IPv4 address
+ * @addr:		IPv6 remote address, can be IPv4-mapped
  * @tap_port:		Guest-facing tap port
  * @sock_port:		Remote, socket-facing port
  * @wnd_from_tap:	Last window size from tap, unscaled (as received)
@@ -489,15 +486,8 @@ struct tcp_conn {
 	uint8_t		seq_dup_ack_approx;
 
 
-	union {
-		struct in6_addr a6;
-		struct {
-			uint8_t zero[10];
-			uint8_t one[2];
-			struct in_addr a;
-		} a4;
-	} a;
-#define CONN_V4(conn)		IN6_IS_ADDR_V4MAPPED(&conn->a.a6)
+	struct in6_addr addr;
+#define CONN_V4(conn)		IN6_IS_ADDR_V4MAPPED(&conn->addr)
 #define CONN_V6(conn)		(!CONN_V4(conn))
 
 	in_port_t	tap_port;
@@ -960,7 +950,7 @@ static int tcp_rtt_dst_low(const struct tcp_conn *conn)
 	int i;
 
 	for (i = 0; i < LOW_RTT_TABLE_SIZE; i++)
-		if (IN6_ARE_ADDR_EQUAL(&conn->a.a6, low_rtt_dst + i))
+		if (IN6_ARE_ADDR_EQUAL(&conn->addr, low_rtt_dst + i))
 			return 1;
 
 	return 0;
@@ -982,7 +972,7 @@ static void tcp_rtt_dst_check(const struct tcp_conn *conn,
 		return;
 
 	for (i = 0; i < LOW_RTT_TABLE_SIZE; i++) {
-		if (IN6_ARE_ADDR_EQUAL(&conn->a.a6, low_rtt_dst + i))
+		if (IN6_ARE_ADDR_EQUAL(&conn->addr, low_rtt_dst + i))
 			return;
 		if (hole == -1 && IN6_IS_ADDR_UNSPECIFIED(low_rtt_dst + i))
 			hole = i;
@@ -994,10 +984,10 @@ static void tcp_rtt_dst_check(const struct tcp_conn *conn,
 	if (hole == -1)
 		return;
 
-	memcpy(low_rtt_dst + hole++, &conn->a.a6, sizeof(conn->a.a6));
+	memcpy(low_rtt_dst + hole++, &conn->addr, sizeof(conn->addr));
 	if (hole == LOW_RTT_TABLE_SIZE)
 		hole = 0;
-	memcpy(low_rtt_dst + hole, &in6addr_any, sizeof(conn->a.a6));
+	memcpy(low_rtt_dst + hole, &in6addr_any, sizeof(conn->addr));
 #else
 	(void)conn;
 	(void)tinfo;
@@ -1285,7 +1275,7 @@ static int tcp_hash_match(const struct tcp_conn *conn,
 			  const struct in6_addr *addr,
 			  in_port_t tap_port, in_port_t sock_port)
 {
-	if (IN6_ARE_ADDR_EQUAL(&conn->a.a6, addr)		&&
+	if (IN6_ARE_ADDR_EQUAL(&conn->addr, addr)		&&
 	    conn->tap_port == tap_port && conn->sock_port == sock_port)
 		return 1;
 
@@ -1330,7 +1320,7 @@ static void tcp_hash_insert(const struct ctx *c, struct tcp_conn *conn)
 {
 	int b;
 
-	b = tcp_hash(c, &conn->a.a6, conn->tap_port, conn->sock_port);
+	b = tcp_hash(c, &conn->addr, conn->tap_port, conn->sock_port);
 	conn->next_index = tc_hash[b] ? tc_hash[b] - tc : -1;
 	tc_hash[b] = conn;
 	conn->hash_bucket = b;
@@ -1660,7 +1650,7 @@ do {									\
 		ip_len = plen + sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
 
 		b->ip6h.payload_len = htons(plen + sizeof(struct tcphdr));
-		b->ip6h.saddr = conn->a.a6;
+		b->ip6h.saddr = conn->addr;
 		if (IN6_IS_ADDR_LINKLOCAL(&b->ip6h.saddr))
 			b->ip6h.daddr = c->ip6.addr_ll_seen;
 		else
@@ -1684,7 +1674,7 @@ do {									\
 
 		ip_len = plen + sizeof(struct iphdr) + sizeof(struct tcphdr);
 		b->iph.tot_len = htons(ip_len);
-		b->iph.saddr = conn->a.a4.a.s_addr;
+		b->iph.saddr = decode_ip4mapped_ip6(&conn->addr).s_addr;
 		b->iph.daddr = c->ip4.addr_seen.s_addr;
 
 		if (check)
@@ -2202,12 +2192,12 @@ static void tcp_conn_from_tap(struct ctx *c, int af, const void *addr,
 		sa = (struct sockaddr *)&addr4;
 		sl = sizeof(addr4);
 
-		encode_ip4mapped_ip6(&conn->a.a6, addr);
+		encode_ip4mapped_ip6(&conn->addr, addr);
 	} else {
 		sa = (struct sockaddr *)&addr6;
 		sl = sizeof(addr6);
 
-		memcpy(&conn->a.a6,      addr, sizeof(conn->a.a6));
+		memcpy(&conn->addr,      addr, sizeof(conn->addr));
 	}
 
 	conn->sock_port = ntohs(th->dest);
@@ -2869,7 +2859,7 @@ static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref,
 			memcpy(&sa6.sin6_addr, src, sizeof(*src));
 		}
 
-		memcpy(&conn->a.a6, &sa6.sin6_addr, sizeof(conn->a.a6));
+		memcpy(&conn->addr, &sa6.sin6_addr, sizeof(conn->addr));
 
 		conn->sock_port = ntohs(sa6.sin6_port);
 		conn->tap_port = ref.r.p.tcp.tcp.index;
@@ -2888,7 +2878,7 @@ static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref,
 		    IN4_ARE_ADDR_EQUAL(&sa4.sin_addr, &c->ip4.addr_seen))
 			sa4.sin_addr = c->ip4.gw;
 
-		encode_ip4mapped_ip6(&conn->a.a6, &sa4.sin_addr);
+		encode_ip4mapped_ip6(&conn->addr, &sa4.sin_addr);
 
 		conn->sock_port = ntohs(sa4.sin_port);
 		conn->tap_port = ref.r.p.tcp.tcp.index;
diff --git a/util.c b/util.c
index 257d0b6..ec7f57f 100644
--- a/util.c
+++ b/util.c
@@ -502,3 +502,16 @@ void encode_ip4mapped_ip6(struct in6_addr *ip6, const void *ip4)
 	memset(&a->one, 0xff, sizeof(a->one));
 	memcpy(&a->a4, ip4, sizeof(a->a4));
 }
+
+/**
+ * decode_ip4mapped_ip6() - Decode an IPv4-mapped IPv6 address
+ * @ip6:	IPv6 address
+ *
+ * Returns:	IPv4 address
+ */
+struct in_addr decode_ip4mapped_ip6(const struct in6_addr *ip6)
+{
+	const struct ip4mapped_ip6 *a = (const struct ip4mapped_ip6 *)ip6;
+
+	return a->a4;
+}
diff --git a/util.h b/util.h
index f7d6a6f..103211d 100644
--- a/util.h
+++ b/util.h
@@ -210,5 +210,6 @@ int __daemon(int pidfile_fd, int devnull_fd);
 int fls(unsigned long x);
 int write_file(const char *path, const char *buf);
 void encode_ip4mapped_ip6(struct in6_addr *ip6, const void *ip4);
+struct in_addr decode_ip4mapped_ip6(const struct in6_addr *ip6);
 
 #endif /* UTIL_H */
-- 
@@ -210,5 +210,6 @@ int __daemon(int pidfile_fd, int devnull_fd);
 int fls(unsigned long x);
 int write_file(const char *path, const char *buf);
 void encode_ip4mapped_ip6(struct in6_addr *ip6, const void *ip4);
+struct in_addr decode_ip4mapped_ip6(const struct in6_addr *ip6);
 
 #endif /* UTIL_H */
-- 
2.38.1


^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [PATCH 08/10] tcp: Unify initial sequence numbers for IPv4 and IPv6
  2022-11-04  8:43 [PATCH 00/10] RFC: Preliminaries for using share IPv4 & IPv6 sockets David Gibson
                   ` (6 preceding siblings ...)
  2022-11-04  8:43 ` [PATCH 07/10] tcp: Remove ugly address union from struct tcp_conn David Gibson
@ 2022-11-04  8:43 ` David Gibson
  2022-11-04  8:43 ` [PATCH 09/10] tcp: Have tcp_seq_init() take its parameters from struct tcp_conn David Gibson
                   ` (2 subsequent siblings)
  10 siblings, 0 replies; 22+ messages in thread
From: David Gibson @ 2022-11-04  8:43 UTC (permalink / raw)
  To: passt-dev, Stefano Brivio; +Cc: David Gibson

tcp_seq_init() has separate paths for IPv4 and IPv6 addresses.  Convert it
to convert IPv4 addresses to IPv4-mapped IPv6 addresses then compute the
siphash as for IPv6.  This is slightly simpler, and means that "true" IPv4
connections and "IPv6" connections using mapped addresses will have
compatible sequence numbers.  This will allow additional improvements in
future.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
 siphash.c |  1 +
 tcp.c     | 46 +++++++++++++++++++---------------------------
 2 files changed, 20 insertions(+), 27 deletions(-)

diff --git a/siphash.c b/siphash.c
index 516a508..811918b 100644
--- a/siphash.c
+++ b/siphash.c
@@ -123,6 +123,7 @@ uint64_t siphash_8b(const uint8_t *in, const uint64_t *k)
  *
  * Return: 32 bits obtained by XORing the two halves of the 64-bit hash output
  */
+/* cppcheck-suppress unusedFunction */
 uint32_t siphash_12b(const uint8_t *in, const uint64_t *k)
 {
 	uint32_t *in32 = (uint32_t *)in;
diff --git a/tcp.c b/tcp.c
index 6634abb..b9d0510 100644
--- a/tcp.c
+++ b/tcp.c
@@ -2011,38 +2011,30 @@ static uint32_t tcp_seq_init(const struct ctx *c, int af, const void *addr,
 			     in_port_t dstport, in_port_t srcport,
 			     const struct timespec *now)
 {
+	struct {
+		struct in6_addr src;
+		in_port_t srcport;
+		struct in6_addr dst;
+		in_port_t dstport;
+	} __attribute__((__packed__)) in = {
+		.srcport = srcport,
+		.dstport = dstport,
+	};
 	uint32_t ns, seq = 0;
 
 	if (af == AF_INET) {
-		struct {
-			struct in_addr src;
-			in_port_t srcport;
-			struct in_addr dst;
-			in_port_t dstport;
-		} __attribute__((__packed__)) in = {
-			.src = *(struct in_addr *)addr,
-			.srcport = srcport,
-			.dst = c->ip4.addr,
-			.dstport = dstport,
-		};
-
-		seq = siphash_12b((uint8_t *)&in, c->tcp.hash_secret);
-	} else if (af == AF_INET6) {
-		struct {
-			struct in6_addr src;
-			in_port_t srcport;
-			struct in6_addr dst;
-			in_port_t dstport;
-		} __attribute__((__packed__)) in = {
-			.src = *(struct in6_addr *)addr,
-			.srcport = srcport,
-			.dst = c->ip6.addr,
-			.dstport = dstport,
-		};
-
-		seq = siphash_36b((uint8_t *)&in, c->tcp.hash_secret);
+		struct in6_addr tmp;
+		encode_ip4mapped_ip6(&tmp, addr);
+		in.src = tmp;
+		encode_ip4mapped_ip6(&tmp, &c->ip4.addr);
+		in.dst = tmp;
+	} else {
+		in.src = *(struct in6_addr *)addr;
+		in.dst = c->ip6.addr;
 	}
 
+	seq = siphash_36b((uint8_t *)&in, c->tcp.hash_secret);
+
 	ns = now->tv_sec * 1E9;
 	ns += now->tv_nsec >> 5; /* 32ns ticks, overflows 32 bits every 137s */
 
-- 
@@ -2011,38 +2011,30 @@ static uint32_t tcp_seq_init(const struct ctx *c, int af, const void *addr,
 			     in_port_t dstport, in_port_t srcport,
 			     const struct timespec *now)
 {
+	struct {
+		struct in6_addr src;
+		in_port_t srcport;
+		struct in6_addr dst;
+		in_port_t dstport;
+	} __attribute__((__packed__)) in = {
+		.srcport = srcport,
+		.dstport = dstport,
+	};
 	uint32_t ns, seq = 0;
 
 	if (af == AF_INET) {
-		struct {
-			struct in_addr src;
-			in_port_t srcport;
-			struct in_addr dst;
-			in_port_t dstport;
-		} __attribute__((__packed__)) in = {
-			.src = *(struct in_addr *)addr,
-			.srcport = srcport,
-			.dst = c->ip4.addr,
-			.dstport = dstport,
-		};
-
-		seq = siphash_12b((uint8_t *)&in, c->tcp.hash_secret);
-	} else if (af == AF_INET6) {
-		struct {
-			struct in6_addr src;
-			in_port_t srcport;
-			struct in6_addr dst;
-			in_port_t dstport;
-		} __attribute__((__packed__)) in = {
-			.src = *(struct in6_addr *)addr,
-			.srcport = srcport,
-			.dst = c->ip6.addr,
-			.dstport = dstport,
-		};
-
-		seq = siphash_36b((uint8_t *)&in, c->tcp.hash_secret);
+		struct in6_addr tmp;
+		encode_ip4mapped_ip6(&tmp, addr);
+		in.src = tmp;
+		encode_ip4mapped_ip6(&tmp, &c->ip4.addr);
+		in.dst = tmp;
+	} else {
+		in.src = *(struct in6_addr *)addr;
+		in.dst = c->ip6.addr;
 	}
 
+	seq = siphash_36b((uint8_t *)&in, c->tcp.hash_secret);
+
 	ns = now->tv_sec * 1E9;
 	ns += now->tv_nsec >> 5; /* 32ns ticks, overflows 32 bits every 137s */
 
-- 
2.38.1


^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [PATCH 09/10] tcp: Have tcp_seq_init() take its parameters from struct tcp_conn
  2022-11-04  8:43 [PATCH 00/10] RFC: Preliminaries for using share IPv4 & IPv6 sockets David Gibson
                   ` (7 preceding siblings ...)
  2022-11-04  8:43 ` [PATCH 08/10] tcp: Unify initial sequence numbers for IPv4 and IPv6 David Gibson
@ 2022-11-04  8:43 ` David Gibson
  2022-11-04  8:43 ` [PATCH 10/10] tcp: Fix small error in tcp_seq_init() time handling David Gibson
  2022-11-04  8:47 ` [PATCH 00/10] RFC: Preliminaries for using share IPv4 & IPv6 sockets Stefano Brivio
  10 siblings, 0 replies; 22+ messages in thread
From: David Gibson @ 2022-11-04  8:43 UTC (permalink / raw)
  To: passt-dev, Stefano Brivio; +Cc: David Gibson

tcp_seq_init() takes a number of parameters for the connection, but at
every call site, these are already populated in the tcp_conn structure.
Likewise we always store the result into the @seq_to_tap field.

Use this to simplify tcp_seq_init().

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
 tcp.c | 43 +++++++++++++------------------------------
 1 file changed, 13 insertions(+), 30 deletions(-)

diff --git a/tcp.c b/tcp.c
index b9d0510..59e03ff 100644
--- a/tcp.c
+++ b/tcp.c
@@ -1999,17 +1999,11 @@ static void tcp_clamp_window(const struct ctx *c, struct tcp_conn *conn,
 /**
  * tcp_seq_init() - Calculate initial sequence number according to RFC 6528
  * @c:		Execution context
- * @af:		Address family, AF_INET or AF_INET6
- * @addr:	Remote address, pointer to in_addr or in6_addr
- * @dstport:	Destination port, connection-wise, network order
- * @srcport:	Source port, connection-wise, network order
+ * @conn:	TCP connection, with addr, sock_port and tap_port populated
  * @now:	Current timestamp
- *
- * Return: initial TCP sequence
  */
-static uint32_t tcp_seq_init(const struct ctx *c, int af, const void *addr,
-			     in_port_t dstport, in_port_t srcport,
-			     const struct timespec *now)
+static void tcp_seq_init(const struct ctx *c, struct tcp_conn *conn,
+			 const struct timespec *now)
 {
 	struct {
 		struct in6_addr src;
@@ -2017,19 +2011,17 @@ static uint32_t tcp_seq_init(const struct ctx *c, int af, const void *addr,
 		struct in6_addr dst;
 		in_port_t dstport;
 	} __attribute__((__packed__)) in = {
-		.srcport = srcport,
-		.dstport = dstport,
+		.src = conn->addr,
+		.srcport = conn->tap_port,
+		.dstport = conn->sock_port,
 	};
 	uint32_t ns, seq = 0;
 
-	if (af == AF_INET) {
-		struct in6_addr tmp;
-		encode_ip4mapped_ip6(&tmp, addr);
-		in.src = tmp;
-		encode_ip4mapped_ip6(&tmp, &c->ip4.addr);
-		in.dst = tmp;
+	if (CONN_V4(conn)) {
+		struct in6_addr v4mapped;
+		encode_ip4mapped_ip6(&v4mapped, &c->ip4.addr);
+		in.dst = v4mapped;
 	} else {
-		in.src = *(struct in6_addr *)addr;
 		in.dst = c->ip6.addr;
 	}
 
@@ -2038,7 +2030,7 @@ static uint32_t tcp_seq_init(const struct ctx *c, int af, const void *addr,
 	ns = now->tv_sec * 1E9;
 	ns += now->tv_nsec >> 5; /* 32ns ticks, overflows 32 bits every 137s */
 
-	return seq + ns;
+	conn->seq_to_tap = seq + ns;
 }
 
 /**
@@ -2199,7 +2191,7 @@ static void tcp_conn_from_tap(struct ctx *c, int af, const void *addr,
 	conn->seq_from_tap = conn->seq_init_from_tap + 1;
 	conn->seq_ack_to_tap = conn->seq_from_tap;
 
-	conn->seq_to_tap = tcp_seq_init(c, af, addr, th->dest, th->source, now);
+	tcp_seq_init(c, conn, now);
 	conn->seq_ack_from_tap = conn->seq_to_tap + 1;
 
 	tcp_hash_insert(c, conn);
@@ -2855,11 +2847,6 @@ static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref,
 
 		conn->sock_port = ntohs(sa6.sin6_port);
 		conn->tap_port = ref.r.p.tcp.tcp.index;
-
-		conn->seq_to_tap = tcp_seq_init(c, AF_INET6, &sa6.sin6_addr,
-						conn->sock_port,
-						conn->tap_port,
-						now);
 	} else {
 		struct sockaddr_in sa4;
 
@@ -2874,13 +2861,9 @@ static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref,
 
 		conn->sock_port = ntohs(sa4.sin_port);
 		conn->tap_port = ref.r.p.tcp.tcp.index;
-
-		conn->seq_to_tap = tcp_seq_init(c, AF_INET, &sa4.sin_addr,
-						conn->sock_port,
-						conn->tap_port,
-						now);
 	}
 
+	tcp_seq_init(c, conn, now);
 	tcp_hash_insert(c, conn);
 
 	conn->seq_ack_from_tap = conn->seq_to_tap + 1;
-- 
@@ -1999,17 +1999,11 @@ static void tcp_clamp_window(const struct ctx *c, struct tcp_conn *conn,
 /**
  * tcp_seq_init() - Calculate initial sequence number according to RFC 6528
  * @c:		Execution context
- * @af:		Address family, AF_INET or AF_INET6
- * @addr:	Remote address, pointer to in_addr or in6_addr
- * @dstport:	Destination port, connection-wise, network order
- * @srcport:	Source port, connection-wise, network order
+ * @conn:	TCP connection, with addr, sock_port and tap_port populated
  * @now:	Current timestamp
- *
- * Return: initial TCP sequence
  */
-static uint32_t tcp_seq_init(const struct ctx *c, int af, const void *addr,
-			     in_port_t dstport, in_port_t srcport,
-			     const struct timespec *now)
+static void tcp_seq_init(const struct ctx *c, struct tcp_conn *conn,
+			 const struct timespec *now)
 {
 	struct {
 		struct in6_addr src;
@@ -2017,19 +2011,17 @@ static uint32_t tcp_seq_init(const struct ctx *c, int af, const void *addr,
 		struct in6_addr dst;
 		in_port_t dstport;
 	} __attribute__((__packed__)) in = {
-		.srcport = srcport,
-		.dstport = dstport,
+		.src = conn->addr,
+		.srcport = conn->tap_port,
+		.dstport = conn->sock_port,
 	};
 	uint32_t ns, seq = 0;
 
-	if (af == AF_INET) {
-		struct in6_addr tmp;
-		encode_ip4mapped_ip6(&tmp, addr);
-		in.src = tmp;
-		encode_ip4mapped_ip6(&tmp, &c->ip4.addr);
-		in.dst = tmp;
+	if (CONN_V4(conn)) {
+		struct in6_addr v4mapped;
+		encode_ip4mapped_ip6(&v4mapped, &c->ip4.addr);
+		in.dst = v4mapped;
 	} else {
-		in.src = *(struct in6_addr *)addr;
 		in.dst = c->ip6.addr;
 	}
 
@@ -2038,7 +2030,7 @@ static uint32_t tcp_seq_init(const struct ctx *c, int af, const void *addr,
 	ns = now->tv_sec * 1E9;
 	ns += now->tv_nsec >> 5; /* 32ns ticks, overflows 32 bits every 137s */
 
-	return seq + ns;
+	conn->seq_to_tap = seq + ns;
 }
 
 /**
@@ -2199,7 +2191,7 @@ static void tcp_conn_from_tap(struct ctx *c, int af, const void *addr,
 	conn->seq_from_tap = conn->seq_init_from_tap + 1;
 	conn->seq_ack_to_tap = conn->seq_from_tap;
 
-	conn->seq_to_tap = tcp_seq_init(c, af, addr, th->dest, th->source, now);
+	tcp_seq_init(c, conn, now);
 	conn->seq_ack_from_tap = conn->seq_to_tap + 1;
 
 	tcp_hash_insert(c, conn);
@@ -2855,11 +2847,6 @@ static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref,
 
 		conn->sock_port = ntohs(sa6.sin6_port);
 		conn->tap_port = ref.r.p.tcp.tcp.index;
-
-		conn->seq_to_tap = tcp_seq_init(c, AF_INET6, &sa6.sin6_addr,
-						conn->sock_port,
-						conn->tap_port,
-						now);
 	} else {
 		struct sockaddr_in sa4;
 
@@ -2874,13 +2861,9 @@ static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref,
 
 		conn->sock_port = ntohs(sa4.sin_port);
 		conn->tap_port = ref.r.p.tcp.tcp.index;
-
-		conn->seq_to_tap = tcp_seq_init(c, AF_INET, &sa4.sin_addr,
-						conn->sock_port,
-						conn->tap_port,
-						now);
 	}
 
+	tcp_seq_init(c, conn, now);
 	tcp_hash_insert(c, conn);
 
 	conn->seq_ack_from_tap = conn->seq_to_tap + 1;
-- 
2.38.1


^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [PATCH 10/10] tcp: Fix small error in tcp_seq_init() time handling
  2022-11-04  8:43 [PATCH 00/10] RFC: Preliminaries for using share IPv4 & IPv6 sockets David Gibson
                   ` (8 preceding siblings ...)
  2022-11-04  8:43 ` [PATCH 09/10] tcp: Have tcp_seq_init() take its parameters from struct tcp_conn David Gibson
@ 2022-11-04  8:43 ` David Gibson
  2022-11-07 18:08   ` Stefano Brivio
  2022-11-04  8:47 ` [PATCH 00/10] RFC: Preliminaries for using share IPv4 & IPv6 sockets Stefano Brivio
  10 siblings, 1 reply; 22+ messages in thread
From: David Gibson @ 2022-11-04  8:43 UTC (permalink / raw)
  To: passt-dev, Stefano Brivio; +Cc: David Gibson

It looks like tcp_seq_init() is supposed to advance the sequence number
by one every 32ns.  However we only right shift the ns part of the timespec
not the seconds part, meaning that we'll advance by an extra 32 steps on
each second.

I don't know if that's exploitable in any way, but it doesn't appear to be
the intent, nor what RFC 6528 suggests.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
 tcp.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tcp.c b/tcp.c
index 59e03ff..941fafb 100644
--- a/tcp.c
+++ b/tcp.c
@@ -2027,8 +2027,8 @@ static void tcp_seq_init(const struct ctx *c, struct tcp_conn *conn,
 
 	seq = siphash_36b((uint8_t *)&in, c->tcp.hash_secret);
 
-	ns = now->tv_sec * 1E9;
-	ns += now->tv_nsec >> 5; /* 32ns ticks, overflows 32 bits every 137s */
+	/* 32ns ticks, overflows 32 bits every 137s */
+	ns = (now->tv_sec * 1E9 + now->tv_nsec) >> 5;
 
 	conn->seq_to_tap = seq + ns;
 }
-- 
@@ -2027,8 +2027,8 @@ static void tcp_seq_init(const struct ctx *c, struct tcp_conn *conn,
 
 	seq = siphash_36b((uint8_t *)&in, c->tcp.hash_secret);
 
-	ns = now->tv_sec * 1E9;
-	ns += now->tv_nsec >> 5; /* 32ns ticks, overflows 32 bits every 137s */
+	/* 32ns ticks, overflows 32 bits every 137s */
+	ns = (now->tv_sec * 1E9 + now->tv_nsec) >> 5;
 
 	conn->seq_to_tap = seq + ns;
 }
-- 
2.38.1


^ permalink raw reply related	[flat|nested] 22+ messages in thread

* Re: [PATCH 00/10] RFC: Preliminaries for using share IPv4 & IPv6 sockets
  2022-11-04  8:43 [PATCH 00/10] RFC: Preliminaries for using share IPv4 & IPv6 sockets David Gibson
                   ` (9 preceding siblings ...)
  2022-11-04  8:43 ` [PATCH 10/10] tcp: Fix small error in tcp_seq_init() time handling David Gibson
@ 2022-11-04  8:47 ` Stefano Brivio
  10 siblings, 0 replies; 22+ messages in thread
From: Stefano Brivio @ 2022-11-04  8:47 UTC (permalink / raw)
  To: David Gibson; +Cc: passt-dev

On Fri,  4 Nov 2022 19:43:23 +1100
David Gibson <david@gibson.dropbear.id.au> wrote:

>  6 files changed, 116 insertions(+), 137 deletions(-)

I have no idea yet how this is implemented, but the diffstat looks
impressive. :)

-- 
Stefano


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 01/10] tcp: no v6 flag in ref
  2022-11-04  8:43 ` [PATCH 01/10] tcp: no v6 flag in ref David Gibson
@ 2022-11-07 18:07   ` Stefano Brivio
  2022-11-08  0:35     ` David Gibson
  0 siblings, 1 reply; 22+ messages in thread
From: Stefano Brivio @ 2022-11-07 18:07 UTC (permalink / raw)
  To: David Gibson; +Cc: passt-dev

On Fri,  4 Nov 2022 19:43:24 +1100
David Gibson <david@gibson.dropbear.id.au> wrote:

> ---
>  tcp.c        |  9 ++++-----
>  tcp.h        |  1 -
>  tcp_splice.c | 13 +++++++------
>  3 files changed, 11 insertions(+), 12 deletions(-)
> 
> diff --git a/tcp.c b/tcp.c
> index 713248f..3d48d6e 100644
> --- a/tcp.c
> +++ b/tcp.c
> @@ -761,8 +761,7 @@ static int tcp_epoll_ctl(const struct ctx *c, struct tcp_conn *conn)
>  {
>  	int m = (conn->flags & IN_EPOLL) ? EPOLL_CTL_MOD : EPOLL_CTL_ADD;
>  	union epoll_ref ref = { .r.proto = IPPROTO_TCP, .r.s = conn->sock,
> -				.r.p.tcp.tcp.index = conn - tc,
> -				.r.p.tcp.tcp.v6 = CONN_V6(conn) };
> +				.r.p.tcp.tcp.index = conn - tc, };
>  	struct epoll_event ev = { .data.u64 = ref.u64 };
>  
>  	if (conn->events == CLOSED) {
> @@ -2857,6 +2856,7 @@ static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref,
>  	if (c->tcp.conn_count >= TCP_MAX_CONNS)
>  		return;
>  
> +	sa.ss_family = AF_UNSPEC; /* FIXME: stop clang-tidy complaining */

No need for /* FIXME */ here in my opinion, valgrind should also hit
this -- perhaps move to initialiser though.

>  	sl = sizeof(sa);
>  	s = accept4(ref.r.s, (struct sockaddr *)&sa, &sl, SOCK_NONBLOCK);
>  	if (s < 0)
> @@ -2868,7 +2868,7 @@ static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref,
>  	conn->ws_to_tap = conn->ws_from_tap = 0;
>  	conn_event(c, conn, SOCK_ACCEPTED);
>  
> -	if (ref.r.p.tcp.tcp.v6) {
> +	if (sa.ss_family == AF_INET6) {
>  		struct sockaddr_in6 sa6;
>  
>  		memcpy(&sa6, &sa, sizeof(sa6));
> @@ -3147,8 +3147,7 @@ static void tcp_sock_init6(const struct ctx *c, int ns,
>  			   const struct in6_addr *addr, const char *ifname,
>  			   in_port_t port)
>  {
> -	union tcp_epoll_ref tref = { .tcp.listen = 1, .tcp.outbound = ns,
> -				     .tcp.v6 = 1 };
> +	union tcp_epoll_ref tref = { .tcp.listen = 1, .tcp.outbound = ns, };

Undocumented style rule: no comma at the end, it's post-modern C anyway.

>  	bool spliced = false, tap = true;
>  	int s;
>  
> diff --git a/tcp.h b/tcp.h
> index 3fabb5a..85ac750 100644
> --- a/tcp.h
> +++ b/tcp.h
> @@ -45,7 +45,6 @@ union tcp_epoll_ref {
>  		uint32_t	listen:1,
>  				splice:1,
>  				outbound:1,
> -				v6:1,

Don't forget to update the comment above.

>  				timer:1,
>  				index:20;
>  	} tcp;
> diff --git a/tcp_splice.c b/tcp_splice.c
> index 99c5fa7..1f26ff4 100644
> --- a/tcp_splice.c
> +++ b/tcp_splice.c
> @@ -211,12 +211,10 @@ static int tcp_splice_epoll_ctl(const struct ctx *c,
>  	int m = (conn->flags & IN_EPOLL) ? EPOLL_CTL_MOD : EPOLL_CTL_ADD;
>  	union epoll_ref ref_a = { .r.proto = IPPROTO_TCP, .r.s = conn->a,
>  				  .r.p.tcp.tcp.splice = 1,
> -				  .r.p.tcp.tcp.index = conn - tc,
> -				  .r.p.tcp.tcp.v6 = CONN_V6(conn) };
> +				  .r.p.tcp.tcp.index = conn - tc, };
>  	union epoll_ref ref_b = { .r.proto = IPPROTO_TCP, .r.s = conn->b,
>  				  .r.p.tcp.tcp.splice = 1,
> -				  .r.p.tcp.tcp.index = conn - tc,
> -				  .r.p.tcp.tcp.v6 = CONN_V6(conn) };
> +				  .r.p.tcp.tcp.index = conn - tc, };

Commas.

>  	struct epoll_event ev_a = { .data.u64 = ref_a.u64 };
>  	struct epoll_event ev_b = { .data.u64 = ref_b.u64 };
>  	uint32_t events_a, events_b;
> @@ -579,12 +577,15 @@ void tcp_sock_handler_splice(struct ctx *c, union epoll_ref ref,
>  	struct tcp_splice_conn *conn;
>  
>  	if (ref.r.p.tcp.tcp.listen) {
> +		struct sockaddr sa;

Should this be sockaddr_storage in this case and then cast? I never
remember.

> +		socklen_t sl = sizeof(sa);
>  		int s;
>  
>  		if (c->tcp.splice_conn_count >= TCP_SPLICE_MAX_CONNS)
>  			return;
>  
> -		if ((s = accept4(ref.r.s, NULL, NULL, SOCK_NONBLOCK)) < 0)
> +		sa.sa_family = AF_UNSPEC; /* FIXME: stop clang-tidy complaining */
> +		if ((s = accept4(ref.r.s, &sa, &sl, SOCK_NONBLOCK)) < 0)

Same comment about /* FIXME */ as above.

This adds a copy_to_user() and microseconds, but it's unavoidable. I
tried harder to optimise for latency on the spliced path, that's why it
was NULL here and not above.

>  			return;
>  
>  		if (setsockopt(s, SOL_TCP, TCP_QUICKACK, &((int){ 1 }),
> @@ -595,7 +596,7 @@ void tcp_sock_handler_splice(struct ctx *c, union epoll_ref ref,
>  
>  		conn = CONN(c->tcp.splice_conn_count++);
>  		conn->a = s;
> -		conn->flags = ref.r.p.tcp.tcp.v6 ? SOCK_V6 : 0;
> +		conn->flags = (sa.sa_family == AF_INET6) ? SOCK_V6 : 0;
>  
>  		if (tcp_splice_new(c, conn, ref.r.p.tcp.tcp.index,
>  				   ref.r.p.tcp.tcp.outbound))

-- 
Stefano


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 02/10] tcp: Helper to encode IPv4-mapped IPv6 addresses
  2022-11-04  8:43 ` [PATCH 02/10] tcp: Helper to encode IPv4-mapped IPv6 addresses David Gibson
@ 2022-11-07 18:08   ` Stefano Brivio
  2022-11-08  0:46     ` David Gibson
  0 siblings, 1 reply; 22+ messages in thread
From: Stefano Brivio @ 2022-11-07 18:08 UTC (permalink / raw)
  To: David Gibson; +Cc: passt-dev

On Fri,  4 Nov 2022 19:43:25 +1100
David Gibson <david@gibson.dropbear.id.au> wrote:

> In the tcp_conn structure we have space for an IPv6 address, and use
> IPv4-mapped IPv6 addresses to describe IPv4 connections.  We open code
> the construction of those IPv4-mapped address in two places.
> 
> Avoid the duplication with a helper function.

Much nicer, indeed.

> Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
> ---
>  tcp.c  |  9 ++-------
>  util.c | 20 ++++++++++++++++++++
>  util.h |  1 +
>  3 files changed, 23 insertions(+), 7 deletions(-)
> 
> diff --git a/tcp.c b/tcp.c
> index 3d48d6e..26dd268 100644
> --- a/tcp.c
> +++ b/tcp.c
> @@ -2217,9 +2217,7 @@ static void tcp_conn_from_tap(struct ctx *c, int af, const void *addr,
>  		sa = (struct sockaddr *)&addr4;
>  		sl = sizeof(addr4);
>  
> -		memset(&conn->a.a4.zero, 0,    sizeof(conn->a.a4.zero));
> -		memset(&conn->a.a4.one,  0xff, sizeof(conn->a.a4.one));
> -		memcpy(&conn->a.a4.a,    addr, sizeof(conn->a.a4.a));
> +		encode_ip4mapped_ip6(&conn->a.a6, addr);
>  	} else {
>  		sa = (struct sockaddr *)&addr6;
>  		sl = sizeof(addr6);
> @@ -2902,15 +2900,12 @@ static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref,
>  
>  		memcpy(&sa4, &sa, sizeof(sa4));
>  
> -		memset(&conn->a.a4.zero,   0, sizeof(conn->a.a4.zero));
> -		memset(&conn->a.a4.one, 0xff, sizeof(conn->a.a4.one));
> -
>  		if (IN4_IS_ADDR_LOOPBACK(&sa4.sin_addr) ||
>  		    IN4_IS_ADDR_UNSPECIFIED(&sa4.sin_addr) ||
>  		    IN4_ARE_ADDR_EQUAL(&sa4.sin_addr, &c->ip4.addr_seen))
>  			sa4.sin_addr = c->ip4.gw;
>  
> -		conn->a.a4.a = sa4.sin_addr;
> +		encode_ip4mapped_ip6(&conn->a.a6, &sa4.sin_addr);
>  
>  		conn->sock_port = ntohs(sa4.sin_port);
>  		conn->tap_port = ref.r.p.tcp.tcp.index;
> diff --git a/util.c b/util.c
> index 514bd44..257d0b6 100644
> --- a/util.c
> +++ b/util.c
> @@ -482,3 +482,23 @@ int write_file(const char *path, const char *buf)
>  	close(fd);
>  	return len == 0 ? 0 : -1;
>  }
> +
> +struct ip4mapped_ip6 {
> +	uint8_t zero[10];
> +	uint8_t one[2];
> +	struct in_addr a4;
> +};

Document fields even if they're obvious. Can we reuse this part in
struct conn instead of using struct in6_addr there as you do later in
7/10? I don't have a strong preference though.

> +
> +/**
> + * encode_ip4mapped_ip6() - Convert an IPv4 address to an IPv4-mapped IPv6 address
> + * @ip6:	Buffer to store the IPv4-mapped IPv6 address
> + * @ip4:	IPv4 address, network order
> + */
> +void encode_ip4mapped_ip6(struct in6_addr *ip6, const void *ip4)
> +{
> +	struct ip4mapped_ip6 *a = (struct ip4mapped_ip6 *)ip6;
> +
> +	memset(&a->zero, 0, sizeof(a->zero));
> +	memset(&a->one, 0xff, sizeof(a->one));
> +	memcpy(&a->a4, ip4, sizeof(a->a4));
> +}
> diff --git a/util.h b/util.h
> index 2d4e1ff..f7d6a6f 100644
> --- a/util.h
> +++ b/util.h
> @@ -209,5 +209,6 @@ void write_pidfile(int fd, pid_t pid);
>  int __daemon(int pidfile_fd, int devnull_fd);
>  int fls(unsigned long x);
>  int write_file(const char *path, const char *buf);
> +void encode_ip4mapped_ip6(struct in6_addr *ip6, const void *ip4);
>  
>  #endif /* UTIL_H */

-- 
Stefano


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 03/10] tcp: Partially unify IPv4 and IPv6 paths in tcp_hash_match()
  2022-11-04  8:43 ` [PATCH 03/10] tcp: Partially unify IPv4 and IPv6 paths in tcp_hash_match() David Gibson
@ 2022-11-07 18:08   ` Stefano Brivio
  2022-11-08  0:51     ` David Gibson
  0 siblings, 1 reply; 22+ messages in thread
From: Stefano Brivio @ 2022-11-07 18:08 UTC (permalink / raw)
  To: David Gibson; +Cc: passt-dev

On Fri,  4 Nov 2022 19:43:26 +1100
David Gibson <david@gibson.dropbear.id.au> wrote:

> When given an IPv4 address tcp_hash_match() checks if the connection
> stores an IPv4-mapped IPv6 address, and if so compares the mapped part of
> that address to the given address.  This is equivalent to converting a
> given IPv4 address to an IPv4-mapped IPv6 address then comparing it to the
> connection address using the existing IPv6 logic.
> 
> Convert to this slightly simpler form, which will also allow some further
> simplifications in future.
> 
> Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
> ---
>  tcp.c | 13 +++++++------
>  1 file changed, 7 insertions(+), 6 deletions(-)
> 
> diff --git a/tcp.c b/tcp.c
> index 26dd268..508d6e9 100644
> --- a/tcp.c
> +++ b/tcp.c
> @@ -1285,13 +1285,14 @@ static int tcp_opt_get(const char *opts, size_t len, uint8_t type_find,
>  static int tcp_hash_match(const struct tcp_conn *conn, int af, const void *addr,
>  			  in_port_t tap_port, in_port_t sock_port)
>  {
> -	if (af == AF_INET && CONN_V4(conn)			&&
> -	    !memcmp(&conn->a.a4.a, addr, sizeof(conn->a.a4.a))	&&
> -	    conn->tap_port == tap_port && conn->sock_port == sock_port)
> -		return 1;
> +	struct in6_addr v4mapped;
> +
> +	if (af == AF_INET) {
> +		encode_ip4mapped_ip6(&v4mapped, addr);
> +		addr = &v4mapped;
> +	}

It's probably worth the simplification, just mind that this replaces a
32-bit comparison with 128 bits of writes.

>  
> -	if (af == AF_INET6					&&
> -	    IN6_ARE_ADDR_EQUAL(&conn->a.a6, addr)		&&
> +	if (IN6_ARE_ADDR_EQUAL(&conn->a.a6, addr)		&&
>  	    conn->tap_port == tap_port && conn->sock_port == sock_port)
>  		return 1;
>  

-- 
Stefano


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 07/10] tcp: Remove ugly address union from struct tcp_conn
  2022-11-04  8:43 ` [PATCH 07/10] tcp: Remove ugly address union from struct tcp_conn David Gibson
@ 2022-11-07 18:08   ` Stefano Brivio
  2022-11-08  0:54     ` David Gibson
  0 siblings, 1 reply; 22+ messages in thread
From: Stefano Brivio @ 2022-11-07 18:08 UTC (permalink / raw)
  To: David Gibson; +Cc: passt-dev

On Fri,  4 Nov 2022 19:43:30 +1100
David Gibson <david@gibson.dropbear.id.au> wrote:

> We now only extract the v4 part of the tcp_conn::a union in one place.
> Make this neater by using a helper to extract the IPv4 address from the
> mapped address in that place.
> 
> Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
> ---
>  tcp.c  | 40 +++++++++++++++-------------------------
>  util.c | 13 +++++++++++++
>  util.h |  1 +
>  3 files changed, 29 insertions(+), 25 deletions(-)
> 
> diff --git a/tcp.c b/tcp.c
> index 3816a1c..6634abb 100644
> --- a/tcp.c
> +++ b/tcp.c
> @@ -416,10 +416,7 @@ struct tcp6_l2_head {	/* For MSS6 macro: keep in sync with tcp6_l2_buf_t */
>   * @ws_to_tap:		Window scaling factor advertised to tap/guest
>   * @sndbuf:		Sending buffer in kernel, rounded to 2 ^ SNDBUF_BITS
>   * @seq_dup_ack_approx:	Last duplicate ACK number sent to tap
> - * @a.a6:		IPv6 remote address, can be IPv4-mapped
> - * @a.a4.zero:		Zero prefix for IPv4-mapped, see RFC 6890, Table 20
> - * @a.a4.one:		Ones prefix for IPv4-mapped
> - * @a.a4.a:		IPv4 address
> + * @addr:		IPv6 remote address, can be IPv4-mapped
>   * @tap_port:		Guest-facing tap port
>   * @sock_port:		Remote, socket-facing port
>   * @wnd_from_tap:	Last window size from tap, unscaled (as received)
> @@ -489,15 +486,8 @@ struct tcp_conn {
>  	uint8_t		seq_dup_ack_approx;
>  
>  
> -	union {
> -		struct in6_addr a6;
> -		struct {
> -			uint8_t zero[10];
> -			uint8_t one[2];
> -			struct in_addr a;
> -		} a4;
> -	} a;
> -#define CONN_V4(conn)		IN6_IS_ADDR_V4MAPPED(&conn->a.a6)
> +	struct in6_addr addr;
> +#define CONN_V4(conn)		IN6_IS_ADDR_V4MAPPED(&conn->addr)
>  #define CONN_V6(conn)		(!CONN_V4(conn))
>  
>  	in_port_t	tap_port;
> @@ -960,7 +950,7 @@ static int tcp_rtt_dst_low(const struct tcp_conn *conn)
>  	int i;
>  
>  	for (i = 0; i < LOW_RTT_TABLE_SIZE; i++)
> -		if (IN6_ARE_ADDR_EQUAL(&conn->a.a6, low_rtt_dst + i))
> +		if (IN6_ARE_ADDR_EQUAL(&conn->addr, low_rtt_dst + i))
>  			return 1;
>  
>  	return 0;
> @@ -982,7 +972,7 @@ static void tcp_rtt_dst_check(const struct tcp_conn *conn,
>  		return;
>  
>  	for (i = 0; i < LOW_RTT_TABLE_SIZE; i++) {
> -		if (IN6_ARE_ADDR_EQUAL(&conn->a.a6, low_rtt_dst + i))
> +		if (IN6_ARE_ADDR_EQUAL(&conn->addr, low_rtt_dst + i))
>  			return;
>  		if (hole == -1 && IN6_IS_ADDR_UNSPECIFIED(low_rtt_dst + i))
>  			hole = i;
> @@ -994,10 +984,10 @@ static void tcp_rtt_dst_check(const struct tcp_conn *conn,
>  	if (hole == -1)
>  		return;
>  
> -	memcpy(low_rtt_dst + hole++, &conn->a.a6, sizeof(conn->a.a6));
> +	memcpy(low_rtt_dst + hole++, &conn->addr, sizeof(conn->addr));
>  	if (hole == LOW_RTT_TABLE_SIZE)
>  		hole = 0;
> -	memcpy(low_rtt_dst + hole, &in6addr_any, sizeof(conn->a.a6));
> +	memcpy(low_rtt_dst + hole, &in6addr_any, sizeof(conn->addr));
>  #else
>  	(void)conn;
>  	(void)tinfo;
> @@ -1285,7 +1275,7 @@ static int tcp_hash_match(const struct tcp_conn *conn,
>  			  const struct in6_addr *addr,
>  			  in_port_t tap_port, in_port_t sock_port)
>  {
> -	if (IN6_ARE_ADDR_EQUAL(&conn->a.a6, addr)		&&
> +	if (IN6_ARE_ADDR_EQUAL(&conn->addr, addr)		&&
>  	    conn->tap_port == tap_port && conn->sock_port == sock_port)
>  		return 1;
>  
> @@ -1330,7 +1320,7 @@ static void tcp_hash_insert(const struct ctx *c, struct tcp_conn *conn)
>  {
>  	int b;
>  
> -	b = tcp_hash(c, &conn->a.a6, conn->tap_port, conn->sock_port);
> +	b = tcp_hash(c, &conn->addr, conn->tap_port, conn->sock_port);
>  	conn->next_index = tc_hash[b] ? tc_hash[b] - tc : -1;
>  	tc_hash[b] = conn;
>  	conn->hash_bucket = b;
> @@ -1660,7 +1650,7 @@ do {									\
>  		ip_len = plen + sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
>  
>  		b->ip6h.payload_len = htons(plen + sizeof(struct tcphdr));
> -		b->ip6h.saddr = conn->a.a6;
> +		b->ip6h.saddr = conn->addr;
>  		if (IN6_IS_ADDR_LINKLOCAL(&b->ip6h.saddr))
>  			b->ip6h.daddr = c->ip6.addr_ll_seen;
>  		else
> @@ -1684,7 +1674,7 @@ do {									\
>  
>  		ip_len = plen + sizeof(struct iphdr) + sizeof(struct tcphdr);
>  		b->iph.tot_len = htons(ip_len);
> -		b->iph.saddr = conn->a.a4.a.s_addr;
> +		b->iph.saddr = decode_ip4mapped_ip6(&conn->addr).s_addr;
>  		b->iph.daddr = c->ip4.addr_seen.s_addr;
>  
>  		if (check)
> @@ -2202,12 +2192,12 @@ static void tcp_conn_from_tap(struct ctx *c, int af, const void *addr,
>  		sa = (struct sockaddr *)&addr4;
>  		sl = sizeof(addr4);
>  
> -		encode_ip4mapped_ip6(&conn->a.a6, addr);
> +		encode_ip4mapped_ip6(&conn->addr, addr);
>  	} else {
>  		sa = (struct sockaddr *)&addr6;
>  		sl = sizeof(addr6);
>  
> -		memcpy(&conn->a.a6,      addr, sizeof(conn->a.a6));
> +		memcpy(&conn->addr,      addr, sizeof(conn->addr));
>  	}
>  
>  	conn->sock_port = ntohs(th->dest);
> @@ -2869,7 +2859,7 @@ static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref,
>  			memcpy(&sa6.sin6_addr, src, sizeof(*src));
>  		}
>  
> -		memcpy(&conn->a.a6, &sa6.sin6_addr, sizeof(conn->a.a6));
> +		memcpy(&conn->addr, &sa6.sin6_addr, sizeof(conn->addr));
>  
>  		conn->sock_port = ntohs(sa6.sin6_port);
>  		conn->tap_port = ref.r.p.tcp.tcp.index;
> @@ -2888,7 +2878,7 @@ static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref,
>  		    IN4_ARE_ADDR_EQUAL(&sa4.sin_addr, &c->ip4.addr_seen))
>  			sa4.sin_addr = c->ip4.gw;
>  
> -		encode_ip4mapped_ip6(&conn->a.a6, &sa4.sin_addr);
> +		encode_ip4mapped_ip6(&conn->addr, &sa4.sin_addr);
>  
>  		conn->sock_port = ntohs(sa4.sin_port);
>  		conn->tap_port = ref.r.p.tcp.tcp.index;
> diff --git a/util.c b/util.c
> index 257d0b6..ec7f57f 100644
> --- a/util.c
> +++ b/util.c
> @@ -502,3 +502,16 @@ void encode_ip4mapped_ip6(struct in6_addr *ip6, const void *ip4)
>  	memset(&a->one, 0xff, sizeof(a->one));
>  	memcpy(&a->a4, ip4, sizeof(a->a4));
>  }
> +
> +/**
> + * decode_ip4mapped_ip6() - Decode an IPv4-mapped IPv6 address
> + * @ip6:	IPv6 address
> + *
> + * Returns:	IPv4 address

Return: IPv4 address

> + */
> +struct in_addr decode_ip4mapped_ip6(const struct in6_addr *ip6)
> +{
> +	const struct ip4mapped_ip6 *a = (const struct ip4mapped_ip6 *)ip6;

...if you use struct ip4_mapped_ip6 directly in struct conn, you could
drop this cast, and perhaps this helper too, but I'm not sure if it has
other implications.

> +
> +	return a->a4;
> +}
> diff --git a/util.h b/util.h
> index f7d6a6f..103211d 100644
> --- a/util.h
> +++ b/util.h
> @@ -210,5 +210,6 @@ int __daemon(int pidfile_fd, int devnull_fd);
>  int fls(unsigned long x);
>  int write_file(const char *path, const char *buf);
>  void encode_ip4mapped_ip6(struct in6_addr *ip6, const void *ip4);
> +struct in_addr decode_ip4mapped_ip6(const struct in6_addr *ip6);
>  
>  #endif /* UTIL_H */

-- 
Stefano


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 10/10] tcp: Fix small error in tcp_seq_init() time handling
  2022-11-04  8:43 ` [PATCH 10/10] tcp: Fix small error in tcp_seq_init() time handling David Gibson
@ 2022-11-07 18:08   ` Stefano Brivio
  2022-11-08  0:59     ` David Gibson
  0 siblings, 1 reply; 22+ messages in thread
From: Stefano Brivio @ 2022-11-07 18:08 UTC (permalink / raw)
  To: David Gibson; +Cc: passt-dev

On Fri,  4 Nov 2022 19:43:33 +1100
David Gibson <david@gibson.dropbear.id.au> wrote:

> It looks like tcp_seq_init() is supposed to advance the sequence number
> by one every 32ns.  However we only right shift the ns part of the timespec
> not the seconds part, meaning that we'll advance by an extra 32 steps on
> each second.
> 
> I don't know if that's exploitable in any way, but it doesn't appear to be
> the intent, nor what RFC 6528 suggests.

Oh, oops, nice catch.

Well, as long as the step, modulo 32 bits, is not 0, it's still
arguably the 250 KHz / 4 µs period clock from RFC 793, so there's no
practical difference (other than the overflow period). See also the
note in RFC 1948:

   More precisely, RFC 793 specifies that the 32-bit counter be
   incremented by 1 in the low-order position about every 4
   microseconds.  Instead, Berkeley-derived kernels increment it by a
   constant every second [...]

I kind of wonder if adding a time non-linearity like the unintended one
I added actually increases entropy.

But indeed ~4 seconds overflow is also not intended, and we should just
stick to RFC 6528.

> Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
> ---
>  tcp.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/tcp.c b/tcp.c
> index 59e03ff..941fafb 100644
> --- a/tcp.c
> +++ b/tcp.c
> @@ -2027,8 +2027,8 @@ static void tcp_seq_init(const struct ctx *c, struct tcp_conn *conn,
>  
>  	seq = siphash_36b((uint8_t *)&in, c->tcp.hash_secret);
>  
> -	ns = now->tv_sec * 1E9;
> -	ns += now->tv_nsec >> 5; /* 32ns ticks, overflows 32 bits every 137s */
> +	/* 32ns ticks, overflows 32 bits every 137s */
> +	ns = (now->tv_sec * 1E9 + now->tv_nsec) >> 5;
>  
>  	conn->seq_to_tap = seq + ns;
>  }

-- 
Stefano


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 01/10] tcp: no v6 flag in ref
  2022-11-07 18:07   ` Stefano Brivio
@ 2022-11-08  0:35     ` David Gibson
  0 siblings, 0 replies; 22+ messages in thread
From: David Gibson @ 2022-11-08  0:35 UTC (permalink / raw)
  To: Stefano Brivio; +Cc: passt-dev

[-- Attachment #1: Type: text/plain, Size: 6087 bytes --]

On Mon, Nov 07, 2022 at 07:07:53PM +0100, Stefano Brivio wrote:
> On Fri,  4 Nov 2022 19:43:24 +1100
> David Gibson <david@gibson.dropbear.id.au> wrote:
> 
> > ---
> >  tcp.c        |  9 ++++-----
> >  tcp.h        |  1 -
> >  tcp_splice.c | 13 +++++++------
> >  3 files changed, 11 insertions(+), 12 deletions(-)
> > 
> > diff --git a/tcp.c b/tcp.c
> > index 713248f..3d48d6e 100644
> > --- a/tcp.c
> > +++ b/tcp.c
> > @@ -761,8 +761,7 @@ static int tcp_epoll_ctl(const struct ctx *c, struct tcp_conn *conn)
> >  {
> >  	int m = (conn->flags & IN_EPOLL) ? EPOLL_CTL_MOD : EPOLL_CTL_ADD;
> >  	union epoll_ref ref = { .r.proto = IPPROTO_TCP, .r.s = conn->sock,
> > -				.r.p.tcp.tcp.index = conn - tc,
> > -				.r.p.tcp.tcp.v6 = CONN_V6(conn) };
> > +				.r.p.tcp.tcp.index = conn - tc, };
> >  	struct epoll_event ev = { .data.u64 = ref.u64 };
> >  
> >  	if (conn->events == CLOSED) {
> > @@ -2857,6 +2856,7 @@ static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref,
> >  	if (c->tcp.conn_count >= TCP_MAX_CONNS)
> >  		return;
> >  
> > +	sa.ss_family = AF_UNSPEC; /* FIXME: stop clang-tidy complaining */
> 
> No need for /* FIXME */ here in my opinion,

"FIXME" maybe isn't quite the right sentiment.  The issue here is I
don't think this should be necessary.  AFAIK sa.ss_family will be
written on any succesful return from accept4(), it's just the
clang-tidy doesn't seem to know that.

> valgrind should also hit
> this

I'm not sure.  Depends whether valgrind's knowledge of syscall
behaviours has the same bug or not.

> -- perhaps move to initialiser though.
> 
> >  	sl = sizeof(sa);
> >  	s = accept4(ref.r.s, (struct sockaddr *)&sa, &sl, SOCK_NONBLOCK);
> >  	if (s < 0)
> > @@ -2868,7 +2868,7 @@ static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref,
> >  	conn->ws_to_tap = conn->ws_from_tap = 0;
> >  	conn_event(c, conn, SOCK_ACCEPTED);
> >  
> > -	if (ref.r.p.tcp.tcp.v6) {
> > +	if (sa.ss_family == AF_INET6) {
> >  		struct sockaddr_in6 sa6;
> >  
> >  		memcpy(&sa6, &sa, sizeof(sa6));
> > @@ -3147,8 +3147,7 @@ static void tcp_sock_init6(const struct ctx *c, int ns,
> >  			   const struct in6_addr *addr, const char *ifname,
> >  			   in_port_t port)
> >  {
> > -	union tcp_epoll_ref tref = { .tcp.listen = 1, .tcp.outbound = ns,
> > -				     .tcp.v6 = 1 };
> > +	union tcp_epoll_ref tref = { .tcp.listen = 1, .tcp.outbound = ns, };
> 
> Undocumented style rule: no comma at the end, it's post-modern C anyway.

Fixed.

> >  	bool spliced = false, tap = true;
> >  	int s;
> >  
> > diff --git a/tcp.h b/tcp.h
> > index 3fabb5a..85ac750 100644
> > --- a/tcp.h
> > +++ b/tcp.h
> > @@ -45,7 +45,6 @@ union tcp_epoll_ref {
> >  		uint32_t	listen:1,
> >  				splice:1,
> >  				outbound:1,
> > -				v6:1,
> 
> Don't forget to update the comment above.

Fixed.

> >  				timer:1,
> >  				index:20;
> >  	} tcp;
> > diff --git a/tcp_splice.c b/tcp_splice.c
> > index 99c5fa7..1f26ff4 100644
> > --- a/tcp_splice.c
> > +++ b/tcp_splice.c
> > @@ -211,12 +211,10 @@ static int tcp_splice_epoll_ctl(const struct ctx *c,
> >  	int m = (conn->flags & IN_EPOLL) ? EPOLL_CTL_MOD : EPOLL_CTL_ADD;
> >  	union epoll_ref ref_a = { .r.proto = IPPROTO_TCP, .r.s = conn->a,
> >  				  .r.p.tcp.tcp.splice = 1,
> > -				  .r.p.tcp.tcp.index = conn - tc,
> > -				  .r.p.tcp.tcp.v6 = CONN_V6(conn) };
> > +				  .r.p.tcp.tcp.index = conn - tc, };
> >  	union epoll_ref ref_b = { .r.proto = IPPROTO_TCP, .r.s = conn->b,
> >  				  .r.p.tcp.tcp.splice = 1,
> > -				  .r.p.tcp.tcp.index = conn - tc,
> > -				  .r.p.tcp.tcp.v6 = CONN_V6(conn) };
> > +				  .r.p.tcp.tcp.index = conn - tc, };
> 
> Commas.

Fixed.

> 
> >  	struct epoll_event ev_a = { .data.u64 = ref_a.u64 };
> >  	struct epoll_event ev_b = { .data.u64 = ref_b.u64 };
> >  	uint32_t events_a, events_b;
> > @@ -579,12 +577,15 @@ void tcp_sock_handler_splice(struct ctx *c, union epoll_ref ref,
> >  	struct tcp_splice_conn *conn;
> >  
> >  	if (ref.r.p.tcp.tcp.listen) {
> > +		struct sockaddr sa;
> 
> Should this be sockaddr_storage in this case and then cast? I never
> remember.

So, for this patch, no it doesn't, but it will have to change later,
in fact.  The trick here is that we only want the sa_family field, we
don't need the full socket address.  struct sockaddr has that (and
maybe a little extra, but not much).  accept4() will truncate the
socket address when it writes, but that's ok for our purposes.  Using
the short form may mitigate the concern below because there's less
data to copy_to_user().

Of course, when I change this later to add different handling for
IPv4-mapped IPv6 source addresses we will need the full address, so
this will need to be a sockaddr_in6.

> > +		socklen_t sl = sizeof(sa);
> >  		int s;
> >  
> >  		if (c->tcp.splice_conn_count >= TCP_SPLICE_MAX_CONNS)
> >  			return;
> >  
> > -		if ((s = accept4(ref.r.s, NULL, NULL, SOCK_NONBLOCK)) < 0)
> > +		sa.sa_family = AF_UNSPEC; /* FIXME: stop clang-tidy complaining */
> > +		if ((s = accept4(ref.r.s, &sa, &sl, SOCK_NONBLOCK)) < 0)
> 
> Same comment about /* FIXME */ as above.
> 
> This adds a copy_to_user() and microseconds, but it's unavoidable. I
> tried harder to optimise for latency on the spliced path, that's why it
> was NULL here and not above.
> 
> >  			return;
> >  
> >  		if (setsockopt(s, SOL_TCP, TCP_QUICKACK, &((int){ 1 }),
> > @@ -595,7 +596,7 @@ void tcp_sock_handler_splice(struct ctx *c, union epoll_ref ref,
> >  
> >  		conn = CONN(c->tcp.splice_conn_count++);
> >  		conn->a = s;
> > -		conn->flags = ref.r.p.tcp.tcp.v6 ? SOCK_V6 : 0;
> > +		conn->flags = (sa.sa_family == AF_INET6) ? SOCK_V6 : 0;
> >  
> >  		if (tcp_splice_new(c, conn, ref.r.p.tcp.tcp.index,
> >  				   ref.r.p.tcp.tcp.outbound))
> 

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 02/10] tcp: Helper to encode IPv4-mapped IPv6 addresses
  2022-11-07 18:08   ` Stefano Brivio
@ 2022-11-08  0:46     ` David Gibson
  0 siblings, 0 replies; 22+ messages in thread
From: David Gibson @ 2022-11-08  0:46 UTC (permalink / raw)
  To: Stefano Brivio; +Cc: passt-dev

[-- Attachment #1: Type: text/plain, Size: 4059 bytes --]

On Mon, Nov 07, 2022 at 07:08:19PM +0100, Stefano Brivio wrote:
> On Fri,  4 Nov 2022 19:43:25 +1100
> David Gibson <david@gibson.dropbear.id.au> wrote:
> 
> > In the tcp_conn structure we have space for an IPv6 address, and use
> > IPv4-mapped IPv6 addresses to describe IPv4 connections.  We open code
> > the construction of those IPv4-mapped address in two places.
> > 
> > Avoid the duplication with a helper function.
> 
> Much nicer, indeed.
> 
> > Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
> > ---
> >  tcp.c  |  9 ++-------
> >  util.c | 20 ++++++++++++++++++++
> >  util.h |  1 +
> >  3 files changed, 23 insertions(+), 7 deletions(-)
> > 
> > diff --git a/tcp.c b/tcp.c
> > index 3d48d6e..26dd268 100644
> > --- a/tcp.c
> > +++ b/tcp.c
> > @@ -2217,9 +2217,7 @@ static void tcp_conn_from_tap(struct ctx *c, int af, const void *addr,
> >  		sa = (struct sockaddr *)&addr4;
> >  		sl = sizeof(addr4);
> >  
> > -		memset(&conn->a.a4.zero, 0,    sizeof(conn->a.a4.zero));
> > -		memset(&conn->a.a4.one,  0xff, sizeof(conn->a.a4.one));
> > -		memcpy(&conn->a.a4.a,    addr, sizeof(conn->a.a4.a));
> > +		encode_ip4mapped_ip6(&conn->a.a6, addr);
> >  	} else {
> >  		sa = (struct sockaddr *)&addr6;
> >  		sl = sizeof(addr6);
> > @@ -2902,15 +2900,12 @@ static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref,
> >  
> >  		memcpy(&sa4, &sa, sizeof(sa4));
> >  
> > -		memset(&conn->a.a4.zero,   0, sizeof(conn->a.a4.zero));
> > -		memset(&conn->a.a4.one, 0xff, sizeof(conn->a.a4.one));
> > -
> >  		if (IN4_IS_ADDR_LOOPBACK(&sa4.sin_addr) ||
> >  		    IN4_IS_ADDR_UNSPECIFIED(&sa4.sin_addr) ||
> >  		    IN4_ARE_ADDR_EQUAL(&sa4.sin_addr, &c->ip4.addr_seen))
> >  			sa4.sin_addr = c->ip4.gw;
> >  
> > -		conn->a.a4.a = sa4.sin_addr;
> > +		encode_ip4mapped_ip6(&conn->a.a6, &sa4.sin_addr);
> >  
> >  		conn->sock_port = ntohs(sa4.sin_port);
> >  		conn->tap_port = ref.r.p.tcp.tcp.index;
> > diff --git a/util.c b/util.c
> > index 514bd44..257d0b6 100644
> > --- a/util.c
> > +++ b/util.c
> > @@ -482,3 +482,23 @@ int write_file(const char *path, const char *buf)
> >  	close(fd);
> >  	return len == 0 ? 0 : -1;
> >  }
> > +
> > +struct ip4mapped_ip6 {
> > +	uint8_t zero[10];
> > +	uint8_t one[2];
> > +	struct in_addr a4;
> > +};
> 
> Document fields even if they're obvious. Can we reuse this part in
> struct conn instead of using struct in6_addr there as you do later in
> 7/10? I don't have a strong preference though.

Yeah, we could.  Originally I was trying to make this even more
hidden, with the decode function returning a struct in_addr *, that
would be NULL if the IPv6 address wasn't v4-mapped.  Unfortunately I
seemed to hit more weird compiler aliasing issues that caused hard to
predict failures.

Hrm... your comment gives me some thoughts on maybe a better way to do
this, I'll look into it.

> 
> > +
> > +/**
> > + * encode_ip4mapped_ip6() - Convert an IPv4 address to an IPv4-mapped IPv6 address
> > + * @ip6:	Buffer to store the IPv4-mapped IPv6 address
> > + * @ip4:	IPv4 address, network order
> > + */
> > +void encode_ip4mapped_ip6(struct in6_addr *ip6, const void *ip4)
> > +{
> > +	struct ip4mapped_ip6 *a = (struct ip4mapped_ip6 *)ip6;
> > +
> > +	memset(&a->zero, 0, sizeof(a->zero));
> > +	memset(&a->one, 0xff, sizeof(a->one));
> > +	memcpy(&a->a4, ip4, sizeof(a->a4));
> > +}
> > diff --git a/util.h b/util.h
> > index 2d4e1ff..f7d6a6f 100644
> > --- a/util.h
> > +++ b/util.h
> > @@ -209,5 +209,6 @@ void write_pidfile(int fd, pid_t pid);
> >  int __daemon(int pidfile_fd, int devnull_fd);
> >  int fls(unsigned long x);
> >  int write_file(const char *path, const char *buf);
> > +void encode_ip4mapped_ip6(struct in6_addr *ip6, const void *ip4);
> >  
> >  #endif /* UTIL_H */
> 

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 03/10] tcp: Partially unify IPv4 and IPv6 paths in tcp_hash_match()
  2022-11-07 18:08   ` Stefano Brivio
@ 2022-11-08  0:51     ` David Gibson
  0 siblings, 0 replies; 22+ messages in thread
From: David Gibson @ 2022-11-08  0:51 UTC (permalink / raw)
  To: Stefano Brivio; +Cc: passt-dev

[-- Attachment #1: Type: text/plain, Size: 2467 bytes --]

On Mon, Nov 07, 2022 at 07:08:27PM +0100, Stefano Brivio wrote:
> On Fri,  4 Nov 2022 19:43:26 +1100
> David Gibson <david@gibson.dropbear.id.au> wrote:
> 
> > When given an IPv4 address tcp_hash_match() checks if the connection
> > stores an IPv4-mapped IPv6 address, and if so compares the mapped part of
> > that address to the given address.  This is equivalent to converting a
> > given IPv4 address to an IPv4-mapped IPv6 address then comparing it to the
> > connection address using the existing IPv6 logic.
> > 
> > Convert to this slightly simpler form, which will also allow some further
> > simplifications in future.
> > 
> > Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
> > ---
> >  tcp.c | 13 +++++++------
> >  1 file changed, 7 insertions(+), 6 deletions(-)
> > 
> > diff --git a/tcp.c b/tcp.c
> > index 26dd268..508d6e9 100644
> > --- a/tcp.c
> > +++ b/tcp.c
> > @@ -1285,13 +1285,14 @@ static int tcp_opt_get(const char *opts, size_t len, uint8_t type_find,
> >  static int tcp_hash_match(const struct tcp_conn *conn, int af, const void *addr,
> >  			  in_port_t tap_port, in_port_t sock_port)
> >  {
> > -	if (af == AF_INET && CONN_V4(conn)			&&
> > -	    !memcmp(&conn->a.a4.a, addr, sizeof(conn->a.a4.a))	&&
> > -	    conn->tap_port == tap_port && conn->sock_port == sock_port)
> > -		return 1;
> > +	struct in6_addr v4mapped;
> > +
> > +	if (af == AF_INET) {
> > +		encode_ip4mapped_ip6(&v4mapped, addr);
> > +		addr = &v4mapped;
> > +	}
> 
> It's probably worth the simplification, just mind that this replaces a
> 32-bit comparison with 128 bits of writes.

I'm aware, however this is mitigated later in the series.  Once I make
tcp_hash_match() take a (possibly v4mapped) v6 address only it's just
a 128-bit comparison, which we already effectively do split between
CONN_V4() and the v4 specific path.  Obviously that means constructing
the v4mapped address in the caller, but we were already doing that in
at least one of the cases.

> 
> >  
> > -	if (af == AF_INET6					&&
> > -	    IN6_ARE_ADDR_EQUAL(&conn->a.a6, addr)		&&
> > +	if (IN6_ARE_ADDR_EQUAL(&conn->a.a6, addr)		&&
> >  	    conn->tap_port == tap_port && conn->sock_port == sock_port)
> >  		return 1;
> >  
> 

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 07/10] tcp: Remove ugly address union from struct tcp_conn
  2022-11-07 18:08   ` Stefano Brivio
@ 2022-11-08  0:54     ` David Gibson
  0 siblings, 0 replies; 22+ messages in thread
From: David Gibson @ 2022-11-08  0:54 UTC (permalink / raw)
  To: Stefano Brivio; +Cc: passt-dev

[-- Attachment #1: Type: text/plain, Size: 7729 bytes --]

On Mon, Nov 07, 2022 at 07:08:38PM +0100, Stefano Brivio wrote:
> On Fri,  4 Nov 2022 19:43:30 +1100
> David Gibson <david@gibson.dropbear.id.au> wrote:
> 
> > We now only extract the v4 part of the tcp_conn::a union in one place.
> > Make this neater by using a helper to extract the IPv4 address from the
> > mapped address in that place.
> > 
> > Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
> > ---
> >  tcp.c  | 40 +++++++++++++++-------------------------
> >  util.c | 13 +++++++++++++
> >  util.h |  1 +
> >  3 files changed, 29 insertions(+), 25 deletions(-)
> > 
> > diff --git a/tcp.c b/tcp.c
> > index 3816a1c..6634abb 100644
> > --- a/tcp.c
> > +++ b/tcp.c
> > @@ -416,10 +416,7 @@ struct tcp6_l2_head {	/* For MSS6 macro: keep in sync with tcp6_l2_buf_t */
> >   * @ws_to_tap:		Window scaling factor advertised to tap/guest
> >   * @sndbuf:		Sending buffer in kernel, rounded to 2 ^ SNDBUF_BITS
> >   * @seq_dup_ack_approx:	Last duplicate ACK number sent to tap
> > - * @a.a6:		IPv6 remote address, can be IPv4-mapped
> > - * @a.a4.zero:		Zero prefix for IPv4-mapped, see RFC 6890, Table 20
> > - * @a.a4.one:		Ones prefix for IPv4-mapped
> > - * @a.a4.a:		IPv4 address
> > + * @addr:		IPv6 remote address, can be IPv4-mapped
> >   * @tap_port:		Guest-facing tap port
> >   * @sock_port:		Remote, socket-facing port
> >   * @wnd_from_tap:	Last window size from tap, unscaled (as received)
> > @@ -489,15 +486,8 @@ struct tcp_conn {
> >  	uint8_t		seq_dup_ack_approx;
> >  
> >  
> > -	union {
> > -		struct in6_addr a6;
> > -		struct {
> > -			uint8_t zero[10];
> > -			uint8_t one[2];
> > -			struct in_addr a;
> > -		} a4;
> > -	} a;
> > -#define CONN_V4(conn)		IN6_IS_ADDR_V4MAPPED(&conn->a.a6)
> > +	struct in6_addr addr;
> > +#define CONN_V4(conn)		IN6_IS_ADDR_V4MAPPED(&conn->addr)
> >  #define CONN_V6(conn)		(!CONN_V4(conn))
> >  
> >  	in_port_t	tap_port;
> > @@ -960,7 +950,7 @@ static int tcp_rtt_dst_low(const struct tcp_conn *conn)
> >  	int i;
> >  
> >  	for (i = 0; i < LOW_RTT_TABLE_SIZE; i++)
> > -		if (IN6_ARE_ADDR_EQUAL(&conn->a.a6, low_rtt_dst + i))
> > +		if (IN6_ARE_ADDR_EQUAL(&conn->addr, low_rtt_dst + i))
> >  			return 1;
> >  
> >  	return 0;
> > @@ -982,7 +972,7 @@ static void tcp_rtt_dst_check(const struct tcp_conn *conn,
> >  		return;
> >  
> >  	for (i = 0; i < LOW_RTT_TABLE_SIZE; i++) {
> > -		if (IN6_ARE_ADDR_EQUAL(&conn->a.a6, low_rtt_dst + i))
> > +		if (IN6_ARE_ADDR_EQUAL(&conn->addr, low_rtt_dst + i))
> >  			return;
> >  		if (hole == -1 && IN6_IS_ADDR_UNSPECIFIED(low_rtt_dst + i))
> >  			hole = i;
> > @@ -994,10 +984,10 @@ static void tcp_rtt_dst_check(const struct tcp_conn *conn,
> >  	if (hole == -1)
> >  		return;
> >  
> > -	memcpy(low_rtt_dst + hole++, &conn->a.a6, sizeof(conn->a.a6));
> > +	memcpy(low_rtt_dst + hole++, &conn->addr, sizeof(conn->addr));
> >  	if (hole == LOW_RTT_TABLE_SIZE)
> >  		hole = 0;
> > -	memcpy(low_rtt_dst + hole, &in6addr_any, sizeof(conn->a.a6));
> > +	memcpy(low_rtt_dst + hole, &in6addr_any, sizeof(conn->addr));
> >  #else
> >  	(void)conn;
> >  	(void)tinfo;
> > @@ -1285,7 +1275,7 @@ static int tcp_hash_match(const struct tcp_conn *conn,
> >  			  const struct in6_addr *addr,
> >  			  in_port_t tap_port, in_port_t sock_port)
> >  {
> > -	if (IN6_ARE_ADDR_EQUAL(&conn->a.a6, addr)		&&
> > +	if (IN6_ARE_ADDR_EQUAL(&conn->addr, addr)		&&
> >  	    conn->tap_port == tap_port && conn->sock_port == sock_port)
> >  		return 1;
> >  
> > @@ -1330,7 +1320,7 @@ static void tcp_hash_insert(const struct ctx *c, struct tcp_conn *conn)
> >  {
> >  	int b;
> >  
> > -	b = tcp_hash(c, &conn->a.a6, conn->tap_port, conn->sock_port);
> > +	b = tcp_hash(c, &conn->addr, conn->tap_port, conn->sock_port);
> >  	conn->next_index = tc_hash[b] ? tc_hash[b] - tc : -1;
> >  	tc_hash[b] = conn;
> >  	conn->hash_bucket = b;
> > @@ -1660,7 +1650,7 @@ do {									\
> >  		ip_len = plen + sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
> >  
> >  		b->ip6h.payload_len = htons(plen + sizeof(struct tcphdr));
> > -		b->ip6h.saddr = conn->a.a6;
> > +		b->ip6h.saddr = conn->addr;
> >  		if (IN6_IS_ADDR_LINKLOCAL(&b->ip6h.saddr))
> >  			b->ip6h.daddr = c->ip6.addr_ll_seen;
> >  		else
> > @@ -1684,7 +1674,7 @@ do {									\
> >  
> >  		ip_len = plen + sizeof(struct iphdr) + sizeof(struct tcphdr);
> >  		b->iph.tot_len = htons(ip_len);
> > -		b->iph.saddr = conn->a.a4.a.s_addr;
> > +		b->iph.saddr = decode_ip4mapped_ip6(&conn->addr).s_addr;
> >  		b->iph.daddr = c->ip4.addr_seen.s_addr;
> >  
> >  		if (check)
> > @@ -2202,12 +2192,12 @@ static void tcp_conn_from_tap(struct ctx *c, int af, const void *addr,
> >  		sa = (struct sockaddr *)&addr4;
> >  		sl = sizeof(addr4);
> >  
> > -		encode_ip4mapped_ip6(&conn->a.a6, addr);
> > +		encode_ip4mapped_ip6(&conn->addr, addr);
> >  	} else {
> >  		sa = (struct sockaddr *)&addr6;
> >  		sl = sizeof(addr6);
> >  
> > -		memcpy(&conn->a.a6,      addr, sizeof(conn->a.a6));
> > +		memcpy(&conn->addr,      addr, sizeof(conn->addr));
> >  	}
> >  
> >  	conn->sock_port = ntohs(th->dest);
> > @@ -2869,7 +2859,7 @@ static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref,
> >  			memcpy(&sa6.sin6_addr, src, sizeof(*src));
> >  		}
> >  
> > -		memcpy(&conn->a.a6, &sa6.sin6_addr, sizeof(conn->a.a6));
> > +		memcpy(&conn->addr, &sa6.sin6_addr, sizeof(conn->addr));
> >  
> >  		conn->sock_port = ntohs(sa6.sin6_port);
> >  		conn->tap_port = ref.r.p.tcp.tcp.index;
> > @@ -2888,7 +2878,7 @@ static void tcp_conn_from_sock(struct ctx *c, union epoll_ref ref,
> >  		    IN4_ARE_ADDR_EQUAL(&sa4.sin_addr, &c->ip4.addr_seen))
> >  			sa4.sin_addr = c->ip4.gw;
> >  
> > -		encode_ip4mapped_ip6(&conn->a.a6, &sa4.sin_addr);
> > +		encode_ip4mapped_ip6(&conn->addr, &sa4.sin_addr);
> >  
> >  		conn->sock_port = ntohs(sa4.sin_port);
> >  		conn->tap_port = ref.r.p.tcp.tcp.index;
> > diff --git a/util.c b/util.c
> > index 257d0b6..ec7f57f 100644
> > --- a/util.c
> > +++ b/util.c
> > @@ -502,3 +502,16 @@ void encode_ip4mapped_ip6(struct in6_addr *ip6, const void *ip4)
> >  	memset(&a->one, 0xff, sizeof(a->one));
> >  	memcpy(&a->a4, ip4, sizeof(a->a4));
> >  }
> > +
> > +/**
> > + * decode_ip4mapped_ip6() - Decode an IPv4-mapped IPv6 address
> > + * @ip6:	IPv6 address
> > + *
> > + * Returns:	IPv4 address
> 
> Return: IPv4 address

Fixed.  I note there are a few other instances of this mistake in the
code, most by fault by the looks of it.  I'll try to fix those others
at some point.

> > + */
> > +struct in_addr decode_ip4mapped_ip6(const struct in6_addr *ip6)
> > +{
> > +	const struct ip4mapped_ip6 *a = (const struct ip4mapped_ip6 *)ip6;
> 
> ...if you use struct ip4_mapped_ip6 directly in struct conn, you could
> drop this cast, and perhaps this helper too, but I'm not sure if it has
> other implications.

True.  As I said, I'm considering a slightly different approach now.

> 
> > +
> > +	return a->a4;
> > +}
> > diff --git a/util.h b/util.h
> > index f7d6a6f..103211d 100644
> > --- a/util.h
> > +++ b/util.h
> > @@ -210,5 +210,6 @@ int __daemon(int pidfile_fd, int devnull_fd);
> >  int fls(unsigned long x);
> >  int write_file(const char *path, const char *buf);
> >  void encode_ip4mapped_ip6(struct in6_addr *ip6, const void *ip4);
> > +struct in_addr decode_ip4mapped_ip6(const struct in6_addr *ip6);
> >  
> >  #endif /* UTIL_H */
> 

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 10/10] tcp: Fix small error in tcp_seq_init() time handling
  2022-11-07 18:08   ` Stefano Brivio
@ 2022-11-08  0:59     ` David Gibson
  0 siblings, 0 replies; 22+ messages in thread
From: David Gibson @ 2022-11-08  0:59 UTC (permalink / raw)
  To: Stefano Brivio; +Cc: passt-dev

[-- Attachment #1: Type: text/plain, Size: 2466 bytes --]

On Mon, Nov 07, 2022 at 07:08:46PM +0100, Stefano Brivio wrote:
> On Fri,  4 Nov 2022 19:43:33 +1100
> David Gibson <david@gibson.dropbear.id.au> wrote:
> 
> > It looks like tcp_seq_init() is supposed to advance the sequence number
> > by one every 32ns.  However we only right shift the ns part of the timespec
> > not the seconds part, meaning that we'll advance by an extra 32 steps on
> > each second.
> > 
> > I don't know if that's exploitable in any way, but it doesn't appear to be
> > the intent, nor what RFC 6528 suggests.
> 
> Oh, oops, nice catch.
> 
> Well, as long as the step, modulo 32 bits, is not 0, it's still
> arguably the 250 KHz / 4 µs period clock from RFC 793, so there's no

Well, except for the fact it's a 31.24 MHz / 32 ns clock.  I assumed
there was a good reason for that.

> practical difference (other than the overflow period). See also the
> note in RFC 1948:
> 
>    More precisely, RFC 793 specifies that the 32-bit counter be
>    incremented by 1 in the low-order position about every 4
>    microseconds.  Instead, Berkeley-derived kernels increment it by a
>    constant every second [...]
> 
> I kind of wonder if adding a time non-linearity like the unintended one
> I added actually increases entropy.

Right, I don't know.

> But indeed ~4 seconds overflow is also not intended, and we should just
> stick to RFC 6528.

Well... 4s overflow, yes, but not I think a 4s period before
repeating.  Again, I don't know enough about this stuff to analyze
whether that's important or not.

> 
> > Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
> > ---
> >  tcp.c | 4 ++--
> >  1 file changed, 2 insertions(+), 2 deletions(-)
> > 
> > diff --git a/tcp.c b/tcp.c
> > index 59e03ff..941fafb 100644
> > --- a/tcp.c
> > +++ b/tcp.c
> > @@ -2027,8 +2027,8 @@ static void tcp_seq_init(const struct ctx *c, struct tcp_conn *conn,
> >  
> >  	seq = siphash_36b((uint8_t *)&in, c->tcp.hash_secret);
> >  
> > -	ns = now->tv_sec * 1E9;
> > -	ns += now->tv_nsec >> 5; /* 32ns ticks, overflows 32 bits every 137s */
> > +	/* 32ns ticks, overflows 32 bits every 137s */
> > +	ns = (now->tv_sec * 1E9 + now->tv_nsec) >> 5;
> >  
> >  	conn->seq_to_tap = seq + ns;
> >  }
> 

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 22+ messages in thread

end of thread, other threads:[~2022-11-08  1:02 UTC | newest]

Thread overview: 22+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-11-04  8:43 [PATCH 00/10] RFC: Preliminaries for using share IPv4 & IPv6 sockets David Gibson
2022-11-04  8:43 ` [PATCH 01/10] tcp: no v6 flag in ref David Gibson
2022-11-07 18:07   ` Stefano Brivio
2022-11-08  0:35     ` David Gibson
2022-11-04  8:43 ` [PATCH 02/10] tcp: Helper to encode IPv4-mapped IPv6 addresses David Gibson
2022-11-07 18:08   ` Stefano Brivio
2022-11-08  0:46     ` David Gibson
2022-11-04  8:43 ` [PATCH 03/10] tcp: Partially unify IPv4 and IPv6 paths in tcp_hash_match() David Gibson
2022-11-07 18:08   ` Stefano Brivio
2022-11-08  0:51     ` David Gibson
2022-11-04  8:43 ` [PATCH 04/10] tcp: Hash IPv4 and IPv4-mapped-IPv6 addresses the same David Gibson
2022-11-04  8:43 ` [PATCH 05/10] tcp: Take tcp_hash_insert() address from struct tcp_conn David Gibson
2022-11-04  8:43 ` [PATCH 06/10] tcp: Unify IPv4 and IPv6 paths for hashing and matching David Gibson
2022-11-04  8:43 ` [PATCH 07/10] tcp: Remove ugly address union from struct tcp_conn David Gibson
2022-11-07 18:08   ` Stefano Brivio
2022-11-08  0:54     ` David Gibson
2022-11-04  8:43 ` [PATCH 08/10] tcp: Unify initial sequence numbers for IPv4 and IPv6 David Gibson
2022-11-04  8:43 ` [PATCH 09/10] tcp: Have tcp_seq_init() take its parameters from struct tcp_conn David Gibson
2022-11-04  8:43 ` [PATCH 10/10] tcp: Fix small error in tcp_seq_init() time handling David Gibson
2022-11-07 18:08   ` Stefano Brivio
2022-11-08  0:59     ` David Gibson
2022-11-04  8:47 ` [PATCH 00/10] RFC: Preliminaries for using share IPv4 & IPv6 sockets Stefano Brivio

Code repositories for project(s) associated with this public inbox

	https://passt.top/passt

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for IMAP folder(s).