public inbox for passt-dev@passt.top
 help / color / mirror / code / Atom feed
From: David Gibson <david@gibson.dropbear.id.au>
To: passt-dev@passt.top, Stefano Brivio <sbrivio@redhat.com>
Cc: David Gibson <david@gibson.dropbear.id.au>
Subject: [PATCH 1/3] tcp: Switch hash table to linear probing instead of chaining
Date: Mon,  4 Dec 2023 14:16:09 +1100	[thread overview]
Message-ID: <20231204031611.3566791-2-david@gibson.dropbear.id.au> (raw)
In-Reply-To: <20231204031611.3566791-1-david@gibson.dropbear.id.au>

Currently we deal with hash collisions by letting a hash bucket contain
multiple entries, forming a linked list using an index in the connection
structure.

That's a pretty standard and simple approach, but in our case we can use
an even simpler one: linear probing.  Here if a hash bucket is occupied
we just move onto the next one until we find a feww one.  This slightly
simplifies lookup and more importantly saves some precious bytes in the
connection structure by removing the need for a link.  It does require some
additional complexity for hash removal.

This approach can perform poorly with hash table load is high.  However, we
already size our hash table of pointers larger than the connection table,
which puts an upper bound on the load.  It's relatively cheap to decrease
that bound if we find we need to.

I adapted the linear probing operations from Knuth's The Art of Computer
Programming, Volume 3, 2nd Edition.  Specifically Algorithm L and Algorithm
R in Section 6.4.  Note that there is an error in Algorithm R as printed,
see errata at [0].

[0] https://www-cs-faculty.stanford.edu/~knuth/all3-prepre.ps.gz

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
---
 tcp.c      | 111 +++++++++++++++++++++++++++--------------------------
 tcp_conn.h |   2 -
 util.h     |  13 +++++++
 3 files changed, 69 insertions(+), 57 deletions(-)

diff --git a/tcp.c b/tcp.c
index 17c7cba..09acf7f 100644
--- a/tcp.c
+++ b/tcp.c
@@ -573,22 +573,12 @@ static unsigned int tcp6_l2_flags_buf_used;
 
 #define CONN(idx)		(&(FLOW(idx)->tcp))
 
-/** conn_at_idx() - Find a connection by index, if present
- * @idx:	Index of connection to lookup
- *
- * Return: pointer to connection, or NULL if @idx is out of bounds
- */
-static inline struct tcp_tap_conn *conn_at_idx(unsigned idx)
-{
-	if (idx >= FLOW_MAX)
-		return NULL;
-	ASSERT(CONN(idx)->f.type == FLOW_TCP);
-	return CONN(idx);
-}
-
 /* Table for lookup from remote address, local port, remote port */
 static struct tcp_tap_conn *tc_hash[TCP_HASH_TABLE_SIZE];
 
+static_assert(ARRAY_SIZE(tc_hash) >= FLOW_MAX,
+	"Safe linear probing requires hash table larger than connection table");
+
 /* Pools for pre-opened sockets (in init) */
 int init_sock_pool4		[TCP_SOCK_POOL_SIZE];
 int init_sock_pool6		[TCP_SOCK_POOL_SIZE];
@@ -1196,6 +1186,27 @@ static unsigned int tcp_conn_hash(const struct ctx *c,
 	return tcp_hash(c, &conn->faddr, conn->eport, conn->fport);
 }
 
+/**
+ * tcp_hash_probe() - Find hash bucket for a connection
+ * @c:		Execution context
+ * @conn:	Connection to find bucket for
+ *
+ * Return: If @conn is in the table, its current bucket, otherwise a suitable
+ *         free bucket for it.
+ */
+static inline unsigned tcp_hash_probe(const struct ctx *c,
+				      const struct tcp_tap_conn *conn)
+{
+	unsigned b;
+
+	/* Linear probing */
+	for (b = tcp_conn_hash(c, conn); tc_hash[b] && tc_hash[b] != conn;
+	     b = (b + 1) % TCP_HASH_TABLE_SIZE)
+		;
+
+	return b;
+}
+
 /**
  * tcp_hash_insert() - Insert connection into hash table, chain link
  * @c:		Execution context
@@ -1203,14 +1214,10 @@ static unsigned int tcp_conn_hash(const struct ctx *c,
  */
 static void tcp_hash_insert(const struct ctx *c, struct tcp_tap_conn *conn)
 {
-	int b;
+	unsigned b = tcp_hash_probe(c, conn);
 
-	b = tcp_hash(c, &conn->faddr, conn->eport, conn->fport);
-	conn->next_index = tc_hash[b] ? FLOW_IDX(tc_hash[b]) : -1U;
 	tc_hash[b] = conn;
-
-	flow_dbg(conn, "hash table insert: sock %i, bucket: %i, next: %p",
-		 conn->sock, b, (void *)conn_at_idx(conn->next_index));
+	flow_dbg(conn, "hash table insert: sock %i, bucket: %u", conn->sock, b);
 }
 
 /**
@@ -1221,23 +1228,27 @@ static void tcp_hash_insert(const struct ctx *c, struct tcp_tap_conn *conn)
 static void tcp_hash_remove(const struct ctx *c,
 			    const struct tcp_tap_conn *conn)
 {
-	struct tcp_tap_conn *entry, *prev = NULL;
-	int b = tcp_conn_hash(c, conn);
+	unsigned b = tcp_hash_probe(c, conn), s;
 
-	for (entry = tc_hash[b]; entry;
-	     prev = entry, entry = conn_at_idx(entry->next_index)) {
-		if (entry == conn) {
-			if (prev)
-				prev->next_index = conn->next_index;
-			else
-				tc_hash[b] = conn_at_idx(conn->next_index);
-			break;
+	if (!tc_hash[b])
+		return; /* Redundant remove */
+
+	flow_dbg(conn, "hash table remove: sock %i, bucket: %u", conn->sock, b);
+
+	/* Scan the remainder of the cluster */
+	for (s = (b + 1) % TCP_HASH_TABLE_SIZE; tc_hash[s];
+	     s = (s + 1) % TCP_HASH_TABLE_SIZE) {
+		unsigned h = tcp_conn_hash(c, tc_hash[s]);
+
+		if (in_mod_range(h, b, s, TCP_HASH_TABLE_SIZE)) {
+			/* tc_hash[s] can live in tc_hash[b]'s slot */
+			debug("hash table remove: shuffle %u -> %u", s, b);
+			tc_hash[b] = tc_hash[s];
+			b = s;
 		}
 	}
 
-	flow_dbg(conn, "hash table remove: sock %i, bucket: %i, new: %p",
-		 conn->sock, b,
-		 (void *)(prev ? conn_at_idx(prev->next_index) : tc_hash[b]));
+	tc_hash[b] = NULL;
 }
 
 /**
@@ -1250,24 +1261,15 @@ void tcp_tap_conn_update(const struct ctx *c, struct tcp_tap_conn *old,
 			 struct tcp_tap_conn *new)
 
 {
-	struct tcp_tap_conn *entry, *prev = NULL;
-	int b = tcp_conn_hash(c, old);
+	unsigned b = tcp_hash_probe(c, old);
 
-	for (entry = tc_hash[b]; entry;
-	     prev = entry, entry = conn_at_idx(entry->next_index)) {
-		if (entry == old) {
-			if (prev)
-				prev->next_index = FLOW_IDX(new);
-			else
-				tc_hash[b] = new;
-			break;
-		}
-	}
+	if (!tc_hash[b])
+		return; /* Not in hash table, nothing to update */
+
+	tc_hash[b] = new;
 
 	debug("TCP: hash table update: old index %u, new index %u, sock %i, "
-	      "bucket: %i, old: %p, new: %p",
-	      FLOW_IDX(old), FLOW_IDX(new), new->sock, b,
-	      (void *)old, (void *)new);
+	      "bucket: %u", FLOW_IDX(old), FLOW_IDX(new), new->sock, b);
 
 	tcp_epoll_ctl(c, new);
 }
@@ -1287,17 +1289,16 @@ static struct tcp_tap_conn *tcp_hash_lookup(const struct ctx *c,
 					    in_port_t eport, in_port_t fport)
 {
 	union inany_addr aany;
-	struct tcp_tap_conn *conn;
-	int b;
+	unsigned b;
 
 	inany_from_af(&aany, af, faddr);
-	b = tcp_hash(c, &aany, eport, fport);
-	for (conn = tc_hash[b]; conn; conn = conn_at_idx(conn->next_index)) {
-		if (tcp_hash_match(conn, &aany, eport, fport))
-			return conn;
-	}
 
-	return NULL;
+	for (b = tcp_hash(c, &aany, eport, fport);
+	     tc_hash[b] && !tcp_hash_match(tc_hash[b], &aany, eport, fport);
+	     b = (b + 1) % TCP_HASH_TABLE_SIZE)
+		;
+
+	return tc_hash[b];
 }
 
 /**
diff --git a/tcp_conn.h b/tcp_conn.h
index 3900305..e3400bb 100644
--- a/tcp_conn.h
+++ b/tcp_conn.h
@@ -13,7 +13,6 @@
  * struct tcp_tap_conn - Descriptor for a TCP connection (not spliced)
  * @f:			Generic flow information
  * @in_epoll:		Is the connection in the epoll set?
- * @next_index:		Connection index of next item in hash chain, -1 for none
  * @tap_mss:		MSS advertised by tap/guest, rounded to 2 ^ TCP_MSS_BITS
  * @sock:		Socket descriptor number
  * @events:		Connection events, implying connection states
@@ -40,7 +39,6 @@ struct tcp_tap_conn {
 	struct flow_common f;
 
 	bool		in_epoll	:1;
-	unsigned 	next_index	:FLOW_INDEX_BITS + 2;
 
 #define TCP_RETRANS_BITS		3
 	unsigned int	retrans		:TCP_RETRANS_BITS;
diff --git a/util.h b/util.h
index b1106e8..4550a35 100644
--- a/util.h
+++ b/util.h
@@ -225,6 +225,19 @@ int __daemon(int pidfile_fd, int devnull_fd);
 int fls(unsigned long x);
 int write_file(const char *path, const char *buf);
 
+/**
+ * in_mod_range() - Determine if a value is in a cyclic range
+ * @x, @i, @j:	Unsigned values < @m
+ * @m:		Modulus
+ *
+ * Returns true iff @x is in the cyclic range of values from @i..@j (mod @m),
+ * inclusive of @i, exclusive of @j.
+ */
+static inline bool in_mod_range(unsigned x, unsigned i, unsigned j, unsigned m)
+{
+	return ((x - i) % m) < ((j - i) % m);
+}
+
 /*
  * Workarounds for https://github.com/llvm/llvm-project/issues/58992
  *
-- 
@@ -225,6 +225,19 @@ int __daemon(int pidfile_fd, int devnull_fd);
 int fls(unsigned long x);
 int write_file(const char *path, const char *buf);
 
+/**
+ * in_mod_range() - Determine if a value is in a cyclic range
+ * @x, @i, @j:	Unsigned values < @m
+ * @m:		Modulus
+ *
+ * Returns true iff @x is in the cyclic range of values from @i..@j (mod @m),
+ * inclusive of @i, exclusive of @j.
+ */
+static inline bool in_mod_range(unsigned x, unsigned i, unsigned j, unsigned m)
+{
+	return ((x - i) % m) < ((j - i) % m);
+}
+
 /*
  * Workarounds for https://github.com/llvm/llvm-project/issues/58992
  *
-- 
2.43.0


  reply	other threads:[~2023-12-04  3:16 UTC|newest]

Thread overview: 11+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-12-04  3:16 [PATCH 0/3] RFC: TCP hash change changes, in preparation for flow table David Gibson
2023-12-04  3:16 ` David Gibson [this message]
2023-12-06 22:43   ` [PATCH 1/3] tcp: Switch hash table to linear probing instead of chaining Stefano Brivio
2023-12-07  4:11     ` David Gibson
2023-12-11  9:00       ` Stefano Brivio
2023-12-04  3:16 ` [PATCH 2/3] tcp: Implement hash table with indices rather than pointers David Gibson
2023-12-06 19:37   ` Stefano Brivio
2023-12-07  1:04     ` David Gibson
2023-12-07  5:10       ` David Gibson
2023-12-07  6:20       ` Stefano Brivio
2023-12-04  3:16 ` [PATCH 3/3] tcp: Don't account for hash table size in tcp_hash() David Gibson

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20231204031611.3566791-2-david@gibson.dropbear.id.au \
    --to=david@gibson.dropbear.id.au \
    --cc=passt-dev@passt.top \
    --cc=sbrivio@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this public inbox

	https://passt.top/passt

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for IMAP folder(s).