From: Jon Maloy <jmaloy@redhat.com>
To: sbrivio@redhat.com, dgibson@redhat.com,
david@gibson.dropbear.id.au, jmaloy@redhat.com,
passt-dev@passt.top
Subject: [PATCH v4 9/9] fwd: Added cache table for ARP/NDP contents
Date: Tue, 19 Aug 2025 23:10:05 -0400 [thread overview]
Message-ID: <20250820031005.2725591-10-jmaloy@redhat.com> (raw)
In-Reply-To: <20250820031005.2725591-1-jmaloy@redhat.com>
We add a cache table to keep partial contents of the kernel ARP/NDP
tables. This way, we drastically reduce the number of netlink calls
to read those tables.
We create dummy cache entries representing non-(not-yet)-existing
ARP/NDP entries when needed. We add a short expiration time to each
such entry, so that we can know when to make repeated calls to the
kernel tables in the beginning. We also add an access counter to the
entries, to ensure that the timer becomes longer and the call frequency
abates over time if no ARP/NDP entry is created.
For regular entries we use a much longer timer, with the purpose to
update the entry in the rare case that a remote host changes its
MAC address.
Signed-off-by: Jon Maloy <jmaloy@redhat.com>
---
arp.c | 3 +-
conf.c | 2 +
flow.c | 5 +-
fwd.c | 206 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
fwd.h | 4 ++
ndp.c | 3 +-
tcp.c | 3 +-
7 files changed, 218 insertions(+), 8 deletions(-)
diff --git a/arp.c b/arp.c
index c37867a..040d4fe 100644
--- a/arp.c
+++ b/arp.c
@@ -29,7 +29,6 @@
#include "dhcp.h"
#include "passt.h"
#include "tap.h"
-#include "netlink.h"
/**
* arp() - Check if this is a supported ARP message, reply as needed
@@ -79,7 +78,7 @@ int arp(const struct ctx *c, const struct pool *p)
*/
inany_from_af(&tgt, AF_INET, am->tip);
if (!fwd_inany_nat(c, &tgt))
- nl_neigh_mac_get(nl_sock, &tgt, c->ifi4, am->sha);
+ fwd_neigh_mac_get(c, &tgt, c->ifi4, am->sha);
memcpy(swap, am->tip, sizeof(am->tip));
memcpy(am->tip, am->sip, sizeof(am->tip));
diff --git a/conf.c b/conf.c
index f47f48e..0abdbf7 100644
--- a/conf.c
+++ b/conf.c
@@ -2122,6 +2122,8 @@ void conf(struct ctx *c, int argc, char **argv)
c->udp.fwd_out.mode = fwd_default;
fwd_scan_ports_init(c);
+ if (fwd_mac_cache_init())
+ die("Failed to initiate neighnor MAC cache");
if (!c->quiet)
conf_print(c);
diff --git a/flow.c b/flow.c
index d7b3fd1..3ca30f1 100644
--- a/flow.c
+++ b/flow.c
@@ -459,7 +459,7 @@ struct flowside *flow_target(const struct ctx *c, union flow *flow,
*/
ifi = inany_v4(&ini->oaddr) ? c->ifi4 : c->ifi6;
if (!fwd_inany_nat(c, &ini->oaddr))
- nl_neigh_mac_get(nl_sock, &ini->oaddr, ifi, f->tap_omac);
+ fwd_neigh_mac_get(c, &ini->oaddr, ifi, f->tap_omac);
break;
case PIF_SPLICE:
@@ -474,7 +474,7 @@ struct flowside *flow_target(const struct ctx *c, union flow *flow,
*/
ifi = inany_v4(&ini->eaddr) ? c->ifi4 : c->ifi6;
if (!fwd_inany_nat(c, &ini->eaddr))
- nl_neigh_mac_get(nl_sock, &ini->eaddr, ifi, f->tap_omac);
+ fwd_neigh_mac_get(c, &ini->eaddr, ifi, f->tap_omac);
break;
default:
@@ -491,6 +491,7 @@ struct flowside *flow_target(const struct ctx *c, union flow *flow,
f->pif[TGTSIDE] = tgtpif;
flow_set_state(f, FLOW_STATE_TGT);
+
return tgt;
}
diff --git a/fwd.c b/fwd.c
index 55bf5f2..6647764 100644
--- a/fwd.c
+++ b/fwd.c
@@ -19,6 +19,8 @@
#include <sched.h>
#include <unistd.h>
#include <stdio.h>
+#include <time.h>
+#include <netinet/if_ether.h>
#include "util.h"
#include "ip.h"
@@ -26,6 +28,8 @@
#include "passt.h"
#include "lineread.h"
#include "flow_table.h"
+#include "inany.h"
+#include "netlink.h"
/* Empheral port range: values from RFC 6335 */
static in_port_t fwd_ephemeral_min = (1 << 15) + (1 << 14);
@@ -33,6 +37,208 @@ static in_port_t fwd_ephemeral_max = NUM_PORTS - 1;
#define PORT_RANGE_SYSCTL "/proc/sys/net/ipv4/ip_local_port_range"
+#define MAC_CACHE_BUCKETS 1024
+#define MAC_CACHE_RENEWAL 3600 /* Refresh entry from ARP/NDP every hour */
+
+/* Partial cache of ARP/NDP table contents */
+struct mac_cache_entry {
+ union inany_addr key;
+ unsigned char mac[ETH_ALEN];
+ struct timespec expiry;
+ uint32_t count;
+ struct mac_cache_entry *next;
+};
+
+struct mac_cache_table {
+ struct mac_cache_entry **buckets;
+ size_t nbuckets;
+};
+
+static struct mac_cache_table mac_cache;
+
+/**
+ * timespec_before() - Check the relation between two pints in time
+ * @a: Point in time to be tested
+ * @b: Point in time test a against
+ * Return: True if a comes before b, otherwise b
+ */
+static inline bool timespec_before(const struct timespec *a,
+ const struct timespec *b)
+{
+ return (a->tv_sec < b->tv_sec) ||
+ (a->tv_sec == b->tv_sec && a->tv_nsec < b->tv_nsec);
+}
+
+/**
+ * mac_entry_is_dummy() - Check if a cache entry is a placeholder
+ * @c: Execution context
+ * @e: Cache entry
+ *
+ * Return: True if the entry is a placeholder, false otherwise
+ */
+bool mac_entry_is_dummy(const struct ctx *c, const struct mac_cache_entry *e)
+{
+ return !memcmp(c->our_tap_mac, e->mac, ETH_ALEN);
+}
+
+/**
+ * mac_entry_expired() - Check if a cache entry has expired
+ * @e: Cache entry
+ *
+ * Return: True if the entry has expired, false otherwise
+ */
+static bool mac_entry_expired(const struct mac_cache_entry *e)
+{
+ struct timespec now;
+
+ clock_gettime(CLOCK_MONOTONIC, &now);
+ return timespec_before(&e->expiry, &now);
+}
+
+/**
+ * mac_entry_set_expiry() - Set the time for a cache entry to expire
+ * @e: Cache entry
+ * @expiry: Expiration time, in seconds from current moment.
+ *
+ * Return: The result of the hash
+ */
+static void mac_entry_set_expiry(struct mac_cache_entry *e, int expiry)
+{
+ clock_gettime(CLOCK_MONOTONIC, &e->expiry);
+ e->expiry.tv_sec += expiry;
+}
+
+/**
+ * inany_hash32() - Hash the contenst of key into an integer
+ * @key: IPv4 or IPv6 address, used as key
+ *
+ * Return: The result of the hash
+ */
+static inline uint32_t inany_hash32(const struct ctx *c,
+ const union inany_addr *key)
+{
+ struct siphash_state st = SIPHASH_INIT(c->hash_secret);
+
+ inany_siphash_feed(&st, key);
+
+ return (uint32_t)siphash_final(&st, sizeof(*key), 0);
+}
+
+/**
+ * fwd_mac_cache_bucket_idx() - Find the table index of an entry
+ * @c: Execution context
+ * @key: IPv4 or IPv6 address, used as key for the hash lookup
+ * @nbuckets: Number of buckets in the table
+ *
+ * Return: The index found
+ */
+static inline size_t fwd_mac_cache_bucket_idx(const struct ctx *c,
+ const union inany_addr *key,
+ size_t nbuckets)
+{
+ uint32_t h = inany_hash32(c, key);
+
+ return (nbuckets & (nbuckets - 1)) ? (h % nbuckets) : (h & (nbuckets - 1));
+}
+
+/**
+ * fwd_mac_cache_find() - Find an entry in the ARP/NDP cache table
+ * @c: Execution context
+ * @key: IPv4 or IPv6 address, used as key for the hash lookup
+ *
+ * Return: Pointer to the entry on success, NULL on failure.
+ */
+static struct mac_cache_entry *fwd_mac_cache_find(const struct ctx *c,
+ const union inany_addr *key)
+{
+ const struct mac_cache_table *t = &mac_cache;
+ struct mac_cache_entry *e;
+ size_t idx;
+
+ idx = fwd_mac_cache_bucket_idx(c, key, t->nbuckets);
+ for (e = t->buckets[idx]; e; e = e->next)
+ if (inany_equals(&e->key, key))
+ return e;
+ return NULL;
+}
+
+/**
+ * fwd_mac_cache_add() - Add a new entry to the ARP/NDP cache table
+ * @c: Execution context
+ * @key: IPv4 or IPv6 address, used as key for the hash lookup
+ * @mac: Buffer for Ethernet MAC, left unchanged if not found/usable
+ *
+ * Return: Pointer to the new entry on success, NULL on failure.
+ */
+static struct mac_cache_entry *fwd_mac_cache_add(const struct ctx *c,
+ const union inany_addr *key,
+ const unsigned char *mac)
+{
+ struct mac_cache_table *t = &mac_cache;
+ size_t idx = fwd_mac_cache_bucket_idx(c, key, t->nbuckets);
+ struct mac_cache_entry *e;
+
+ e = calloc(1, sizeof(*e));
+ if (!e)
+ return NULL;
+ e->key = *key;
+ memcpy(e->mac, mac, ETH_ALEN);
+ e->count = 0;
+ e->next = t->buckets[idx];
+ t->buckets[idx] = e;
+
+ return e;
+}
+
+/**
+ * fwd_neigh_mac_get() - Find a MAC address the ARP/NDP cache table
+ * @c: Execution context
+ * @addr: IPv4 or IPv6 address
+ * @ifi: Interface index
+ * @mac: Buffer for Ethernet MAC, left unchanged if not found/usable
+ *
+ * Return: 0 on success, -1 on failure.
+ */
+int fwd_neigh_mac_get(const struct ctx *c, const union inany_addr *addr,
+ int ifi, unsigned char *mac)
+{
+ struct mac_cache_entry *e = fwd_mac_cache_find(c, addr);
+ bool refresh = false;
+
+ if (e)
+ refresh = mac_entry_expired(e);
+ else if ((e = fwd_mac_cache_add(c, addr, mac)))
+ refresh = true;
+ else
+ return -1;
+
+ if (refresh) {
+ nl_neigh_mac_get(nl_sock, addr, ifi, e->mac);
+ if (mac_entry_is_dummy(c, e))
+ mac_entry_set_expiry(e, e->count++);
+ else
+ mac_entry_set_expiry(e, MAC_CACHE_RENEWAL);
+ }
+ memcpy(mac, e->mac, ETH_ALEN);
+
+ return 0;
+}
+
+/**
+ * fwd_mac_cache_init() - Initiate ARP/NDP cache table
+ *
+ * Return: 0 on success, -1 on failure.
+ */
+int fwd_mac_cache_init(void)
+{
+ struct mac_cache_table *t = &mac_cache;
+
+ t->nbuckets = MAC_CACHE_BUCKETS;
+ t->buckets = calloc(t->nbuckets, sizeof(*t->buckets));
+
+ return t->buckets ? 0 : -1;
+}
+
/** fwd_probe_ephemeral() - Determine what ports this host considers ephemeral
*
* Work out what ports the host thinks are emphemeral and record it for later
diff --git a/fwd.h b/fwd.h
index c8d485d..ebfc114 100644
--- a/fwd.h
+++ b/fwd.h
@@ -58,4 +58,8 @@ uint8_t fwd_nat_from_host(const struct ctx *c, uint8_t proto,
const struct flowside *ini, struct flowside *tgt);
bool fwd_inany_nat(const struct ctx *c, const union inany_addr *addr);
+int fwd_neigh_mac_get(const struct ctx *c, const union inany_addr *addr,
+ int ifi, unsigned char *mac);
+int fwd_mac_cache_init(void);
+
#endif /* FWD_H */
diff --git a/ndp.c b/ndp.c
index 19b9b28..af957c9 100644
--- a/ndp.c
+++ b/ndp.c
@@ -32,7 +32,6 @@
#include "passt.h"
#include "tap.h"
#include "log.h"
-#include "netlink.h"
#define RT_LIFETIME 65535
@@ -222,7 +221,7 @@ static void ndp_na(const struct ctx *c, const struct in6_addr *dst,
*/
inany_from_af(&tgt, AF_INET6, addr);
if (!fwd_inany_nat(c, &tgt))
- nl_neigh_mac_get(nl_sock, &tgt, c->ifi6, na.target_l2_addr.mac);
+ fwd_neigh_mac_get(c, &tgt, c->ifi6, na.target_l2_addr.mac);
ndp_send(c, dst, &na, sizeof(na));
}
diff --git a/tcp.c b/tcp.c
index 28f9ef5..6b5b4ff 100644
--- a/tcp.c
+++ b/tcp.c
@@ -309,7 +309,6 @@
#include "tcp_internal.h"
#include "tcp_buf.h"
#include "tcp_vu.h"
-#include "netlink.h"
#ifndef __USE_MISC
/* From Linux UAPI, missing in netinet/tcp.h provided by musl */
@@ -1908,7 +1907,7 @@ static void tcp_rst_no_conn(const struct ctx *c, int af,
memcpy(src_mac, c->our_tap_mac, ETH_ALEN);
inany_from_af(&tgt, af, daddr);
if (!fwd_inany_nat(c, &tgt))
- nl_neigh_mac_get(nl_sock, &tgt, ifi, src_mac);
+ fwd_neigh_mac_get(c, &tgt, ifi, src_mac);
if (af == AF_INET) {
struct iphdr *ip4h = tap_push_l2h(c, buf, src_mac, ETH_P_IP);
--
@@ -309,7 +309,6 @@
#include "tcp_internal.h"
#include "tcp_buf.h"
#include "tcp_vu.h"
-#include "netlink.h"
#ifndef __USE_MISC
/* From Linux UAPI, missing in netinet/tcp.h provided by musl */
@@ -1908,7 +1907,7 @@ static void tcp_rst_no_conn(const struct ctx *c, int af,
memcpy(src_mac, c->our_tap_mac, ETH_ALEN);
inany_from_af(&tgt, af, daddr);
if (!fwd_inany_nat(c, &tgt))
- nl_neigh_mac_get(nl_sock, &tgt, ifi, src_mac);
+ fwd_neigh_mac_get(c, &tgt, ifi, src_mac);
if (af == AF_INET) {
struct iphdr *ip4h = tap_push_l2h(c, buf, src_mac, ETH_P_IP);
--
2.50.1
next prev parent reply other threads:[~2025-08-20 3:10 UTC|newest]
Thread overview: 21+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-08-20 3:09 [PATCH v4 0/9] Use true MAC address of LAN local remote hosts Jon Maloy
2025-08-20 3:09 ` [PATCH v4 1/9] netlink: add function to extract MAC addresses from NDP/ARP table Jon Maloy
2025-08-21 0:57 ` David Gibson
2025-08-20 3:09 ` [PATCH v4 2/9] arp/ndp: respond with true MAC address of LAN local remote hosts Jon Maloy
2025-08-21 1:18 ` David Gibson
2025-08-20 3:09 ` [PATCH v4 3/9] flow: add MAC address of LAN local remote hosts to flow Jon Maloy
2025-08-21 1:28 ` David Gibson
2025-08-20 3:10 ` [PATCH v4 4/9] udp: forward external source MAC address through tap interface Jon Maloy
2025-08-21 1:32 ` David Gibson
2025-08-20 3:10 ` [PATCH v4 5/9] tcp: " Jon Maloy
2025-08-21 1:37 ` David Gibson
2025-08-20 3:10 ` [PATCH v4 6/9] tap: change signature of function tap_push_l2h() Jon Maloy
2025-08-21 1:39 ` David Gibson
2025-08-20 3:10 ` [PATCH v4 7/9] tcp: make tcp_rst_no_conn() respond with correct MAC address Jon Maloy
2025-08-21 1:46 ` David Gibson
2025-08-20 3:10 ` [PATCH v4 8/9] icmp: let icmp use mac address from flowside structure Jon Maloy
2025-08-21 1:51 ` David Gibson
2025-08-20 3:10 ` Jon Maloy [this message]
2025-08-21 2:03 ` [PATCH v4 9/9] fwd: Added cache table for ARP/NDP contents David Gibson
2025-08-21 10:53 ` Stefano Brivio
2025-08-25 1:48 ` David Gibson
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20250820031005.2725591-10-jmaloy@redhat.com \
--to=jmaloy@redhat.com \
--cc=david@gibson.dropbear.id.au \
--cc=dgibson@redhat.com \
--cc=passt-dev@passt.top \
--cc=sbrivio@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
Code repositories for project(s) associated with this public inbox
https://passt.top/passt
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for IMAP folder(s).