On Tue, Aug 19, 2025 at 11:10:05PM -0400, Jon Maloy wrote: > We add a cache table to keep partial contents of the kernel ARP/NDP > tables. This way, we drastically reduce the number of netlink calls > to read those tables. Do you have data to suggest this is necessary? It's a lot of code to optimise something only needed for some pretty uncommon cases. > We create dummy cache entries representing non-(not-yet)-existing > ARP/NDP entries when needed. We add a short expiration time to each > such entry, so that we can know when to make repeated calls to the > kernel tables in the beginning. We also add an access counter to the > entries, to ensure that the timer becomes longer and the call frequency > abates over time if no ARP/NDP entry is created. > > For regular entries we use a much longer timer, with the purpose to > update the entry in the rare case that a remote host changes its > MAC address. > > Signed-off-by: Jon Maloy > --- > arp.c | 3 +- > conf.c | 2 + > flow.c | 5 +- > fwd.c | 206 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ > fwd.h | 4 ++ > ndp.c | 3 +- > tcp.c | 3 +- > 7 files changed, 218 insertions(+), 8 deletions(-) > > diff --git a/arp.c b/arp.c > index c37867a..040d4fe 100644 > --- a/arp.c > +++ b/arp.c > @@ -29,7 +29,6 @@ > #include "dhcp.h" > #include "passt.h" > #include "tap.h" > -#include "netlink.h" > > /** > * arp() - Check if this is a supported ARP message, reply as needed > @@ -79,7 +78,7 @@ int arp(const struct ctx *c, const struct pool *p) > */ > inany_from_af(&tgt, AF_INET, am->tip); > if (!fwd_inany_nat(c, &tgt)) > - nl_neigh_mac_get(nl_sock, &tgt, c->ifi4, am->sha); > + fwd_neigh_mac_get(c, &tgt, c->ifi4, am->sha); > > memcpy(swap, am->tip, sizeof(am->tip)); > memcpy(am->tip, am->sip, sizeof(am->tip)); > diff --git a/conf.c b/conf.c > index f47f48e..0abdbf7 100644 > --- a/conf.c > +++ b/conf.c > @@ -2122,6 +2122,8 @@ void conf(struct ctx *c, int argc, char **argv) > c->udp.fwd_out.mode = fwd_default; > > fwd_scan_ports_init(c); > + if (fwd_mac_cache_init()) > + die("Failed to initiate neighnor MAC cache"); "neighnor" > if (!c->quiet) > conf_print(c); > diff --git a/flow.c b/flow.c > index d7b3fd1..3ca30f1 100644 > --- a/flow.c > +++ b/flow.c > @@ -459,7 +459,7 @@ struct flowside *flow_target(const struct ctx *c, union flow *flow, > */ > ifi = inany_v4(&ini->oaddr) ? c->ifi4 : c->ifi6; > if (!fwd_inany_nat(c, &ini->oaddr)) > - nl_neigh_mac_get(nl_sock, &ini->oaddr, ifi, f->tap_omac); > + fwd_neigh_mac_get(c, &ini->oaddr, ifi, f->tap_omac); > break; > > case PIF_SPLICE: > @@ -474,7 +474,7 @@ struct flowside *flow_target(const struct ctx *c, union flow *flow, > */ > ifi = inany_v4(&ini->eaddr) ? c->ifi4 : c->ifi6; > if (!fwd_inany_nat(c, &ini->eaddr)) > - nl_neigh_mac_get(nl_sock, &ini->eaddr, ifi, f->tap_omac); > + fwd_neigh_mac_get(c, &ini->eaddr, ifi, f->tap_omac); > break; > > default: > @@ -491,6 +491,7 @@ struct flowside *flow_target(const struct ctx *c, union flow *flow, > > f->pif[TGTSIDE] = tgtpif; > flow_set_state(f, FLOW_STATE_TGT); > + > return tgt; > } > > diff --git a/fwd.c b/fwd.c > index 55bf5f2..6647764 100644 > --- a/fwd.c > +++ b/fwd.c > @@ -19,6 +19,8 @@ > #include > #include > #include > +#include > +#include > > #include "util.h" > #include "ip.h" > @@ -26,6 +28,8 @@ > #include "passt.h" > #include "lineread.h" > #include "flow_table.h" > +#include "inany.h" > +#include "netlink.h" > > /* Empheral port range: values from RFC 6335 */ > static in_port_t fwd_ephemeral_min = (1 << 15) + (1 << 14); > @@ -33,6 +37,208 @@ static in_port_t fwd_ephemeral_max = NUM_PORTS - 1; > > #define PORT_RANGE_SYSCTL "/proc/sys/net/ipv4/ip_local_port_range" > > +#define MAC_CACHE_BUCKETS 1024 > +#define MAC_CACHE_RENEWAL 3600 /* Refresh entry from ARP/NDP every hour */ > + > +/* Partial cache of ARP/NDP table contents */ > +struct mac_cache_entry { > + union inany_addr key; > + unsigned char mac[ETH_ALEN]; > + struct timespec expiry; > + uint32_t count; > + struct mac_cache_entry *next; > +}; > + > +struct mac_cache_table { > + struct mac_cache_entry **buckets; > + size_t nbuckets; > +}; > + > +static struct mac_cache_table mac_cache; > + > +/** > + * timespec_before() - Check the relation between two pints in time > + * @a: Point in time to be tested > + * @b: Point in time test a against > + * Return: True if a comes before b, otherwise b > + */ > +static inline bool timespec_before(const struct timespec *a, > + const struct timespec *b) > +{ > + return (a->tv_sec < b->tv_sec) || > + (a->tv_sec == b->tv_sec && a->tv_nsec < b->tv_nsec); > +} > + > +/** > + * mac_entry_is_dummy() - Check if a cache entry is a placeholder > + * @c: Execution context > + * @e: Cache entry > + * > + * Return: True if the entry is a placeholder, false otherwise > + */ > +bool mac_entry_is_dummy(const struct ctx *c, const struct mac_cache_entry *e) > +{ > + return !memcmp(c->our_tap_mac, e->mac, ETH_ALEN); > +} > + > +/** > + * mac_entry_expired() - Check if a cache entry has expired > + * @e: Cache entry > + * > + * Return: True if the entry has expired, false otherwise > + */ > +static bool mac_entry_expired(const struct mac_cache_entry *e) > +{ > + struct timespec now; > + > + clock_gettime(CLOCK_MONOTONIC, &now); > + return timespec_before(&e->expiry, &now); > +} > + > +/** > + * mac_entry_set_expiry() - Set the time for a cache entry to expire > + * @e: Cache entry > + * @expiry: Expiration time, in seconds from current moment. > + * > + * Return: The result of the hash > + */ > +static void mac_entry_set_expiry(struct mac_cache_entry *e, int expiry) > +{ > + clock_gettime(CLOCK_MONOTONIC, &e->expiry); > + e->expiry.tv_sec += expiry; > +} > + > +/** > + * inany_hash32() - Hash the contenst of key into an integer > + * @key: IPv4 or IPv6 address, used as key > + * > + * Return: The result of the hash > + */ > +static inline uint32_t inany_hash32(const struct ctx *c, > + const union inany_addr *key) > +{ > + struct siphash_state st = SIPHASH_INIT(c->hash_secret); > + > + inany_siphash_feed(&st, key); > + > + return (uint32_t)siphash_final(&st, sizeof(*key), 0); > +} > + > +/** > + * fwd_mac_cache_bucket_idx() - Find the table index of an entry > + * @c: Execution context > + * @key: IPv4 or IPv6 address, used as key for the hash lookup > + * @nbuckets: Number of buckets in the table > + * > + * Return: The index found > + */ > +static inline size_t fwd_mac_cache_bucket_idx(const struct ctx *c, > + const union inany_addr *key, > + size_t nbuckets) > +{ > + uint32_t h = inany_hash32(c, key); > + > + return (nbuckets & (nbuckets - 1)) ? (h % nbuckets) : (h & (nbuckets - 1)); I don't think this is worth the complexity: I wouldn't be confident the conditional branch is less expensive than an integer divmod. And if it is, it wouldn't suprise me if the compiler did that optimisation itself. > +} > + > +/** > + * fwd_mac_cache_find() - Find an entry in the ARP/NDP cache table > + * @c: Execution context > + * @key: IPv4 or IPv6 address, used as key for the hash lookup > + * > + * Return: Pointer to the entry on success, NULL on failure. > + */ > +static struct mac_cache_entry *fwd_mac_cache_find(const struct ctx *c, > + const union inany_addr *key) > +{ > + const struct mac_cache_table *t = &mac_cache; > + struct mac_cache_entry *e; > + size_t idx; > + > + idx = fwd_mac_cache_bucket_idx(c, key, t->nbuckets); > + for (e = t->buckets[idx]; e; e = e->next) > + if (inany_equals(&e->key, key)) > + return e; > + return NULL; > +} > + > +/** > + * fwd_mac_cache_add() - Add a new entry to the ARP/NDP cache table > + * @c: Execution context > + * @key: IPv4 or IPv6 address, used as key for the hash lookup > + * @mac: Buffer for Ethernet MAC, left unchanged if not found/usable > + * > + * Return: Pointer to the new entry on success, NULL on failure. > + */ > +static struct mac_cache_entry *fwd_mac_cache_add(const struct ctx *c, > + const union inany_addr *key, > + const unsigned char *mac) > +{ > + struct mac_cache_table *t = &mac_cache; > + size_t idx = fwd_mac_cache_bucket_idx(c, key, t->nbuckets); > + struct mac_cache_entry *e; > + > + e = calloc(1, sizeof(*e)); > + if (!e) > + return NULL; > + e->key = *key; > + memcpy(e->mac, mac, ETH_ALEN); > + e->count = 0; > + e->next = t->buckets[idx]; > + t->buckets[idx] = e; > + > + return e; > +} > + > +/** > + * fwd_neigh_mac_get() - Find a MAC address the ARP/NDP cache table > + * @c: Execution context > + * @addr: IPv4 or IPv6 address > + * @ifi: Interface index > + * @mac: Buffer for Ethernet MAC, left unchanged if not found/usable > + * > + * Return: 0 on success, -1 on failure. > + */ > +int fwd_neigh_mac_get(const struct ctx *c, const union inany_addr *addr, > + int ifi, unsigned char *mac) > +{ > + struct mac_cache_entry *e = fwd_mac_cache_find(c, addr); > + bool refresh = false; > + > + if (e) > + refresh = mac_entry_expired(e); > + else if ((e = fwd_mac_cache_add(c, addr, mac))) > + refresh = true; > + else > + return -1; > + > + if (refresh) { > + nl_neigh_mac_get(nl_sock, addr, ifi, e->mac); > + if (mac_entry_is_dummy(c, e)) > + mac_entry_set_expiry(e, e->count++); > + else > + mac_entry_set_expiry(e, MAC_CACHE_RENEWAL); > + } > + memcpy(mac, e->mac, ETH_ALEN); > + > + return 0; > +} > + > +/** > + * fwd_mac_cache_init() - Initiate ARP/NDP cache table > + * > + * Return: 0 on success, -1 on failure. > + */ > +int fwd_mac_cache_init(void) > +{ > + struct mac_cache_table *t = &mac_cache; > + > + t->nbuckets = MAC_CACHE_BUCKETS; > + t->buckets = calloc(t->nbuckets, sizeof(*t->buckets)); > + > + return t->buckets ? 0 : -1; > +} > + > /** fwd_probe_ephemeral() - Determine what ports this host considers ephemeral > * > * Work out what ports the host thinks are emphemeral and record it for later > diff --git a/fwd.h b/fwd.h > index c8d485d..ebfc114 100644 > --- a/fwd.h > +++ b/fwd.h > @@ -58,4 +58,8 @@ uint8_t fwd_nat_from_host(const struct ctx *c, uint8_t proto, > const struct flowside *ini, struct flowside *tgt); > bool fwd_inany_nat(const struct ctx *c, const union inany_addr *addr); > > +int fwd_neigh_mac_get(const struct ctx *c, const union inany_addr *addr, > + int ifi, unsigned char *mac); > +int fwd_mac_cache_init(void); > + > #endif /* FWD_H */ > diff --git a/ndp.c b/ndp.c > index 19b9b28..af957c9 100644 > --- a/ndp.c > +++ b/ndp.c > @@ -32,7 +32,6 @@ > #include "passt.h" > #include "tap.h" > #include "log.h" > -#include "netlink.h" > > #define RT_LIFETIME 65535 > > @@ -222,7 +221,7 @@ static void ndp_na(const struct ctx *c, const struct in6_addr *dst, > */ > inany_from_af(&tgt, AF_INET6, addr); > if (!fwd_inany_nat(c, &tgt)) > - nl_neigh_mac_get(nl_sock, &tgt, c->ifi6, na.target_l2_addr.mac); > + fwd_neigh_mac_get(c, &tgt, c->ifi6, na.target_l2_addr.mac); > > ndp_send(c, dst, &na, sizeof(na)); > } > diff --git a/tcp.c b/tcp.c > index 28f9ef5..6b5b4ff 100644 > --- a/tcp.c > +++ b/tcp.c > @@ -309,7 +309,6 @@ > #include "tcp_internal.h" > #include "tcp_buf.h" > #include "tcp_vu.h" > -#include "netlink.h" > > #ifndef __USE_MISC > /* From Linux UAPI, missing in netinet/tcp.h provided by musl */ > @@ -1908,7 +1907,7 @@ static void tcp_rst_no_conn(const struct ctx *c, int af, > memcpy(src_mac, c->our_tap_mac, ETH_ALEN); > inany_from_af(&tgt, af, daddr); > if (!fwd_inany_nat(c, &tgt)) > - nl_neigh_mac_get(nl_sock, &tgt, ifi, src_mac); > + fwd_neigh_mac_get(c, &tgt, ifi, src_mac); > > if (af == AF_INET) { > struct iphdr *ip4h = tap_push_l2h(c, buf, src_mac, ETH_P_IP); -- David Gibson (he or they) | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you, not the other way | around. http://www.ozlabs.org/~dgibson