On Tue, Sep 23, 2025 at 09:13:23PM -0400, Jon Maloy wrote: > We add a cache table to keep track of the contents of the kernel ARP > and NDP tables. The table is fed from the just introduced netlink based > neigbour subscription function. The new table eliminates the need for > explicit netlink calls to find a host's MAC address. > > Signed-off-by: Jon Maloy > > --- > v5: - Moved to earlier in series to reduce rebase conflicts > v6: - Sqashed the hash list commit and the FIFO/LRU queue commit > - Removed hash lookup. We now only use linear lookup in a > linked list > - Eliminated dynamic memory allocation. > - Ensured there is only one call to clock_gettime() > - Using MAC_ZERO instead of the previously dedicated definitions > v7: - NOW using MAC_ZERO where needed > - I am still using linear back-off for empty cache entries. Even > an incoming, flow-creating packet from a local host gives no > guarantee that its MAC address is in the ARP table, so we must > allow for a few new attempts at first possible occasions. Only > after several failed lookups can we conclude that we probably > never will succeed. Hence the back-off. > - Fixed a bug that David inadvertently made me aware of: I only > intended to set the initial expiry value to MAC_CACHE_RENEWAL > when an ARP/NDP table lookup was successful. > - Improved struct and function description comments. > v8: - Total re-design of table, adapting to the new, subscription > based way of updating it. > v9: - Catering for MAC address change for an existing host. > --- > conf.c | 1 + > fwd.c | 162 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ > fwd.h | 7 +++ > netlink.c | 15 +++-- > 4 files changed, 180 insertions(+), 5 deletions(-) > > diff --git a/conf.c b/conf.c > index 66b9e63..cc491c3 100644 > --- a/conf.c > +++ b/conf.c > @@ -2133,6 +2133,7 @@ void conf(struct ctx *c, int argc, char **argv) > c->udp.fwd_out.mode = fwd_default; > > fwd_scan_ports_init(c); > + fwd_neigh_cache_init(); > > if (!c->quiet) > conf_print(c); > diff --git a/fwd.c b/fwd.c > index 250cf56..491303d 100644 > --- a/fwd.c > +++ b/fwd.c > @@ -32,6 +32,168 @@ static in_port_t fwd_ephemeral_min = (1 << 15) + (1 << 14); > static in_port_t fwd_ephemeral_max = NUM_PORTS - 1; > > #define PORT_RANGE_SYSCTL "/proc/sys/net/ipv4/ip_local_port_range" > +#define MAC_CACHE_SIZE 1024 Nit: you use "mac cache" some places, "neigh cache" others. > + > +/** > + * mac_cache_entry - Entry in the ARP/NDP table cache > + * @next: Next entry in slot or free list > + * @addr: IP address of represented host > + * @mac: MAC address of represented host > + */ > +struct mac_cache_entry { > + struct mac_cache_entry *next; Hash buckets might be overkill, but they're pretty easy and cheap, so whatever. > + union inany_addr addr; > + uint8_t mac[ETH_ALEN]; > +}; > + > +/** > + * mac_cache_table - Cache of ARP/NDP table contents > + * @entries: Entries to be plugged into the hash slots when allocated > + * @slots: Hash table slots > + * @free: Linked list of unused entries > + */ > +struct mac_cache_table { > + struct mac_cache_entry entries[MAC_CACHE_SIZE]; > + struct mac_cache_entry *slots[MAC_CACHE_SIZE]; There's no inherent reason these two tables need the same size (and indeed some reasons they might not want to have the same size). > + struct mac_cache_entry *free; > +}; > + > +static struct mac_cache_table mac_cache; > + > +/** > + * fwd_mac_cache_slot_idx() - Hash key to a number within the table range > + * @c: Execution context > + * @key: The key to be used for the hash > + * > + * Return: The resulting hash value > + */ > +static inline size_t fwd_mac_cache_slot_idx(const struct ctx *c, > + const union inany_addr *key) Maybe just neigh_hash_bucket()? > +{ > + struct siphash_state st = SIPHASH_INIT(c->hash_secret); > + uint32_t i; > + > + inany_siphash_feed(&st, key); > + i = siphash_final(&st, sizeof(*key), 0); > + > + return ((size_t)i) & (MAC_CACHE_SIZE - 1); This subtly depends on MAC_CACHE_SIZE being a power of 2. If you use (... % MAC_CACHE_SIZE) it will work regardless, and the compiler will still optimize if it _is_ a power of 2. > +} > + > +/** > + * fwd_mac_cache_find() - Find a MAC cache table entry > + * @c: Execution context > + * @addr: Neighbour address to be used as key for the lookup > + * > + * Return: The found entry, if any. Otherwise NULL. > + */ > +static struct mac_cache_entry *fwd_mac_cache_find(const struct ctx *c, > + const union inany_addr *addr) > +{ > + size_t idx = fwd_mac_cache_slot_idx(c, addr); > + struct mac_cache_entry *e = mac_cache.slots[idx]; > + > + while (e && !inany_equals(&e->addr, addr)) > + e = e->next; > + > + return e; > +} > + > +/** > + * fwd_mac_cache_alloc() - Allocate a mac cache table entry from the free list > + * @c: Execution context > + * @addr: Address used to determine insertion slot and store in entry > + * @mac: The MAC address associated with the neighbour address > + */ > +void fwd_neigh_mac_cache_alloc(const struct ctx *c, > + const union inany_addr *addr, uint8_t *mac) This can update as well as allocating. Maybe fwd_neigh_cache_set()? > +{ > + struct mac_cache_table *t = &mac_cache; > + struct mac_cache_entry *e; > + ssize_t idx; > + > + /* MAC address might change sometimes */ > + e = fwd_mac_cache_find(c, addr); > + if (e) { > + memcpy(e->mac, mac, ETH_ALEN); > + return; > + } > + > + e = t->free; > + if (!e) > + return; > + t->free = e->next; > + > + idx = fwd_mac_cache_slot_idx(c, addr); > + e->next = t->slots[idx]; > + t->slots[idx] = e; > + > + memcpy(&e->addr, addr, sizeof(*addr)); > + memcpy(e->mac, mac, ETH_ALEN); > +} > + > +/** > + * fwd_mac_cache_free() - Remove an entry from a slot and add it to free list > + * @c: Execution context > + * @addr: Neighbour address used to find the slot for the entry > + */ > +void fwd_neigh_mac_cache_free(const struct ctx *c, const union inany_addr *addr) > +{ > + ssize_t idx = fwd_mac_cache_slot_idx(c, addr); > + struct mac_cache_table *t = &mac_cache; > + struct mac_cache_entry *e, **prev; > + > + prev = &t->slots[idx]; > + e = t->slots[idx]; > + while (e && !inany_equals(&e->addr, addr)) { > + prev = &e->next; > + e = e->next; > + } > + if (!e) > + return; > + > + *prev = e->next; > + e->next = t->free; > + t->free = e; > + memset(&e->addr, 0, sizeof(*addr)); > + memset(e->mac, 0, ETH_ALEN); > +} > + > +/** > + * fwd_neigh_mac_get() - Lookup MAC address in the ARP/NDP cache table > + * @c: Execution context > + * @addr: Neighbour address used as lookup key > + * @mac: Buffer for Ethernet MAC to return, found or default value. > + * > + * Return: true if real MAC found, false if not found or if failure > + */ > +bool fwd_neigh_mac_get(const struct ctx *c, const union inany_addr *addr, > + uint8_t *mac) > +{ > + struct mac_cache_entry *e = fwd_mac_cache_find(c, addr); > + > + if (e) > + memcpy(mac, e->mac, ETH_ALEN); > + else > + memcpy(mac, c->our_tap_mac, ETH_ALEN); > + > + return !!e; > +} > + > +/** > + * fwd_neigh_cache_init() - Initialize the neighbor ARP/NDP cache table > + */ > +void fwd_neigh_cache_init(void) > +{ > + struct mac_cache_table *t = &mac_cache; > + struct mac_cache_entry *e; > + > + memset(t, 0, sizeof(*t)); > + for (int i = 0; i < MAC_CACHE_SIZE; i++) { > + e = &t->entries[i]; > + e->next = t->free; > + t->free = e; > + } > +} > > /** fwd_probe_ephemeral() - Determine what ports this host considers ephemeral > * > diff --git a/fwd.h b/fwd.h > index 65c7c96..a0e8fbc 100644 > --- a/fwd.h > +++ b/fwd.h > @@ -56,5 +56,12 @@ uint8_t fwd_nat_from_splice(const struct ctx *c, uint8_t proto, > const struct flowside *ini, struct flowside *tgt); > uint8_t fwd_nat_from_host(const struct ctx *c, uint8_t proto, > const struct flowside *ini, struct flowside *tgt); > +void fwd_neigh_mac_cache_alloc(const struct ctx *c, > + const union inany_addr *addr, uint8_t *mac); > +void fwd_neigh_mac_cache_free(const struct ctx *c, > + const union inany_addr *addr); > +bool fwd_neigh_mac_get(const struct ctx *c, const union inany_addr *addr, > + uint8_t *mac); > +void fwd_neigh_cache_init(void); > > #endif /* FWD_H */ > diff --git a/netlink.c b/netlink.c > index 1faf3da..1e1ec43 100644 > --- a/netlink.c > +++ b/netlink.c > @@ -131,6 +131,8 @@ int nl_neigh_subscr_init(struct ctx *c) > */ > void nl_neigh_subscr_handler(struct ctx *c) > { > + union inany_addr addr; > + uint8_t mac[ETH_ALEN]; > struct nlmsghdr *nh; > char buf[NLBUFSIZ]; > ssize_t n; > @@ -183,17 +185,20 @@ void nl_neigh_subscr_handler(struct ctx *c) > dstlen != sizeof(struct in6_addr)) > continue; > > - char abuf[INET6_ADDRSTRLEN]; > + if (!lladdr || lladdr_len != ETH_ALEN) > + continue; > + > + memcpy(mac, lladdr, ETH_ALEN); > > if (dstlen == sizeof(struct in_addr)) > - inet_ntop(AF_INET, dst, abuf, sizeof(abuf)); > + addr = inany_from_v4(*(struct in_addr *) dst); > else > - inet_ntop(AF_INET6, dst, abuf, sizeof(abuf)); > + addr.a6 = *(struct in6_addr *) dst; > > if (nh->nlmsg_type == RTM_NEWNEIGH) > - debug("neigh: NEW %s lladdr_len=%zu", abuf, lladdr_len); > + fwd_neigh_mac_cache_alloc(c, &addr, mac); > else > - debug("neigh: DEL %s", abuf); > + fwd_neigh_mac_cache_free(c, &addr); I think the debug() messages could still be useful. They could move into the functions of course, and might want to be demoted to trace(). -- David Gibson (he or they) | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you, not the other way | around. http://www.ozlabs.org/~dgibson