From: David Gibson <david@gibson.dropbear.id.au>
To: Jon Maloy <jmaloy@redhat.com>
Cc: sbrivio@redhat.com, dgibson@redhat.com, passt-dev@passt.top
Subject: Re: [PATCH v4 9/9] fwd: Added cache table for ARP/NDP contents
Date: Thu, 21 Aug 2025 12:03:47 +1000 [thread overview]
Message-ID: <aKZ-gzEpPNGPj93j@zatzit> (raw)
In-Reply-To: <20250820031005.2725591-10-jmaloy@redhat.com>
[-- Attachment #1: Type: text/plain, Size: 12417 bytes --]
On Tue, Aug 19, 2025 at 11:10:05PM -0400, Jon Maloy wrote:
> We add a cache table to keep partial contents of the kernel ARP/NDP
> tables. This way, we drastically reduce the number of netlink calls
> to read those tables.
Do you have data to suggest this is necessary?
It's a lot of code to optimise something only needed for some pretty
uncommon cases.
> We create dummy cache entries representing non-(not-yet)-existing
> ARP/NDP entries when needed. We add a short expiration time to each
> such entry, so that we can know when to make repeated calls to the
> kernel tables in the beginning. We also add an access counter to the
> entries, to ensure that the timer becomes longer and the call frequency
> abates over time if no ARP/NDP entry is created.
>
> For regular entries we use a much longer timer, with the purpose to
> update the entry in the rare case that a remote host changes its
> MAC address.
>
> Signed-off-by: Jon Maloy <jmaloy@redhat.com>
> ---
> arp.c | 3 +-
> conf.c | 2 +
> flow.c | 5 +-
> fwd.c | 206 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
> fwd.h | 4 ++
> ndp.c | 3 +-
> tcp.c | 3 +-
> 7 files changed, 218 insertions(+), 8 deletions(-)
>
> diff --git a/arp.c b/arp.c
> index c37867a..040d4fe 100644
> --- a/arp.c
> +++ b/arp.c
> @@ -29,7 +29,6 @@
> #include "dhcp.h"
> #include "passt.h"
> #include "tap.h"
> -#include "netlink.h"
>
> /**
> * arp() - Check if this is a supported ARP message, reply as needed
> @@ -79,7 +78,7 @@ int arp(const struct ctx *c, const struct pool *p)
> */
> inany_from_af(&tgt, AF_INET, am->tip);
> if (!fwd_inany_nat(c, &tgt))
> - nl_neigh_mac_get(nl_sock, &tgt, c->ifi4, am->sha);
> + fwd_neigh_mac_get(c, &tgt, c->ifi4, am->sha);
>
> memcpy(swap, am->tip, sizeof(am->tip));
> memcpy(am->tip, am->sip, sizeof(am->tip));
> diff --git a/conf.c b/conf.c
> index f47f48e..0abdbf7 100644
> --- a/conf.c
> +++ b/conf.c
> @@ -2122,6 +2122,8 @@ void conf(struct ctx *c, int argc, char **argv)
> c->udp.fwd_out.mode = fwd_default;
>
> fwd_scan_ports_init(c);
> + if (fwd_mac_cache_init())
> + die("Failed to initiate neighnor MAC cache");
"neighnor"
> if (!c->quiet)
> conf_print(c);
> diff --git a/flow.c b/flow.c
> index d7b3fd1..3ca30f1 100644
> --- a/flow.c
> +++ b/flow.c
> @@ -459,7 +459,7 @@ struct flowside *flow_target(const struct ctx *c, union flow *flow,
> */
> ifi = inany_v4(&ini->oaddr) ? c->ifi4 : c->ifi6;
> if (!fwd_inany_nat(c, &ini->oaddr))
> - nl_neigh_mac_get(nl_sock, &ini->oaddr, ifi, f->tap_omac);
> + fwd_neigh_mac_get(c, &ini->oaddr, ifi, f->tap_omac);
> break;
>
> case PIF_SPLICE:
> @@ -474,7 +474,7 @@ struct flowside *flow_target(const struct ctx *c, union flow *flow,
> */
> ifi = inany_v4(&ini->eaddr) ? c->ifi4 : c->ifi6;
> if (!fwd_inany_nat(c, &ini->eaddr))
> - nl_neigh_mac_get(nl_sock, &ini->eaddr, ifi, f->tap_omac);
> + fwd_neigh_mac_get(c, &ini->eaddr, ifi, f->tap_omac);
> break;
>
> default:
> @@ -491,6 +491,7 @@ struct flowside *flow_target(const struct ctx *c, union flow *flow,
>
> f->pif[TGTSIDE] = tgtpif;
> flow_set_state(f, FLOW_STATE_TGT);
> +
> return tgt;
> }
>
> diff --git a/fwd.c b/fwd.c
> index 55bf5f2..6647764 100644
> --- a/fwd.c
> +++ b/fwd.c
> @@ -19,6 +19,8 @@
> #include <sched.h>
> #include <unistd.h>
> #include <stdio.h>
> +#include <time.h>
> +#include <netinet/if_ether.h>
>
> #include "util.h"
> #include "ip.h"
> @@ -26,6 +28,8 @@
> #include "passt.h"
> #include "lineread.h"
> #include "flow_table.h"
> +#include "inany.h"
> +#include "netlink.h"
>
> /* Empheral port range: values from RFC 6335 */
> static in_port_t fwd_ephemeral_min = (1 << 15) + (1 << 14);
> @@ -33,6 +37,208 @@ static in_port_t fwd_ephemeral_max = NUM_PORTS - 1;
>
> #define PORT_RANGE_SYSCTL "/proc/sys/net/ipv4/ip_local_port_range"
>
> +#define MAC_CACHE_BUCKETS 1024
> +#define MAC_CACHE_RENEWAL 3600 /* Refresh entry from ARP/NDP every hour */
> +
> +/* Partial cache of ARP/NDP table contents */
> +struct mac_cache_entry {
> + union inany_addr key;
> + unsigned char mac[ETH_ALEN];
> + struct timespec expiry;
> + uint32_t count;
> + struct mac_cache_entry *next;
> +};
> +
> +struct mac_cache_table {
> + struct mac_cache_entry **buckets;
> + size_t nbuckets;
> +};
> +
> +static struct mac_cache_table mac_cache;
> +
> +/**
> + * timespec_before() - Check the relation between two pints in time
> + * @a: Point in time to be tested
> + * @b: Point in time test a against
> + * Return: True if a comes before b, otherwise b
> + */
> +static inline bool timespec_before(const struct timespec *a,
> + const struct timespec *b)
> +{
> + return (a->tv_sec < b->tv_sec) ||
> + (a->tv_sec == b->tv_sec && a->tv_nsec < b->tv_nsec);
> +}
> +
> +/**
> + * mac_entry_is_dummy() - Check if a cache entry is a placeholder
> + * @c: Execution context
> + * @e: Cache entry
> + *
> + * Return: True if the entry is a placeholder, false otherwise
> + */
> +bool mac_entry_is_dummy(const struct ctx *c, const struct mac_cache_entry *e)
> +{
> + return !memcmp(c->our_tap_mac, e->mac, ETH_ALEN);
> +}
> +
> +/**
> + * mac_entry_expired() - Check if a cache entry has expired
> + * @e: Cache entry
> + *
> + * Return: True if the entry has expired, false otherwise
> + */
> +static bool mac_entry_expired(const struct mac_cache_entry *e)
> +{
> + struct timespec now;
> +
> + clock_gettime(CLOCK_MONOTONIC, &now);
> + return timespec_before(&e->expiry, &now);
> +}
> +
> +/**
> + * mac_entry_set_expiry() - Set the time for a cache entry to expire
> + * @e: Cache entry
> + * @expiry: Expiration time, in seconds from current moment.
> + *
> + * Return: The result of the hash
> + */
> +static void mac_entry_set_expiry(struct mac_cache_entry *e, int expiry)
> +{
> + clock_gettime(CLOCK_MONOTONIC, &e->expiry);
> + e->expiry.tv_sec += expiry;
> +}
> +
> +/**
> + * inany_hash32() - Hash the contenst of key into an integer
> + * @key: IPv4 or IPv6 address, used as key
> + *
> + * Return: The result of the hash
> + */
> +static inline uint32_t inany_hash32(const struct ctx *c,
> + const union inany_addr *key)
> +{
> + struct siphash_state st = SIPHASH_INIT(c->hash_secret);
> +
> + inany_siphash_feed(&st, key);
> +
> + return (uint32_t)siphash_final(&st, sizeof(*key), 0);
> +}
> +
> +/**
> + * fwd_mac_cache_bucket_idx() - Find the table index of an entry
> + * @c: Execution context
> + * @key: IPv4 or IPv6 address, used as key for the hash lookup
> + * @nbuckets: Number of buckets in the table
> + *
> + * Return: The index found
> + */
> +static inline size_t fwd_mac_cache_bucket_idx(const struct ctx *c,
> + const union inany_addr *key,
> + size_t nbuckets)
> +{
> + uint32_t h = inany_hash32(c, key);
> +
> + return (nbuckets & (nbuckets - 1)) ? (h % nbuckets) : (h & (nbuckets - 1));
I don't think this is worth the complexity: I wouldn't be confident
the conditional branch is less expensive than an integer divmod. And
if it is, it wouldn't suprise me if the compiler did that optimisation
itself.
> +}
> +
> +/**
> + * fwd_mac_cache_find() - Find an entry in the ARP/NDP cache table
> + * @c: Execution context
> + * @key: IPv4 or IPv6 address, used as key for the hash lookup
> + *
> + * Return: Pointer to the entry on success, NULL on failure.
> + */
> +static struct mac_cache_entry *fwd_mac_cache_find(const struct ctx *c,
> + const union inany_addr *key)
> +{
> + const struct mac_cache_table *t = &mac_cache;
> + struct mac_cache_entry *e;
> + size_t idx;
> +
> + idx = fwd_mac_cache_bucket_idx(c, key, t->nbuckets);
> + for (e = t->buckets[idx]; e; e = e->next)
> + if (inany_equals(&e->key, key))
> + return e;
> + return NULL;
> +}
> +
> +/**
> + * fwd_mac_cache_add() - Add a new entry to the ARP/NDP cache table
> + * @c: Execution context
> + * @key: IPv4 or IPv6 address, used as key for the hash lookup
> + * @mac: Buffer for Ethernet MAC, left unchanged if not found/usable
> + *
> + * Return: Pointer to the new entry on success, NULL on failure.
> + */
> +static struct mac_cache_entry *fwd_mac_cache_add(const struct ctx *c,
> + const union inany_addr *key,
> + const unsigned char *mac)
> +{
> + struct mac_cache_table *t = &mac_cache;
> + size_t idx = fwd_mac_cache_bucket_idx(c, key, t->nbuckets);
> + struct mac_cache_entry *e;
> +
> + e = calloc(1, sizeof(*e));
> + if (!e)
> + return NULL;
> + e->key = *key;
> + memcpy(e->mac, mac, ETH_ALEN);
> + e->count = 0;
> + e->next = t->buckets[idx];
> + t->buckets[idx] = e;
> +
> + return e;
> +}
> +
> +/**
> + * fwd_neigh_mac_get() - Find a MAC address the ARP/NDP cache table
> + * @c: Execution context
> + * @addr: IPv4 or IPv6 address
> + * @ifi: Interface index
> + * @mac: Buffer for Ethernet MAC, left unchanged if not found/usable
> + *
> + * Return: 0 on success, -1 on failure.
> + */
> +int fwd_neigh_mac_get(const struct ctx *c, const union inany_addr *addr,
> + int ifi, unsigned char *mac)
> +{
> + struct mac_cache_entry *e = fwd_mac_cache_find(c, addr);
> + bool refresh = false;
> +
> + if (e)
> + refresh = mac_entry_expired(e);
> + else if ((e = fwd_mac_cache_add(c, addr, mac)))
> + refresh = true;
> + else
> + return -1;
> +
> + if (refresh) {
> + nl_neigh_mac_get(nl_sock, addr, ifi, e->mac);
> + if (mac_entry_is_dummy(c, e))
> + mac_entry_set_expiry(e, e->count++);
> + else
> + mac_entry_set_expiry(e, MAC_CACHE_RENEWAL);
> + }
> + memcpy(mac, e->mac, ETH_ALEN);
> +
> + return 0;
> +}
> +
> +/**
> + * fwd_mac_cache_init() - Initiate ARP/NDP cache table
> + *
> + * Return: 0 on success, -1 on failure.
> + */
> +int fwd_mac_cache_init(void)
> +{
> + struct mac_cache_table *t = &mac_cache;
> +
> + t->nbuckets = MAC_CACHE_BUCKETS;
> + t->buckets = calloc(t->nbuckets, sizeof(*t->buckets));
> +
> + return t->buckets ? 0 : -1;
> +}
> +
> /** fwd_probe_ephemeral() - Determine what ports this host considers ephemeral
> *
> * Work out what ports the host thinks are emphemeral and record it for later
> diff --git a/fwd.h b/fwd.h
> index c8d485d..ebfc114 100644
> --- a/fwd.h
> +++ b/fwd.h
> @@ -58,4 +58,8 @@ uint8_t fwd_nat_from_host(const struct ctx *c, uint8_t proto,
> const struct flowside *ini, struct flowside *tgt);
> bool fwd_inany_nat(const struct ctx *c, const union inany_addr *addr);
>
> +int fwd_neigh_mac_get(const struct ctx *c, const union inany_addr *addr,
> + int ifi, unsigned char *mac);
> +int fwd_mac_cache_init(void);
> +
> #endif /* FWD_H */
> diff --git a/ndp.c b/ndp.c
> index 19b9b28..af957c9 100644
> --- a/ndp.c
> +++ b/ndp.c
> @@ -32,7 +32,6 @@
> #include "passt.h"
> #include "tap.h"
> #include "log.h"
> -#include "netlink.h"
>
> #define RT_LIFETIME 65535
>
> @@ -222,7 +221,7 @@ static void ndp_na(const struct ctx *c, const struct in6_addr *dst,
> */
> inany_from_af(&tgt, AF_INET6, addr);
> if (!fwd_inany_nat(c, &tgt))
> - nl_neigh_mac_get(nl_sock, &tgt, c->ifi6, na.target_l2_addr.mac);
> + fwd_neigh_mac_get(c, &tgt, c->ifi6, na.target_l2_addr.mac);
>
> ndp_send(c, dst, &na, sizeof(na));
> }
> diff --git a/tcp.c b/tcp.c
> index 28f9ef5..6b5b4ff 100644
> --- a/tcp.c
> +++ b/tcp.c
> @@ -309,7 +309,6 @@
> #include "tcp_internal.h"
> #include "tcp_buf.h"
> #include "tcp_vu.h"
> -#include "netlink.h"
>
> #ifndef __USE_MISC
> /* From Linux UAPI, missing in netinet/tcp.h provided by musl */
> @@ -1908,7 +1907,7 @@ static void tcp_rst_no_conn(const struct ctx *c, int af,
> memcpy(src_mac, c->our_tap_mac, ETH_ALEN);
> inany_from_af(&tgt, af, daddr);
> if (!fwd_inany_nat(c, &tgt))
> - nl_neigh_mac_get(nl_sock, &tgt, ifi, src_mac);
> + fwd_neigh_mac_get(c, &tgt, ifi, src_mac);
>
> if (af == AF_INET) {
> struct iphdr *ip4h = tap_push_l2h(c, buf, src_mac, ETH_P_IP);
--
David Gibson (he or they) | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you, not the other way
| around.
http://www.ozlabs.org/~dgibson
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]
next prev parent reply other threads:[~2025-08-21 2:04 UTC|newest]
Thread overview: 21+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-08-20 3:09 [PATCH v4 0/9] Use true MAC address of LAN local remote hosts Jon Maloy
2025-08-20 3:09 ` [PATCH v4 1/9] netlink: add function to extract MAC addresses from NDP/ARP table Jon Maloy
2025-08-21 0:57 ` David Gibson
2025-08-20 3:09 ` [PATCH v4 2/9] arp/ndp: respond with true MAC address of LAN local remote hosts Jon Maloy
2025-08-21 1:18 ` David Gibson
2025-08-20 3:09 ` [PATCH v4 3/9] flow: add MAC address of LAN local remote hosts to flow Jon Maloy
2025-08-21 1:28 ` David Gibson
2025-08-20 3:10 ` [PATCH v4 4/9] udp: forward external source MAC address through tap interface Jon Maloy
2025-08-21 1:32 ` David Gibson
2025-08-20 3:10 ` [PATCH v4 5/9] tcp: " Jon Maloy
2025-08-21 1:37 ` David Gibson
2025-08-20 3:10 ` [PATCH v4 6/9] tap: change signature of function tap_push_l2h() Jon Maloy
2025-08-21 1:39 ` David Gibson
2025-08-20 3:10 ` [PATCH v4 7/9] tcp: make tcp_rst_no_conn() respond with correct MAC address Jon Maloy
2025-08-21 1:46 ` David Gibson
2025-08-20 3:10 ` [PATCH v4 8/9] icmp: let icmp use mac address from flowside structure Jon Maloy
2025-08-21 1:51 ` David Gibson
2025-08-20 3:10 ` [PATCH v4 9/9] fwd: Added cache table for ARP/NDP contents Jon Maloy
2025-08-21 2:03 ` David Gibson [this message]
2025-08-21 10:53 ` Stefano Brivio
2025-08-25 1:48 ` David Gibson
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=aKZ-gzEpPNGPj93j@zatzit \
--to=david@gibson.dropbear.id.au \
--cc=dgibson@redhat.com \
--cc=jmaloy@redhat.com \
--cc=passt-dev@passt.top \
--cc=sbrivio@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
Code repositories for project(s) associated with this public inbox
https://passt.top/passt
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for IMAP folder(s).