public inbox for passt-dev@passt.top
 help / color / mirror / code / Atom feed
From: David Gibson <david@gibson.dropbear.id.au>
To: Jon Maloy <jmaloy@redhat.com>
Cc: sbrivio@redhat.com, dgibson@redhat.com, passt-dev@passt.top
Subject: Re: [PATCH v4 9/9] fwd: Added cache table for ARP/NDP contents
Date: Thu, 21 Aug 2025 12:03:47 +1000	[thread overview]
Message-ID: <aKZ-gzEpPNGPj93j@zatzit> (raw)
In-Reply-To: <20250820031005.2725591-10-jmaloy@redhat.com>

[-- Attachment #1: Type: text/plain, Size: 12417 bytes --]

On Tue, Aug 19, 2025 at 11:10:05PM -0400, Jon Maloy wrote:
> We add a cache table to keep partial contents of the kernel ARP/NDP
> tables. This way, we drastically reduce the number of netlink calls
> to read those tables.

Do you have data to suggest this is necessary?

It's a lot of code to optimise something only needed for some pretty
uncommon cases.

> We create dummy cache entries representing non-(not-yet)-existing
> ARP/NDP entries when needed. We add a short expiration time to each
> such entry, so that we can know when to make repeated calls to the
> kernel tables in the beginning. We also add an access counter to the
> entries, to ensure that the timer becomes longer and the call frequency
> abates over time if no ARP/NDP entry is created.
> 
> For regular entries we use a much longer timer, with the purpose to
> update the entry in the rare case that a remote host changes its
> MAC address.
> 
> Signed-off-by: Jon Maloy <jmaloy@redhat.com>
> ---
>  arp.c  |   3 +-
>  conf.c |   2 +
>  flow.c |   5 +-
>  fwd.c  | 206 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  fwd.h  |   4 ++
>  ndp.c  |   3 +-
>  tcp.c  |   3 +-
>  7 files changed, 218 insertions(+), 8 deletions(-)
> 
> diff --git a/arp.c b/arp.c
> index c37867a..040d4fe 100644
> --- a/arp.c
> +++ b/arp.c
> @@ -29,7 +29,6 @@
>  #include "dhcp.h"
>  #include "passt.h"
>  #include "tap.h"
> -#include "netlink.h"
>  
>  /**
>   * arp() - Check if this is a supported ARP message, reply as needed
> @@ -79,7 +78,7 @@ int arp(const struct ctx *c, const struct pool *p)
>  	 */
>  	inany_from_af(&tgt, AF_INET, am->tip);
>  	if (!fwd_inany_nat(c, &tgt))
> -		nl_neigh_mac_get(nl_sock, &tgt, c->ifi4, am->sha);
> +		fwd_neigh_mac_get(c, &tgt, c->ifi4, am->sha);
>  
>  	memcpy(swap,		am->tip,	sizeof(am->tip));
>  	memcpy(am->tip,		am->sip,	sizeof(am->tip));
> diff --git a/conf.c b/conf.c
> index f47f48e..0abdbf7 100644
> --- a/conf.c
> +++ b/conf.c
> @@ -2122,6 +2122,8 @@ void conf(struct ctx *c, int argc, char **argv)
>  		c->udp.fwd_out.mode = fwd_default;
>  
>  	fwd_scan_ports_init(c);
> +	if (fwd_mac_cache_init())
> +		die("Failed to initiate neighnor MAC cache");

"neighnor"

>  	if (!c->quiet)
>  		conf_print(c);
> diff --git a/flow.c b/flow.c
> index d7b3fd1..3ca30f1 100644
> --- a/flow.c
> +++ b/flow.c
> @@ -459,7 +459,7 @@ struct flowside *flow_target(const struct ctx *c, union flow *flow,
>  		 */
>  		ifi = inany_v4(&ini->oaddr) ? c->ifi4 : c->ifi6;
>  		if (!fwd_inany_nat(c, &ini->oaddr))
> -			nl_neigh_mac_get(nl_sock, &ini->oaddr, ifi, f->tap_omac);
> +			fwd_neigh_mac_get(c, &ini->oaddr, ifi, f->tap_omac);
>  		break;
>  
>  	case PIF_SPLICE:
> @@ -474,7 +474,7 @@ struct flowside *flow_target(const struct ctx *c, union flow *flow,
>  		 */
>  		ifi = inany_v4(&ini->eaddr) ? c->ifi4 : c->ifi6;
>  		if (!fwd_inany_nat(c, &ini->eaddr))
> -			nl_neigh_mac_get(nl_sock, &ini->eaddr, ifi, f->tap_omac);
> +			fwd_neigh_mac_get(c, &ini->eaddr, ifi, f->tap_omac);
>  		break;
>  
>  	default:
> @@ -491,6 +491,7 @@ struct flowside *flow_target(const struct ctx *c, union flow *flow,
>  
>  	f->pif[TGTSIDE] = tgtpif;
>  	flow_set_state(f, FLOW_STATE_TGT);
> +
>  	return tgt;
>  }
>  
> diff --git a/fwd.c b/fwd.c
> index 55bf5f2..6647764 100644
> --- a/fwd.c
> +++ b/fwd.c
> @@ -19,6 +19,8 @@
>  #include <sched.h>
>  #include <unistd.h>
>  #include <stdio.h>
> +#include <time.h>
> +#include <netinet/if_ether.h>
>  
>  #include "util.h"
>  #include "ip.h"
> @@ -26,6 +28,8 @@
>  #include "passt.h"
>  #include "lineread.h"
>  #include "flow_table.h"
> +#include "inany.h"
> +#include "netlink.h"
>  
>  /* Empheral port range: values from RFC 6335 */
>  static in_port_t fwd_ephemeral_min = (1 << 15) + (1 << 14);
> @@ -33,6 +37,208 @@ static in_port_t fwd_ephemeral_max = NUM_PORTS - 1;
>  
>  #define PORT_RANGE_SYSCTL	"/proc/sys/net/ipv4/ip_local_port_range"
>  
> +#define MAC_CACHE_BUCKETS 1024
> +#define MAC_CACHE_RENEWAL 3600  /* Refresh entry from ARP/NDP every hour */
> +
> +/* Partial cache of ARP/NDP table contents */
> +struct mac_cache_entry {
> +	union inany_addr key;
> +	unsigned char mac[ETH_ALEN];
> +	struct timespec expiry;
> +	uint32_t count;
> +	struct mac_cache_entry *next;
> +};
> +
> +struct mac_cache_table {
> +	struct mac_cache_entry **buckets;
> +	size_t nbuckets;
> +};
> +
> +static struct mac_cache_table mac_cache;
> +
> +/**
> + * timespec_before() - Check the relation between two pints in time
> + * @a:		Point in time to be tested
> + * @b:		Point in time test a against
> + * Return: True if a comes before b, otherwise b
> + */
> +static inline bool timespec_before(const struct timespec *a,
> +				   const struct timespec *b)
> +{
> +	return (a->tv_sec < b->tv_sec) ||
> +	       (a->tv_sec == b->tv_sec && a->tv_nsec < b->tv_nsec);
> +}
> +
> +/**
> + * mac_entry_is_dummy() - Check if a cache entry is a placeholder
> + * @c:		Execution context
> + * @e:		Cache entry
> + *
> + * Return: True if the entry is a placeholder, false otherwise
> + */
> +bool mac_entry_is_dummy(const struct ctx *c, const struct mac_cache_entry *e)
> +{
> +	return !memcmp(c->our_tap_mac, e->mac, ETH_ALEN);
> +}
> +
> +/**
> + * mac_entry_expired() - Check if a cache entry has expired
> + * @e:		Cache entry
> + *
> + * Return: True if the entry has expired, false otherwise
> + */
> +static bool mac_entry_expired(const struct mac_cache_entry *e)
> +{
> +	struct timespec now;
> +
> +	clock_gettime(CLOCK_MONOTONIC, &now);
> +	return timespec_before(&e->expiry, &now);
> +}
> +
> +/**
> + * mac_entry_set_expiry() - Set the time for a cache entry to expire
> + * @e:		Cache entry
> + * @expiry:	Expiration time, in seconds from current moment.
> + *
> + * Return: The result of the hash
> + */
> +static void mac_entry_set_expiry(struct mac_cache_entry *e, int expiry)
> +{
> +	clock_gettime(CLOCK_MONOTONIC, &e->expiry);
> +	e->expiry.tv_sec += expiry;
> +}
> +
> +/**
> + * inany_hash32() - Hash the contenst of key into an integer
> + * @key:	IPv4 or IPv6 address, used as key
> + *
> + * Return: The result of the hash
> + */
> +static inline uint32_t inany_hash32(const struct ctx *c,
> +				    const union inany_addr *key)
> +{
> +	struct siphash_state st = SIPHASH_INIT(c->hash_secret);
> +
> +	inany_siphash_feed(&st, key);
> +
> +	return (uint32_t)siphash_final(&st, sizeof(*key), 0);
> +}
> +
> +/**
> + * fwd_mac_cache_bucket_idx() - Find the table index of an entry
> + * @c:		Execution context
> + * @key:	IPv4 or IPv6 address, used as key for the hash lookup
> + * @nbuckets:	Number of buckets in the table
> + *
> + * Return: The index found
> + */
> +static inline size_t fwd_mac_cache_bucket_idx(const struct ctx *c,
> +					      const union inany_addr *key,
> +					      size_t nbuckets)
> +{
> +	uint32_t h = inany_hash32(c, key);
> +
> +	return (nbuckets & (nbuckets - 1)) ? (h % nbuckets) : (h & (nbuckets - 1));

I don't think this is worth the complexity: I wouldn't be confident
the conditional branch is less expensive than an integer divmod.  And
if it is, it wouldn't suprise me if the compiler did that optimisation
itself.

> +}
> +
> +/**
> + * fwd_mac_cache_find() - Find an entry in the ARP/NDP cache table
> + * @c:		Execution context
> + * @key:	IPv4 or IPv6 address, used as key for the hash lookup
> + *
> + * Return: Pointer to the entry on success, NULL on failure.
> + */
> +static struct mac_cache_entry *fwd_mac_cache_find(const struct ctx *c,
> +						  const union inany_addr *key)
> +{
> +	const struct mac_cache_table *t = &mac_cache;
> +	struct mac_cache_entry *e;
> +	size_t idx;
> +
> +	idx = fwd_mac_cache_bucket_idx(c, key, t->nbuckets);
> +	for (e = t->buckets[idx]; e; e = e->next)
> +		if (inany_equals(&e->key, key))
> +			return e;
> +	return NULL;
> +}
> +
> +/**
> + * fwd_mac_cache_add() - Add a new entry to the ARP/NDP cache table
> + * @c:		Execution context
> + * @key:	IPv4 or IPv6 address, used as key for the hash lookup
> + * @mac:	Buffer for Ethernet MAC, left unchanged if not found/usable
> + *
> + * Return: Pointer to the new entry on success, NULL on failure.
> + */
> +static struct mac_cache_entry *fwd_mac_cache_add(const struct ctx *c,
> +						 const union inany_addr *key,
> +						 const unsigned char *mac)
> +{
> +	struct mac_cache_table *t = &mac_cache;
> +	size_t idx = fwd_mac_cache_bucket_idx(c, key, t->nbuckets);
> +	struct mac_cache_entry *e;
> +
> +	e = calloc(1, sizeof(*e));
> +	if (!e)
> +		return NULL;
> +	e->key = *key;
> +	memcpy(e->mac, mac, ETH_ALEN);
> +	e->count = 0;
> +	e->next = t->buckets[idx];
> +	t->buckets[idx] = e;
> +
> +	return e;
> +}
> +
> +/**
> + * fwd_neigh_mac_get() - Find a MAC address the ARP/NDP cache table
> + * @c:		Execution context
> + * @addr:	IPv4 or IPv6 address
> + * @ifi:	Interface index
> + * @mac:	Buffer for Ethernet MAC, left unchanged if not found/usable
> + *
> + * Return: 0 on success, -1 on failure.
> + */
> +int fwd_neigh_mac_get(const struct ctx *c, const union inany_addr *addr,
> +		      int ifi, unsigned char *mac)
> +{
> +	struct mac_cache_entry *e = fwd_mac_cache_find(c, addr);
> +	bool refresh = false;
> +
> +	if (e)
> +		refresh = mac_entry_expired(e);
> +	else if ((e = fwd_mac_cache_add(c, addr, mac)))
> +		refresh = true;
> +	else
> +		return -1;
> +
> +	if (refresh) {
> +		nl_neigh_mac_get(nl_sock, addr, ifi, e->mac);
> +		if (mac_entry_is_dummy(c, e))
> +			mac_entry_set_expiry(e, e->count++);
> +		else
> +			mac_entry_set_expiry(e, MAC_CACHE_RENEWAL);
> +	}
> +	memcpy(mac, e->mac, ETH_ALEN);
> +
> +	return 0;
> +}
> +
> +/**
> + * fwd_mac_cache_init() - Initiate ARP/NDP cache table
> + *
> + * Return: 0 on success, -1 on failure.
> + */
> +int fwd_mac_cache_init(void)
> +{
> +	struct mac_cache_table *t = &mac_cache;
> +
> +	t->nbuckets = MAC_CACHE_BUCKETS;
> +	t->buckets = calloc(t->nbuckets, sizeof(*t->buckets));
> +
> +	return t->buckets ? 0 : -1;
> +}
> +
>  /** fwd_probe_ephemeral() - Determine what ports this host considers ephemeral
>   *
>   * Work out what ports the host thinks are emphemeral and record it for later
> diff --git a/fwd.h b/fwd.h
> index c8d485d..ebfc114 100644
> --- a/fwd.h
> +++ b/fwd.h
> @@ -58,4 +58,8 @@ uint8_t fwd_nat_from_host(const struct ctx *c, uint8_t proto,
>  			  const struct flowside *ini, struct flowside *tgt);
>  bool fwd_inany_nat(const struct ctx *c, const union inany_addr *addr);
>  
> +int fwd_neigh_mac_get(const struct ctx *c, const union inany_addr *addr,
> +		      int ifi, unsigned char *mac);
> +int fwd_mac_cache_init(void);
> +
>  #endif /* FWD_H */
> diff --git a/ndp.c b/ndp.c
> index 19b9b28..af957c9 100644
> --- a/ndp.c
> +++ b/ndp.c
> @@ -32,7 +32,6 @@
>  #include "passt.h"
>  #include "tap.h"
>  #include "log.h"
> -#include "netlink.h"
>  
>  #define	RT_LIFETIME	65535
>  
> @@ -222,7 +221,7 @@ static void ndp_na(const struct ctx *c, const struct in6_addr *dst,
>  	 */
>  	inany_from_af(&tgt, AF_INET6, addr);
>  	if (!fwd_inany_nat(c, &tgt))
> -		nl_neigh_mac_get(nl_sock, &tgt, c->ifi6, na.target_l2_addr.mac);
> +		fwd_neigh_mac_get(c, &tgt, c->ifi6, na.target_l2_addr.mac);
>  
>  	ndp_send(c, dst, &na, sizeof(na));
>  }
> diff --git a/tcp.c b/tcp.c
> index 28f9ef5..6b5b4ff 100644
> --- a/tcp.c
> +++ b/tcp.c
> @@ -309,7 +309,6 @@
>  #include "tcp_internal.h"
>  #include "tcp_buf.h"
>  #include "tcp_vu.h"
> -#include "netlink.h"
>  
>  #ifndef __USE_MISC
>  /* From Linux UAPI, missing in netinet/tcp.h provided by musl */
> @@ -1908,7 +1907,7 @@ static void tcp_rst_no_conn(const struct ctx *c, int af,
>  	memcpy(src_mac, c->our_tap_mac, ETH_ALEN);
>  	inany_from_af(&tgt, af, daddr);
>  	if (!fwd_inany_nat(c, &tgt))
> -		nl_neigh_mac_get(nl_sock, &tgt, ifi, src_mac);
> +		fwd_neigh_mac_get(c, &tgt, ifi, src_mac);
>  
>  	if (af == AF_INET) {
>  		struct iphdr *ip4h = tap_push_l2h(c, buf, src_mac, ETH_P_IP);

-- 
David Gibson (he or they)	| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you, not the other way
				| around.
http://www.ozlabs.org/~dgibson

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

  reply	other threads:[~2025-08-21  2:04 UTC|newest]

Thread overview: 21+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-08-20  3:09 [PATCH v4 0/9] Use true MAC address of LAN local remote hosts Jon Maloy
2025-08-20  3:09 ` [PATCH v4 1/9] netlink: add function to extract MAC addresses from NDP/ARP table Jon Maloy
2025-08-21  0:57   ` David Gibson
2025-08-20  3:09 ` [PATCH v4 2/9] arp/ndp: respond with true MAC address of LAN local remote hosts Jon Maloy
2025-08-21  1:18   ` David Gibson
2025-08-20  3:09 ` [PATCH v4 3/9] flow: add MAC address of LAN local remote hosts to flow Jon Maloy
2025-08-21  1:28   ` David Gibson
2025-08-20  3:10 ` [PATCH v4 4/9] udp: forward external source MAC address through tap interface Jon Maloy
2025-08-21  1:32   ` David Gibson
2025-08-20  3:10 ` [PATCH v4 5/9] tcp: " Jon Maloy
2025-08-21  1:37   ` David Gibson
2025-08-20  3:10 ` [PATCH v4 6/9] tap: change signature of function tap_push_l2h() Jon Maloy
2025-08-21  1:39   ` David Gibson
2025-08-20  3:10 ` [PATCH v4 7/9] tcp: make tcp_rst_no_conn() respond with correct MAC address Jon Maloy
2025-08-21  1:46   ` David Gibson
2025-08-20  3:10 ` [PATCH v4 8/9] icmp: let icmp use mac address from flowside structure Jon Maloy
2025-08-21  1:51   ` David Gibson
2025-08-20  3:10 ` [PATCH v4 9/9] fwd: Added cache table for ARP/NDP contents Jon Maloy
2025-08-21  2:03   ` David Gibson [this message]
2025-08-21 10:53     ` Stefano Brivio
2025-08-25  1:48       ` David Gibson

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=aKZ-gzEpPNGPj93j@zatzit \
    --to=david@gibson.dropbear.id.au \
    --cc=dgibson@redhat.com \
    --cc=jmaloy@redhat.com \
    --cc=passt-dev@passt.top \
    --cc=sbrivio@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this public inbox

	https://passt.top/passt

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for IMAP folder(s).