* [PATCH v5 01/10] netlink: add function to extract MAC addresses from NDP/ARP table
2025-09-06 2:11 [PATCH v5 00/10] Use true MAC address of LAN local remote hosts Jon Maloy
@ 2025-09-06 2:11 ` Jon Maloy
2025-09-08 2:12 ` David Gibson
2025-09-06 2:11 ` [PATCH v5 02/10] fwd: Added cache table for ARP/NDP contents Jon Maloy
` (8 subsequent siblings)
9 siblings, 1 reply; 24+ messages in thread
From: Jon Maloy @ 2025-09-06 2:11 UTC (permalink / raw)
To: sbrivio, dgibson, david, jmaloy, passt-dev
The solution to bug https://bugs.passt.top/show_bug.cgi?id=120
requires the ability to translate from an IP address to its
corresponding MAC address in cases where those are present in
the ARP/NDP table.
We add this feature here.
Signed-off-by: Jon Maloy <jmaloy@redhat.com>
---
v3: - Added an attribute contianing NDA_DST to sent message, so
that we let the kernel do the filtering of the IP address
and return only one entry.
- Added interface index to the call signature. Since the only
interface we know is the template interface, this limits
the number of hosts that will be seen as 'network segment
local' from a PASST viewpoint.
v4: - Made loop independent of attribute order.
- Ignoring L2 addresses which are not of size ETH_ALEN.
v5: - Changed return value of new function, so caller can know if
a MAC address really was found.
---
netlink.c | 84 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
netlink.h | 2 ++
2 files changed, 86 insertions(+)
diff --git a/netlink.c b/netlink.c
index 8f82e73..1ca2c9a 100644
--- a/netlink.c
+++ b/netlink.c
@@ -800,6 +800,90 @@ int nl_addr_get(int s, unsigned int ifi, sa_family_t af,
return status;
}
+/**
+ * nl_neigh_mac_get() - Get neighbor MAC address from the kernel neigh table
+ * @s: Netlink socket fd
+ * @addr: IPv4 or IPv6 address
+ * @ifi: Interface index
+ * @mac: Buffer for Ethernet MAC, left unchanged if not found/usable
+ *
+ * Return: true if a valid address was found, false otherwise.
+ */
+bool nl_neigh_mac_get(int s, const union inany_addr *addr,
+ int ifi, unsigned char *mac)
+{
+ const void *ip = inany_v4(addr);
+ struct req_t {
+ struct nlmsghdr nlh;
+ struct ndmsg ndm;
+ struct rtattr rta;
+ char ip[RTA_ALIGN(sizeof(struct in6_addr))];
+ } req;
+ struct nlmsghdr *nh;
+ char buf[NLBUFSIZ];
+ bool found = false;
+ ssize_t status;
+ uint32_t seq;
+ int msglen;
+ int iplen;
+
+ memset(&req, 0, sizeof(req));
+ req.ndm.ndm_ifindex = ifi;
+ req.rta.rta_type = NDA_DST;
+
+ if (ip) {
+ req.ndm.ndm_family = AF_INET;
+ iplen = sizeof(struct in_addr);
+ } else {
+ req.ndm.ndm_family = AF_INET6;
+ ip = &addr;
+ iplen = sizeof(struct in6_addr);
+ }
+
+ req.rta.rta_len = RTA_LENGTH(iplen);
+ memcpy(RTA_DATA(&req.rta), ip, iplen);
+ msglen = NLMSG_ALIGN(sizeof(req.nlh) + sizeof(req.ndm) + RTA_LENGTH(iplen));
+ seq = nl_send(s, &req, RTM_GETNEIGH, 0, msglen);
+
+ /* Drain all RTM_NEWNEIGH replies for this seq */
+ nl_foreach_oftype(nh, status, s, buf, seq, RTM_NEWNEIGH) {
+ struct ndmsg *ndm = NLMSG_DATA(nh);
+ struct rtattr *rta = (struct rtattr *)(ndm + 1);
+ const uint8_t *lladdr = NULL;
+ size_t na = RTM_PAYLOAD(nh);
+ const void *dst = NULL;
+ size_t lladdr_len = 0;
+ size_t dstlen = 0;
+
+ for (; RTA_OK(rta, na); rta = RTA_NEXT(rta, na)) {
+ switch (rta->rta_type) {
+ case NDA_DST:
+ dst = RTA_DATA(rta);
+ dstlen = RTA_PAYLOAD(rta);
+ break;
+ case NDA_LLADDR:
+ lladdr = RTA_DATA(rta);
+ lladdr_len = RTA_PAYLOAD(rta);
+ break;
+ default:
+ break;
+ }
+ }
+
+ if (dst && dstlen == (size_t)iplen && memcmp(dst, ip, iplen) == 0) {
+ /* Only copy Ethernet-style addresses; leave unchanged otherwise */
+ if (lladdr && lladdr_len == ETH_ALEN) {
+ memcpy(mac, lladdr, ETH_ALEN);
+ found = true;
+ }
+ }
+ }
+ if (status < 0)
+ warn("netlink: RTM_NEWNEIGH failed: %s", strerror_(-status));
+
+ return found;
+}
+
/**
* nl_addr_get_ll() - Get first IPv6 link-local address for a given interface
* @s: Netlink socket
diff --git a/netlink.h b/netlink.h
index b51e99c..1dbe1db 100644
--- a/netlink.h
+++ b/netlink.h
@@ -17,6 +17,8 @@ int nl_route_dup(int s_src, unsigned int ifi_src,
int s_dst, unsigned int ifi_dst, sa_family_t af);
int nl_addr_get(int s, unsigned int ifi, sa_family_t af,
void *addr, int *prefix_len, void *addr_l);
+bool nl_neigh_mac_get(int s, const union inany_addr *addr, int ifi,
+ unsigned char *mac);
int nl_addr_set(int s, unsigned int ifi, sa_family_t af,
const void *addr, int prefix_len);
int nl_addr_get_ll(int s, unsigned int ifi, struct in6_addr *addr);
--
@@ -17,6 +17,8 @@ int nl_route_dup(int s_src, unsigned int ifi_src,
int s_dst, unsigned int ifi_dst, sa_family_t af);
int nl_addr_get(int s, unsigned int ifi, sa_family_t af,
void *addr, int *prefix_len, void *addr_l);
+bool nl_neigh_mac_get(int s, const union inany_addr *addr, int ifi,
+ unsigned char *mac);
int nl_addr_set(int s, unsigned int ifi, sa_family_t af,
const void *addr, int prefix_len);
int nl_addr_get_ll(int s, unsigned int ifi, struct in6_addr *addr);
--
2.50.1
^ permalink raw reply related [flat|nested] 24+ messages in thread
* Re: [PATCH v5 01/10] netlink: add function to extract MAC addresses from NDP/ARP table
2025-09-06 2:11 ` [PATCH v5 01/10] netlink: add function to extract MAC addresses from NDP/ARP table Jon Maloy
@ 2025-09-08 2:12 ` David Gibson
0 siblings, 0 replies; 24+ messages in thread
From: David Gibson @ 2025-09-08 2:12 UTC (permalink / raw)
To: Jon Maloy; +Cc: sbrivio, dgibson, passt-dev
[-- Attachment #1: Type: text/plain, Size: 4873 bytes --]
On Fri, Sep 05, 2025 at 10:11:45PM -0400, Jon Maloy wrote:
> The solution to bug https://bugs.passt.top/show_bug.cgi?id=120
> requires the ability to translate from an IP address to its
> corresponding MAC address in cases where those are present in
> the ARP/NDP table.
>
> We add this feature here.
>
> Signed-off-by: Jon Maloy <jmaloy@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
> ---
> v3: - Added an attribute contianing NDA_DST to sent message, so
> that we let the kernel do the filtering of the IP address
> and return only one entry.
> - Added interface index to the call signature. Since the only
> interface we know is the template interface, this limits
> the number of hosts that will be seen as 'network segment
> local' from a PASST viewpoint.
> v4: - Made loop independent of attribute order.
> - Ignoring L2 addresses which are not of size ETH_ALEN.
> v5: - Changed return value of new function, so caller can know if
> a MAC address really was found.
> ---
> netlink.c | 84 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
> netlink.h | 2 ++
> 2 files changed, 86 insertions(+)
>
> diff --git a/netlink.c b/netlink.c
> index 8f82e73..1ca2c9a 100644
> --- a/netlink.c
> +++ b/netlink.c
> @@ -800,6 +800,90 @@ int nl_addr_get(int s, unsigned int ifi, sa_family_t af,
> return status;
> }
>
> +/**
> + * nl_neigh_mac_get() - Get neighbor MAC address from the kernel neigh table
> + * @s: Netlink socket fd
> + * @addr: IPv4 or IPv6 address
> + * @ifi: Interface index
> + * @mac: Buffer for Ethernet MAC, left unchanged if not found/usable
> + *
> + * Return: true if a valid address was found, false otherwise.
> + */
> +bool nl_neigh_mac_get(int s, const union inany_addr *addr,
> + int ifi, unsigned char *mac)
> +{
> + const void *ip = inany_v4(addr);
> + struct req_t {
> + struct nlmsghdr nlh;
> + struct ndmsg ndm;
> + struct rtattr rta;
> + char ip[RTA_ALIGN(sizeof(struct in6_addr))];
> + } req;
> + struct nlmsghdr *nh;
> + char buf[NLBUFSIZ];
> + bool found = false;
> + ssize_t status;
> + uint32_t seq;
> + int msglen;
> + int iplen;
> +
> + memset(&req, 0, sizeof(req));
> + req.ndm.ndm_ifindex = ifi;
> + req.rta.rta_type = NDA_DST;
> +
> + if (ip) {
> + req.ndm.ndm_family = AF_INET;
> + iplen = sizeof(struct in_addr);
> + } else {
> + req.ndm.ndm_family = AF_INET6;
> + ip = &addr;
> + iplen = sizeof(struct in6_addr);
> + }
> +
> + req.rta.rta_len = RTA_LENGTH(iplen);
> + memcpy(RTA_DATA(&req.rta), ip, iplen);
> + msglen = NLMSG_ALIGN(sizeof(req.nlh) + sizeof(req.ndm) + RTA_LENGTH(iplen));
> + seq = nl_send(s, &req, RTM_GETNEIGH, 0, msglen);
> +
> + /* Drain all RTM_NEWNEIGH replies for this seq */
> + nl_foreach_oftype(nh, status, s, buf, seq, RTM_NEWNEIGH) {
> + struct ndmsg *ndm = NLMSG_DATA(nh);
> + struct rtattr *rta = (struct rtattr *)(ndm + 1);
> + const uint8_t *lladdr = NULL;
> + size_t na = RTM_PAYLOAD(nh);
> + const void *dst = NULL;
> + size_t lladdr_len = 0;
> + size_t dstlen = 0;
> +
> + for (; RTA_OK(rta, na); rta = RTA_NEXT(rta, na)) {
> + switch (rta->rta_type) {
> + case NDA_DST:
> + dst = RTA_DATA(rta);
> + dstlen = RTA_PAYLOAD(rta);
> + break;
> + case NDA_LLADDR:
> + lladdr = RTA_DATA(rta);
> + lladdr_len = RTA_PAYLOAD(rta);
> + break;
> + default:
> + break;
> + }
> + }
> +
> + if (dst && dstlen == (size_t)iplen && memcmp(dst, ip, iplen) == 0) {
> + /* Only copy Ethernet-style addresses; leave unchanged otherwise */
> + if (lladdr && lladdr_len == ETH_ALEN) {
> + memcpy(mac, lladdr, ETH_ALEN);
> + found = true;
> + }
> + }
> + }
> + if (status < 0)
> + warn("netlink: RTM_NEWNEIGH failed: %s", strerror_(-status));
> +
> + return found;
> +}
> +
> /**
> * nl_addr_get_ll() - Get first IPv6 link-local address for a given interface
> * @s: Netlink socket
> diff --git a/netlink.h b/netlink.h
> index b51e99c..1dbe1db 100644
> --- a/netlink.h
> +++ b/netlink.h
> @@ -17,6 +17,8 @@ int nl_route_dup(int s_src, unsigned int ifi_src,
> int s_dst, unsigned int ifi_dst, sa_family_t af);
> int nl_addr_get(int s, unsigned int ifi, sa_family_t af,
> void *addr, int *prefix_len, void *addr_l);
> +bool nl_neigh_mac_get(int s, const union inany_addr *addr, int ifi,
> + unsigned char *mac);
> int nl_addr_set(int s, unsigned int ifi, sa_family_t af,
> const void *addr, int prefix_len);
> int nl_addr_get_ll(int s, unsigned int ifi, struct in6_addr *addr);
> --
> 2.50.1
>
--
David Gibson (he or they) | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you, not the other way
| around.
http://www.ozlabs.org/~dgibson
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]
^ permalink raw reply [flat|nested] 24+ messages in thread
* [PATCH v5 02/10] fwd: Added cache table for ARP/NDP contents
2025-09-06 2:11 [PATCH v5 00/10] Use true MAC address of LAN local remote hosts Jon Maloy
2025-09-06 2:11 ` [PATCH v5 01/10] netlink: add function to extract MAC addresses from NDP/ARP table Jon Maloy
@ 2025-09-06 2:11 ` Jon Maloy
2025-09-08 2:42 ` David Gibson
2025-09-08 9:57 ` David Gibson
2025-09-06 2:11 ` [PATCH v5 03/10] fwd: Add entries of ARP/NDP cache table to a FIFO/LRU queue Jon Maloy
` (7 subsequent siblings)
9 siblings, 2 replies; 24+ messages in thread
From: Jon Maloy @ 2025-09-06 2:11 UTC (permalink / raw)
To: sbrivio, dgibson, david, jmaloy, passt-dev
We add a cache table to keep partial contents of the kernel ARP/NDP
tables. This way, we drastically reduce the number of netlink calls
to read those tables.
We create placeholder cache entries representing non- or not-yet-
existing ARP/NDP entries when needed. We add a short expiration time
to each such entry, so that we can know when to make repeated calls to
the kernel tables in the beginning. We also add an access counter to the
entries, to ensure that the timer becomes longer and the call frequency
abates over time if no ARP/NDP entry shows up.
For regular entries we use a much longer timer, with the purpose to
update the entry in the rare case that a remote host changes its
MAC address.
Signed-off-by: Jon Maloy <jmaloy@redhat.com>
---
v5: - Moved to earlier in series to reduce rebase conflicts
---
conf.c | 2 +
fwd.c | 206 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
fwd.h | 11 +++
netlink.c | 2 -
4 files changed, 219 insertions(+), 2 deletions(-)
diff --git a/conf.c b/conf.c
index f47f48e..11d9d63 100644
--- a/conf.c
+++ b/conf.c
@@ -2122,6 +2122,8 @@ void conf(struct ctx *c, int argc, char **argv)
c->udp.fwd_out.mode = fwd_default;
fwd_scan_ports_init(c);
+ if (fwd_mac_cache_init())
+ die("Failed to initiate neighbour MAC cache");
if (!c->quiet)
conf_print(c);
diff --git a/fwd.c b/fwd.c
index 250cf56..236e58e 100644
--- a/fwd.c
+++ b/fwd.c
@@ -19,6 +19,8 @@
#include <sched.h>
#include <unistd.h>
#include <stdio.h>
+#include <time.h>
+#include <netinet/if_ether.h>
#include "util.h"
#include "ip.h"
@@ -26,6 +28,8 @@
#include "passt.h"
#include "lineread.h"
#include "flow_table.h"
+#include "inany.h"
+#include "netlink.h"
/* Empheral port range: values from RFC 6335 */
static in_port_t fwd_ephemeral_min = (1 << 15) + (1 << 14);
@@ -33,6 +37,208 @@ static in_port_t fwd_ephemeral_max = NUM_PORTS - 1;
#define PORT_RANGE_SYSCTL "/proc/sys/net/ipv4/ip_local_port_range"
+#define MAC_CACHE_BUCKETS 1024 /* Must be power of two */
+#define MAC_CACHE_RENEWAL 3600 /* Refresh entry from ARP/NDP every hour */
+
+/* Partial cache of ARP/NDP table contents */
+struct mac_cache_entry {
+ union inany_addr key;
+ unsigned char mac[ETH_ALEN];
+ struct timespec expiry;
+ uint32_t count;
+
+ /* Hash bucket chain */
+ struct mac_cache_entry *next;
+};
+
+struct mac_cache_table {
+ struct mac_cache_entry **buckets;
+ size_t nbuckets;
+ size_t size;
+};
+
+static struct mac_cache_table mac_cache;
+const uint8_t undefined_mac[6] = {0, 0, 0, 0, 0, 0};
+
+/**
+ * fwd_mac_cache_bucket_idx() - Find the table index of an entry
+ * @c: Execution context
+ * @key: IPv4 or IPv6 address, used as key for the hash lookup
+ * @nbuckets: Number of buckets in the table
+ *
+ * Return: The index found
+ */
+static inline size_t fwd_mac_cache_bucket_idx(const struct ctx *c,
+ const union inany_addr *key,
+ size_t nbuckets)
+{
+ struct siphash_state st = SIPHASH_INIT(c->hash_secret);
+ uint32_t h;
+
+ inany_siphash_feed(&st, key);
+ h = siphash_final(&st, sizeof(*key), 0);
+
+ return ((size_t)h) & (nbuckets - 1);
+}
+
+/**
+ * timespec_before() - Check the relation between two pints in time
+ * @a: Point in time to be tested
+ * @b: Point in time test a against
+ * Return: True if a comes before b, otherwise b
+ */
+static inline bool timespec_before(const struct timespec *a,
+ const struct timespec *b)
+{
+ return (a->tv_sec < b->tv_sec) ||
+ (a->tv_sec == b->tv_sec && a->tv_nsec < b->tv_nsec);
+}
+
+/**
+ * mac_entry_placeholder() - Check if a cache entry is a placeholder
+ * @e: Cache entry
+ *
+ * Return: True if the entry is a placeholder, false otherwise
+ */
+bool mac_entry_placeholder(const struct mac_cache_entry *e)
+{
+ return mac_undefined(e->mac);
+}
+
+/**
+ * mac_entry_expired() - Check if a cache entry has expired
+ * @e: Cache entry
+ *
+ * Return: True if the entry has expired, false otherwise
+ */
+static bool mac_entry_expired(const struct mac_cache_entry *e)
+{
+ struct timespec now;
+
+ clock_gettime(CLOCK_MONOTONIC, &now);
+ return timespec_before(&e->expiry, &now);
+}
+
+/**
+ * mac_entry_set_expiry() - Set the time for a cache entry to expire
+ * @e: Cache entry
+ * @expiry: Expiration time, in seconds from current moment.
+ *
+ * Return: The result of the hash
+ */
+static void mac_entry_set_expiry(struct mac_cache_entry *e, int expiry)
+{
+ clock_gettime(CLOCK_MONOTONIC, &e->expiry);
+ e->expiry.tv_sec += expiry;
+}
+
+/**
+ * fwd_mac_cache_find() - Find an entry in the ARP/NDP cache table
+ * @c: Execution context
+ * @key: IPv4 or IPv6 address, used as key for the hash lookup
+ *
+ * Return: Pointer to the entry on success, NULL on failure.
+ */
+static struct mac_cache_entry *fwd_mac_cache_find(const struct ctx *c,
+ const union inany_addr *key)
+{
+ const struct mac_cache_table *t = &mac_cache;
+ struct mac_cache_entry *e;
+ size_t idx;
+
+ idx = fwd_mac_cache_bucket_idx(c, key, t->nbuckets);
+ for (e = t->buckets[idx]; e; e = e->next)
+ if (inany_equals(&e->key, key))
+ return e;
+ return NULL;
+}
+
+/**
+ * fwd_mac_cache_add() - Add a new entry to the ARP/NDP cache table
+ * @c: Execution context
+ * @key: IPv4 or IPv6 address, used as key for the hash lookup
+ * @mac: Buffer for Ethernet MAC, left unchanged if not found/usable
+ *
+ * Return: Pointer to the new entry on success, NULL on failure.
+ */
+static struct mac_cache_entry *fwd_mac_cache_add(const struct ctx *c,
+ const union inany_addr *key,
+ const unsigned char *mac)
+{
+ struct mac_cache_table *t = &mac_cache;
+ size_t idx = fwd_mac_cache_bucket_idx(c, key, t->nbuckets);
+ struct mac_cache_entry *e;
+
+ e = calloc(1, sizeof(*e));
+ if (!e)
+ return NULL;
+
+ e->key = *key;
+ memcpy(e->mac, mac, ETH_ALEN);
+ e->count = 0;
+ e->next = t->buckets[idx];
+ t->buckets[idx] = e;
+
+ return e;
+}
+
+/**
+ * fwd_neigh_mac_get() - Find a MAC address the ARP/NDP cache table
+ * @c: Execution context
+ * @addr: IPv4 or IPv6 address
+ * @ifi: Interface index
+ * @mac: Buffer for Ethernet MAC to return, found or default.
+ *
+ * Return: true if real MAC found, false if not found or failure
+ */
+bool fwd_neigh_mac_get(const struct ctx *c, const union inany_addr *addr,
+ uint8_t *mac)
+{
+ struct mac_cache_entry *e = fwd_mac_cache_find(c, addr);
+ int ifi = inany_v4(addr) ? c->ifi4 : c->ifi6;
+ bool refresh = false;
+ bool ret = false;
+
+ if (e)
+ refresh = mac_entry_expired(e);
+ else if ((e = fwd_mac_cache_add(c, addr, mac)))
+ refresh = true;
+ else
+ return false;
+
+ if (!refresh) {
+ ret = !mac_entry_placeholder(e);
+ } else {
+ ret = nl_neigh_mac_get(nl_sock, addr, ifi, e->mac);
+ mac_entry_set_expiry(e, MAC_CACHE_RENEWAL);
+ }
+
+ if (ret) {
+ memcpy(mac, e->mac, ETH_ALEN);
+ return true;
+ }
+
+ /* Do linear back-off of new netlink calls if nothing found */
+ mac_entry_set_expiry(e, e->count++);
+ memcpy(mac, c->our_tap_mac, ETH_ALEN);
+ return false;
+}
+
+/**
+ * fwd_mac_cache_init() - Initiate ARP/NDP cache table
+ *
+ * Return: 0 on success, -1 on failure.
+ */
+int fwd_mac_cache_init(void)
+{
+ struct mac_cache_table *t = &mac_cache;
+
+ t->nbuckets = MAC_CACHE_BUCKETS;
+ t->buckets = calloc(t->nbuckets, sizeof(*t->buckets));
+ t->size = 0;
+ return t->buckets ? 0 : -1;
+}
+
/** fwd_probe_ephemeral() - Determine what ports this host considers ephemeral
*
* Work out what ports the host thinks are emphemeral and record it for later
diff --git a/fwd.h b/fwd.h
index 65c7c96..80da4b1 100644
--- a/fwd.h
+++ b/fwd.h
@@ -42,6 +42,13 @@ struct fwd_ports {
in_port_t delta[NUM_PORTS];
};
+extern const unsigned char undefined_mac[];
+
+static inline bool mac_undefined(const uint8_t *mac)
+{
+ return !memcmp(mac, undefined_mac, 6);
+}
+
void fwd_scan_ports_tcp(struct fwd_ports *fwd, const struct fwd_ports *rev);
void fwd_scan_ports_udp(struct fwd_ports *fwd, const struct fwd_ports *rev,
const struct fwd_ports *tcp_fwd,
@@ -57,4 +64,8 @@ uint8_t fwd_nat_from_splice(const struct ctx *c, uint8_t proto,
uint8_t fwd_nat_from_host(const struct ctx *c, uint8_t proto,
const struct flowside *ini, struct flowside *tgt);
+bool fwd_neigh_mac_get(const struct ctx *c, const union inany_addr *addr,
+ uint8_t *mac);
+int fwd_mac_cache_init(void);
+
#endif /* FWD_H */
diff --git a/netlink.c b/netlink.c
index 1ca2c9a..3ba0597 100644
--- a/netlink.c
+++ b/netlink.c
@@ -878,8 +878,6 @@ bool nl_neigh_mac_get(int s, const union inany_addr *addr,
}
}
}
- if (status < 0)
- warn("netlink: RTM_NEWNEIGH failed: %s", strerror_(-status));
return found;
}
--
@@ -878,8 +878,6 @@ bool nl_neigh_mac_get(int s, const union inany_addr *addr,
}
}
}
- if (status < 0)
- warn("netlink: RTM_NEWNEIGH failed: %s", strerror_(-status));
return found;
}
--
2.50.1
^ permalink raw reply related [flat|nested] 24+ messages in thread
* Re: [PATCH v5 02/10] fwd: Added cache table for ARP/NDP contents
2025-09-06 2:11 ` [PATCH v5 02/10] fwd: Added cache table for ARP/NDP contents Jon Maloy
@ 2025-09-08 2:42 ` David Gibson
2025-09-09 15:02 ` Jon Maloy
2025-09-08 9:57 ` David Gibson
1 sibling, 1 reply; 24+ messages in thread
From: David Gibson @ 2025-09-08 2:42 UTC (permalink / raw)
To: Jon Maloy; +Cc: sbrivio, dgibson, passt-dev
[-- Attachment #1: Type: text/plain, Size: 11370 bytes --]
On Fri, Sep 05, 2025 at 10:11:46PM -0400, Jon Maloy wrote:
> We add a cache table to keep partial contents of the kernel ARP/NDP
> tables. This way, we drastically reduce the number of netlink calls
> to read those tables.
>
> We create placeholder cache entries representing non- or not-yet-
> existing ARP/NDP entries when needed. We add a short expiration time
> to each such entry, so that we can know when to make repeated calls to
> the kernel tables in the beginning. We also add an access counter to the
> entries, to ensure that the timer becomes longer and the call frequency
> abates over time if no ARP/NDP entry shows up.
>
> For regular entries we use a much longer timer, with the purpose to
> update the entry in the rare case that a remote host changes its
> MAC address.
>
> Signed-off-by: Jon Maloy <jmaloy@redhat.com>
>
> ---
> v5: - Moved to earlier in series to reduce rebase conflicts
> ---
> conf.c | 2 +
> fwd.c | 206 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
> fwd.h | 11 +++
> netlink.c | 2 -
> 4 files changed, 219 insertions(+), 2 deletions(-)
>
> diff --git a/conf.c b/conf.c
> index f47f48e..11d9d63 100644
> --- a/conf.c
> +++ b/conf.c
> @@ -2122,6 +2122,8 @@ void conf(struct ctx *c, int argc, char **argv)
> c->udp.fwd_out.mode = fwd_default;
>
> fwd_scan_ports_init(c);
> + if (fwd_mac_cache_init())
> + die("Failed to initiate neighbour MAC cache");
>
> if (!c->quiet)
> conf_print(c);
> diff --git a/fwd.c b/fwd.c
> index 250cf56..236e58e 100644
> --- a/fwd.c
> +++ b/fwd.c
> @@ -19,6 +19,8 @@
> #include <sched.h>
> #include <unistd.h>
> #include <stdio.h>
> +#include <time.h>
> +#include <netinet/if_ether.h>
>
> #include "util.h"
> #include "ip.h"
> @@ -26,6 +28,8 @@
> #include "passt.h"
> #include "lineread.h"
> #include "flow_table.h"
> +#include "inany.h"
> +#include "netlink.h"
>
> /* Empheral port range: values from RFC 6335 */
> static in_port_t fwd_ephemeral_min = (1 << 15) + (1 << 14);
> @@ -33,6 +37,208 @@ static in_port_t fwd_ephemeral_max = NUM_PORTS - 1;
>
> #define PORT_RANGE_SYSCTL "/proc/sys/net/ipv4/ip_local_port_range"
>
> +#define MAC_CACHE_BUCKETS 1024 /* Must be power of two */
> +#define MAC_CACHE_RENEWAL 3600 /* Refresh entry from ARP/NDP every hour */
> +
> +/* Partial cache of ARP/NDP table contents */
> +struct mac_cache_entry {
> + union inany_addr key;
> + unsigned char mac[ETH_ALEN];
> + struct timespec expiry;
> + uint32_t count;
> +
> + /* Hash bucket chain */
> + struct mac_cache_entry *next;
Given we can't allocate (see below), you may be better off with
probing rather than explicit hash chains. That's what we do to look
up the flow table.
Or... we might even be ok with a fixed number of entries per bucket.
The chance of being on a network segment that causes a bucket overflow
might be low enough to ignore that possibility.
> +};
> +
> +struct mac_cache_table {
> + struct mac_cache_entry **buckets;
> + size_t nbuckets;
> + size_t size;
> +};
> +
> +static struct mac_cache_table mac_cache;
> +const uint8_t undefined_mac[6] = {0, 0, 0, 0, 0, 0};
> +
> +/**
> + * fwd_mac_cache_bucket_idx() - Find the table index of an entry
> + * @c: Execution context
> + * @key: IPv4 or IPv6 address, used as key for the hash lookup
> + * @nbuckets: Number of buckets in the table
> + *
> + * Return: The index found
> + */
> +static inline size_t fwd_mac_cache_bucket_idx(const struct ctx *c,
> + const union inany_addr *key,
> + size_t nbuckets)
> +{
> + struct siphash_state st = SIPHASH_INIT(c->hash_secret);
> + uint32_t h;
> +
> + inany_siphash_feed(&st, key);
> + h = siphash_final(&st, sizeof(*key), 0);
> +
> + return ((size_t)h) & (nbuckets - 1);
> +}
> +
> +/**
> + * timespec_before() - Check the relation between two pints in time
> + * @a: Point in time to be tested
> + * @b: Point in time test a against
> + * Return: True if a comes before b, otherwise b
> + */
> +static inline bool timespec_before(const struct timespec *a,
> + const struct timespec *b)
This probably belongs in util.c, with timespec_diff_us() and friends.
> +{
> + return (a->tv_sec < b->tv_sec) ||
> + (a->tv_sec == b->tv_sec && a->tv_nsec < b->tv_nsec);
> +}
> +
> +/**
> + * mac_entry_placeholder() - Check if a cache entry is a placeholder
I don't think "placeholder" is a great name for negative cache
entries. It suggests that at some later point it will be replaced
with a real entry, but (at least if we're doing the lookup on first
reply thing) there's no particular reason to expect that to be the
case.
> + * @e: Cache entry
> + *
> + * Return: True if the entry is a placeholder, false otherwise
> + */
> +bool mac_entry_placeholder(const struct mac_cache_entry *e)
> +{
> + return mac_undefined(e->mac);
> +}
> +
> +/**
> + * mac_entry_expired() - Check if a cache entry has expired
> + * @e: Cache entry
> + *
> + * Return: True if the entry has expired, false otherwise
> + */
> +static bool mac_entry_expired(const struct mac_cache_entry *e)
> +{
> + struct timespec now;
> +
> + clock_gettime(CLOCK_MONOTONIC, &now);
Mostly we try to keep to a single clock_gettime() call per epoll
cycle, passing 'now' down to the things we call there.
> + return timespec_before(&e->expiry, &now);
> +}
> +
> +/**
> + * mac_entry_set_expiry() - Set the time for a cache entry to expire
> + * @e: Cache entry
> + * @expiry: Expiration time, in seconds from current moment.
> + *
> + * Return: The result of the hash
Copypasta error?
> + */
> +static void mac_entry_set_expiry(struct mac_cache_entry *e, int expiry)
> +{
> + clock_gettime(CLOCK_MONOTONIC, &e->expiry);
> + e->expiry.tv_sec += expiry;
> +}
> +
> +/**
> + * fwd_mac_cache_find() - Find an entry in the ARP/NDP cache table
> + * @c: Execution context
> + * @key: IPv4 or IPv6 address, used as key for the hash lookup
> + *
> + * Return: Pointer to the entry on success, NULL on failure.
> + */
> +static struct mac_cache_entry *fwd_mac_cache_find(const struct ctx *c,
> + const union inany_addr *key)
> +{
> + const struct mac_cache_table *t = &mac_cache;
> + struct mac_cache_entry *e;
> + size_t idx;
> +
> + idx = fwd_mac_cache_bucket_idx(c, key, t->nbuckets);
> + for (e = t->buckets[idx]; e; e = e->next)
> + if (inany_equals(&e->key, key))
> + return e;
> + return NULL;
> +}
> +
> +/**
> + * fwd_mac_cache_add() - Add a new entry to the ARP/NDP cache table
> + * @c: Execution context
> + * @key: IPv4 or IPv6 address, used as key for the hash lookup
> + * @mac: Buffer for Ethernet MAC, left unchanged if not found/usable
This doesn't seem correct. Also, the only caller passes uninitialized
memory here to fill in later, so there doesn't seem any point to this
parameter.
> + *
> + * Return: Pointer to the new entry on success, NULL on failure.
> + */
> +static struct mac_cache_entry *fwd_mac_cache_add(const struct ctx *c,
> + const union inany_addr *key,
> + const unsigned char *mac)
> +{
> + struct mac_cache_table *t = &mac_cache;
> + size_t idx = fwd_mac_cache_bucket_idx(c, key, t->nbuckets);
> + struct mac_cache_entry *e;
> +
> + e = calloc(1, sizeof(*e));
No memory allocation allowed in pasta, sorry.
> + if (!e)
> + return NULL;
> +
> + e->key = *key;
> + memcpy(e->mac, mac, ETH_ALEN);
> + e->count = 0;
> + e->next = t->buckets[idx];
> + t->buckets[idx] = e;
> +
> + return e;
> +}
> +
> +/**
> + * fwd_neigh_mac_get() - Find a MAC address the ARP/NDP cache table
> + * @c: Execution context
> + * @addr: IPv4 or IPv6 address
> + * @ifi: Interface index
> + * @mac: Buffer for Ethernet MAC to return, found or default.
> + *
> + * Return: true if real MAC found, false if not found or failure
> + */
> +bool fwd_neigh_mac_get(const struct ctx *c, const union inany_addr *addr,
> + uint8_t *mac)
> +{
> + struct mac_cache_entry *e = fwd_mac_cache_find(c, addr);
> + int ifi = inany_v4(addr) ? c->ifi4 : c->ifi6;
> + bool refresh = false;
> + bool ret = false;
> +
> + if (e)
> + refresh = mac_entry_expired(e);
> + else if ((e = fwd_mac_cache_add(c, addr, mac)))
As noted, this is inserting an uninitialized value as mac. We'll
overwrite it, but it seems unnecessarily counterintuitive.
> + refresh = true;
> + else
> + return false;
> +
> + if (!refresh) {
> + ret = !mac_entry_placeholder(e);
> + } else {
> + ret = nl_neigh_mac_get(nl_sock, addr, ifi, e->mac);
If this retursn false (no neighbour found), we never actually
initialise e->mac to 00:00:00:00:00:00 as it looks like was intended.
> + mac_entry_set_expiry(e, MAC_CACHE_RENEWAL);
> + }
> +
> + if (ret) {
> + memcpy(mac, e->mac, ETH_ALEN);
> + return true;
> + }
> +
> + /* Do linear back-off of new netlink calls if nothing found */
> + mac_entry_set_expiry(e, e->count++);
As noted above, I'm not sure there's any real reason to treat expiry
of negative entries differently from positive entries.
> + memcpy(mac, c->our_tap_mac, ETH_ALEN);
> + return false;
> +}
> +
> +/**
> + * fwd_mac_cache_init() - Initiate ARP/NDP cache table
> + *
> + * Return: 0 on success, -1 on failure.
> + */
> +int fwd_mac_cache_init(void)
> +{
> + struct mac_cache_table *t = &mac_cache;
> +
> + t->nbuckets = MAC_CACHE_BUCKETS;
> + t->buckets = calloc(t->nbuckets, sizeof(*t->buckets));
Nope.
> + t->size = 0;
> + return t->buckets ? 0 : -1;
> +}
> +
> /** fwd_probe_ephemeral() - Determine what ports this host considers ephemeral
> *
> * Work out what ports the host thinks are emphemeral and record it for later
> diff --git a/fwd.h b/fwd.h
> index 65c7c96..80da4b1 100644
> --- a/fwd.h
> +++ b/fwd.h
> @@ -42,6 +42,13 @@ struct fwd_ports {
> in_port_t delta[NUM_PORTS];
> };
>
> +extern const unsigned char undefined_mac[];
> +
> +static inline bool mac_undefined(const uint8_t *mac)
> +{
> + return !memcmp(mac, undefined_mac, 6);
> +}
> +
> void fwd_scan_ports_tcp(struct fwd_ports *fwd, const struct fwd_ports *rev);
> void fwd_scan_ports_udp(struct fwd_ports *fwd, const struct fwd_ports *rev,
> const struct fwd_ports *tcp_fwd,
> @@ -57,4 +64,8 @@ uint8_t fwd_nat_from_splice(const struct ctx *c, uint8_t proto,
> uint8_t fwd_nat_from_host(const struct ctx *c, uint8_t proto,
> const struct flowside *ini, struct flowside *tgt);
>
> +bool fwd_neigh_mac_get(const struct ctx *c, const union inany_addr *addr,
> + uint8_t *mac);
> +int fwd_mac_cache_init(void);
> +
> #endif /* FWD_H */
> diff --git a/netlink.c b/netlink.c
> index 1ca2c9a..3ba0597 100644
> --- a/netlink.c
> +++ b/netlink.c
> @@ -878,8 +878,6 @@ bool nl_neigh_mac_get(int s, const union inany_addr *addr,
> }
> }
> }
> - if (status < 0)
> - warn("netlink: RTM_NEWNEIGH failed: %s", strerror_(-status));
Should this change be folded into the first patch?
> return found;
> }
> --
> 2.50.1
>
--
David Gibson (he or they) | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you, not the other way
| around.
http://www.ozlabs.org/~dgibson
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v5 02/10] fwd: Added cache table for ARP/NDP contents
2025-09-08 2:42 ` David Gibson
@ 2025-09-09 15:02 ` Jon Maloy
2025-09-10 1:49 ` David Gibson
0 siblings, 1 reply; 24+ messages in thread
From: Jon Maloy @ 2025-09-09 15:02 UTC (permalink / raw)
To: David Gibson; +Cc: sbrivio, dgibson, passt-dev
On 2025-09-07 22:42, David Gibson wrote:
> On Fri, Sep 05, 2025 at 10:11:46PM -0400, Jon Maloy wrote:
>> We add a cache table to keep partial contents of the kernel ARP/NDP
[...]
>> + return mac_undefined(e->mac);
>> +}
>> +
>> +/**
>> + * mac_entry_expired() - Check if a cache entry has expired
>> + * @e: Cache entry
>> + *
>> + * Return: True if the entry has expired, false otherwise
>> + */
>> +static bool mac_entry_expired(const struct mac_cache_entry *e)
>> +{
>> + struct timespec now;
>> +
>> + clock_gettime(CLOCK_MONOTONIC, &now);
>
> Mostly we try to keep to a single clock_gettime() call per epoll
> cycle, passing 'now' down to the things we call there.
>
>> + return timespec_before(&e->expiry, &now);
>> +}
Passing this value along the whole call chain just for the unlikely case
that we may need it here seems like a bad idea, especially since the
performance gain is minimal.
It would make some sense if we add it to the context struct or even as a
global variable the set it in the main loop, and pass it along that way.
What do you think?
///jon
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v5 02/10] fwd: Added cache table for ARP/NDP contents
2025-09-09 15:02 ` Jon Maloy
@ 2025-09-10 1:49 ` David Gibson
0 siblings, 0 replies; 24+ messages in thread
From: David Gibson @ 2025-09-10 1:49 UTC (permalink / raw)
To: Jon Maloy; +Cc: sbrivio, dgibson, passt-dev
[-- Attachment #1: Type: text/plain, Size: 1549 bytes --]
On Tue, Sep 09, 2025 at 11:02:36AM -0400, Jon Maloy wrote:
>
>
> On 2025-09-07 22:42, David Gibson wrote:
> > On Fri, Sep 05, 2025 at 10:11:46PM -0400, Jon Maloy wrote:
> > > We add a cache table to keep partial contents of the kernel ARP/NDP
>
> [...]
>
> > > + return mac_undefined(e->mac);
> > > +}
> > > +
> > > +/**
> > > + * mac_entry_expired() - Check if a cache entry has expired
> > > + * @e: Cache entry
> > > + *
> > > + * Return: True if the entry has expired, false otherwise
> > > + */
> > > +static bool mac_entry_expired(const struct mac_cache_entry *e)
> > > +{
> > > + struct timespec now;
> > > +
> > > + clock_gettime(CLOCK_MONOTONIC, &now);
> >
> > Mostly we try to keep to a single clock_gettime() call per epoll
> > cycle, passing 'now' down to the things we call there.
> >
> > > + return timespec_before(&e->expiry, &now);
> > > +}
>
> Passing this value along the whole call chain just for the unlikely case
> that we may need it here seems like a bad idea, especially since the
> performance gain is minimal.
>
> It would make some sense if we add it to the context struct or even as a
> global variable the set it in the main loop, and pass it along that way.
> What do you think?
Honestly, I prefer calling clock_gettime() again to adding another
global or pseudo-global.
--
David Gibson (he or they) | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you, not the other way
| around.
http://www.ozlabs.org/~dgibson
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]
^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v5 02/10] fwd: Added cache table for ARP/NDP contents
2025-09-06 2:11 ` [PATCH v5 02/10] fwd: Added cache table for ARP/NDP contents Jon Maloy
2025-09-08 2:42 ` David Gibson
@ 2025-09-08 9:57 ` David Gibson
1 sibling, 0 replies; 24+ messages in thread
From: David Gibson @ 2025-09-08 9:57 UTC (permalink / raw)
To: Jon Maloy; +Cc: sbrivio, dgibson, passt-dev
[-- Attachment #1: Type: text/plain, Size: 1286 bytes --]
On Fri, Sep 05, 2025 at 10:11:46PM -0400, Jon Maloy wrote:
> We add a cache table to keep partial contents of the kernel ARP/NDP
> tables. This way, we drastically reduce the number of netlink calls
> to read those tables.
>
> We create placeholder cache entries representing non- or not-yet-
> existing ARP/NDP entries when needed. We add a short expiration time
> to each such entry, so that we can know when to make repeated calls to
> the kernel tables in the beginning. We also add an access counter to the
> entries, to ensure that the timer becomes longer and the call frequency
> abates over time if no ARP/NDP entry shows up.
>
> For regular entries we use a much longer timer, with the purpose to
> update the entry in the rare case that a remote host changes its
> MAC address.
>
> Signed-off-by: Jon Maloy <jmaloy@redhat.com>
Sorry, another patch reminded me of something I missed on the first pass:
[snip]
> +static struct mac_cache_table mac_cache;
> +const uint8_t undefined_mac[6] = {0, 0, 0, 0, 0, 0};
We already have MAC_ZERO in util.h.
--
David Gibson (he or they) | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you, not the other way
| around.
http://www.ozlabs.org/~dgibson
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]
^ permalink raw reply [flat|nested] 24+ messages in thread
* [PATCH v5 03/10] fwd: Add entries of ARP/NDP cache table to a FIFO/LRU queue
2025-09-06 2:11 [PATCH v5 00/10] Use true MAC address of LAN local remote hosts Jon Maloy
2025-09-06 2:11 ` [PATCH v5 01/10] netlink: add function to extract MAC addresses from NDP/ARP table Jon Maloy
2025-09-06 2:11 ` [PATCH v5 02/10] fwd: Added cache table for ARP/NDP contents Jon Maloy
@ 2025-09-06 2:11 ` Jon Maloy
2025-09-08 2:51 ` David Gibson
2025-09-06 2:11 ` [PATCH v5 04/10] arp/ndp: respond with true MAC address of LAN local remote hosts Jon Maloy
` (6 subsequent siblings)
9 siblings, 1 reply; 24+ messages in thread
From: Jon Maloy @ 2025-09-06 2:11 UTC (permalink / raw)
To: sbrivio, dgibson, david, jmaloy, passt-dev
Because we are going to do ARP/NDP lookup of many non-local
addresses the table might eventually fill up with placeholder
entries which will never be completed. We therefore need an
aging mechanism based on access frequency, so that we can
evict the least used entries when a new one is created when
the table is full.
Signed-off-by: Jon Maloy <jmaloy@redhat.com>
---
v5: - New
---
fwd.c | 92 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 92 insertions(+)
diff --git a/fwd.c b/fwd.c
index 236e58e..ab49dba 100644
--- a/fwd.c
+++ b/fwd.c
@@ -46,6 +46,8 @@ struct mac_cache_entry {
unsigned char mac[ETH_ALEN];
struct timespec expiry;
uint32_t count;
+ struct mac_cache_entry *qprev;
+ struct mac_cache_entry *qnext;
/* Hash bucket chain */
struct mac_cache_entry *next;
@@ -55,6 +57,10 @@ struct mac_cache_table {
struct mac_cache_entry **buckets;
size_t nbuckets;
size_t size;
+
+ /* FIFO/LRU queue */
+ struct mac_cache_entry *qhead;
+ struct mac_cache_entry *qtail;
};
static struct mac_cache_table mac_cache;
@@ -81,6 +87,82 @@ static inline size_t fwd_mac_cache_bucket_idx(const struct ctx *c,
return ((size_t)h) & (nbuckets - 1);
}
+/**
+ * fwd_mac_cacheq_append_tail() - Unlink entry from LRU queue
+ * @c: Execution context
+ */
+static void fwd_mac_cacheq_append_tail(struct mac_cache_entry *e)
+{
+ struct mac_cache_table *t = &mac_cache;
+
+ e->qnext = NULL;
+ e->qprev = t->qtail;
+ if (t->qtail)
+ t->qtail->qnext = e;
+ else
+ t->qhead = e;
+ t->qtail = e;
+}
+
+/**
+ * fwd_mac_cacheq_unlink() - Unlink entry from LRU queue
+ * @c: Execution context
+ */
+static void fwd_mac_cacheq_unlink(struct mac_cache_entry *e)
+{
+ struct mac_cache_table *t = &mac_cache;
+
+ if (e->qprev)
+ e->qprev->qnext = e->qnext;
+ else
+ t->qhead = e->qnext;
+ if (e->qnext)
+ e->qnext->qprev = e->qprev;
+ else
+ t->qtail = e->qprev;
+ e->qprev = e->qnext = NULL;
+}
+
+/**
+ * fwd_mac_cacheq_move_to_tail() - Move entry to tail of LRU queue
+ * @c: Execution context
+ */
+static void fwd_mac_cacheq_move_to_tail(struct mac_cache_entry *e)
+{
+ struct mac_cache_table *t = &mac_cache;
+
+ if (t->qtail == e)
+ return;
+
+ fwd_mac_cacheq_unlink(e);
+ fwd_mac_cacheq_append_tail(e);
+}
+
+/**
+ * fwd_mac_cache_evict_one() - Remove entry from head of LRU queue
+ * @c: Execution context
+ */
+static void fwd_mac_cache_evict_one(const struct ctx *c)
+{
+ struct mac_cache_entry **pp, *e, *cursor;
+ struct mac_cache_table *t = &mac_cache;
+ size_t idx;
+
+ e = t->qhead;
+ if (!e)
+ return;
+
+ idx = fwd_mac_cache_bucket_idx(c, &e->key, t->nbuckets);
+ for (pp = &t->buckets[idx]; ((cursor = *pp)); pp = &cursor->next) {
+ if (cursor != e)
+ continue;
+ *pp = cursor->next;
+ break;
+ }
+ free(e);
+ t->size--;
+}
+
/**
* timespec_before() - Check the relation between two pints in time
* @a: Point in time to be tested
@@ -179,6 +261,11 @@ static struct mac_cache_entry *fwd_mac_cache_add(const struct ctx *c,
e->next = t->buckets[idx];
t->buckets[idx] = e;
+ /* If needed, delete least recently used entry before adding new one */
+ if (t->size == MAC_CACHE_BUCKETS)
+ fwd_mac_cache_evict_one(c);
+ fwd_mac_cacheq_append_tail(e);
+
return e;
}
@@ -213,6 +300,9 @@ bool fwd_neigh_mac_get(const struct ctx *c, const union inany_addr *addr,
mac_entry_set_expiry(e, MAC_CACHE_RENEWAL);
}
+ /* Actively used entries must not be evicted from cache */
+ fwd_mac_cacheq_move_to_tail(e);
+
if (ret) {
memcpy(mac, e->mac, ETH_ALEN);
return true;
@@ -235,7 +325,9 @@ int fwd_mac_cache_init(void)
t->nbuckets = MAC_CACHE_BUCKETS;
t->buckets = calloc(t->nbuckets, sizeof(*t->buckets));
+ t->qhead = t->qtail = NULL;
t->size = 0;
+
return t->buckets ? 0 : -1;
}
--
@@ -46,6 +46,8 @@ struct mac_cache_entry {
unsigned char mac[ETH_ALEN];
struct timespec expiry;
uint32_t count;
+ struct mac_cache_entry *qprev;
+ struct mac_cache_entry *qnext;
/* Hash bucket chain */
struct mac_cache_entry *next;
@@ -55,6 +57,10 @@ struct mac_cache_table {
struct mac_cache_entry **buckets;
size_t nbuckets;
size_t size;
+
+ /* FIFO/LRU queue */
+ struct mac_cache_entry *qhead;
+ struct mac_cache_entry *qtail;
};
static struct mac_cache_table mac_cache;
@@ -81,6 +87,82 @@ static inline size_t fwd_mac_cache_bucket_idx(const struct ctx *c,
return ((size_t)h) & (nbuckets - 1);
}
+/**
+ * fwd_mac_cacheq_append_tail() - Unlink entry from LRU queue
+ * @c: Execution context
+ */
+static void fwd_mac_cacheq_append_tail(struct mac_cache_entry *e)
+{
+ struct mac_cache_table *t = &mac_cache;
+
+ e->qnext = NULL;
+ e->qprev = t->qtail;
+ if (t->qtail)
+ t->qtail->qnext = e;
+ else
+ t->qhead = e;
+ t->qtail = e;
+}
+
+/**
+ * fwd_mac_cacheq_unlink() - Unlink entry from LRU queue
+ * @c: Execution context
+ */
+static void fwd_mac_cacheq_unlink(struct mac_cache_entry *e)
+{
+ struct mac_cache_table *t = &mac_cache;
+
+ if (e->qprev)
+ e->qprev->qnext = e->qnext;
+ else
+ t->qhead = e->qnext;
+ if (e->qnext)
+ e->qnext->qprev = e->qprev;
+ else
+ t->qtail = e->qprev;
+ e->qprev = e->qnext = NULL;
+}
+
+/**
+ * fwd_mac_cacheq_move_to_tail() - Move entry to tail of LRU queue
+ * @c: Execution context
+ */
+static void fwd_mac_cacheq_move_to_tail(struct mac_cache_entry *e)
+{
+ struct mac_cache_table *t = &mac_cache;
+
+ if (t->qtail == e)
+ return;
+
+ fwd_mac_cacheq_unlink(e);
+ fwd_mac_cacheq_append_tail(e);
+}
+
+/**
+ * fwd_mac_cache_evict_one() - Remove entry from head of LRU queue
+ * @c: Execution context
+ */
+static void fwd_mac_cache_evict_one(const struct ctx *c)
+{
+ struct mac_cache_entry **pp, *e, *cursor;
+ struct mac_cache_table *t = &mac_cache;
+ size_t idx;
+
+ e = t->qhead;
+ if (!e)
+ return;
+
+ idx = fwd_mac_cache_bucket_idx(c, &e->key, t->nbuckets);
+ for (pp = &t->buckets[idx]; ((cursor = *pp)); pp = &cursor->next) {
+ if (cursor != e)
+ continue;
+ *pp = cursor->next;
+ break;
+ }
+ free(e);
+ t->size--;
+}
+
/**
* timespec_before() - Check the relation between two pints in time
* @a: Point in time to be tested
@@ -179,6 +261,11 @@ static struct mac_cache_entry *fwd_mac_cache_add(const struct ctx *c,
e->next = t->buckets[idx];
t->buckets[idx] = e;
+ /* If needed, delete least recently used entry before adding new one */
+ if (t->size == MAC_CACHE_BUCKETS)
+ fwd_mac_cache_evict_one(c);
+ fwd_mac_cacheq_append_tail(e);
+
return e;
}
@@ -213,6 +300,9 @@ bool fwd_neigh_mac_get(const struct ctx *c, const union inany_addr *addr,
mac_entry_set_expiry(e, MAC_CACHE_RENEWAL);
}
+ /* Actively used entries must not be evicted from cache */
+ fwd_mac_cacheq_move_to_tail(e);
+
if (ret) {
memcpy(mac, e->mac, ETH_ALEN);
return true;
@@ -235,7 +325,9 @@ int fwd_mac_cache_init(void)
t->nbuckets = MAC_CACHE_BUCKETS;
t->buckets = calloc(t->nbuckets, sizeof(*t->buckets));
+ t->qhead = t->qtail = NULL;
t->size = 0;
+
return t->buckets ? 0 : -1;
}
--
2.50.1
^ permalink raw reply related [flat|nested] 24+ messages in thread
* Re: [PATCH v5 03/10] fwd: Add entries of ARP/NDP cache table to a FIFO/LRU queue
2025-09-06 2:11 ` [PATCH v5 03/10] fwd: Add entries of ARP/NDP cache table to a FIFO/LRU queue Jon Maloy
@ 2025-09-08 2:51 ` David Gibson
0 siblings, 0 replies; 24+ messages in thread
From: David Gibson @ 2025-09-08 2:51 UTC (permalink / raw)
To: Jon Maloy; +Cc: sbrivio, dgibson, passt-dev
[-- Attachment #1: Type: text/plain, Size: 4709 bytes --]
On Fri, Sep 05, 2025 at 10:11:47PM -0400, Jon Maloy wrote:
> Because we are going to do ARP/NDP lookup of many non-local
> addresses the table might eventually fill up with placeholder
> entries which will never be completed. We therefore need an
> aging mechanism based on access frequency, so that we can
> evict the least used entries when a new one is created when
> the table is full.
>
> Signed-off-by: Jon Maloy <jmaloy@redhat.com>
>
> ---
> v5: - New
> ---
> fwd.c | 92 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 92 insertions(+)
>
> diff --git a/fwd.c b/fwd.c
> index 236e58e..ab49dba 100644
> --- a/fwd.c
> +++ b/fwd.c
> @@ -46,6 +46,8 @@ struct mac_cache_entry {
> unsigned char mac[ETH_ALEN];
> struct timespec expiry;
> uint32_t count;
> + struct mac_cache_entry *qprev;
> + struct mac_cache_entry *qnext;
>
> /* Hash bucket chain */
> struct mac_cache_entry *next;
> @@ -55,6 +57,10 @@ struct mac_cache_table {
> struct mac_cache_entry **buckets;
> size_t nbuckets;
> size_t size;
> +
> + /* FIFO/LRU queue */
> + struct mac_cache_entry *qhead;
> + struct mac_cache_entry *qtail;
> };
This will need reworking for an allocation-free table.
I feel like there should be a simpler way to do this than a full
doubly linked pointer list, but it's not immediately obvious to me.
>
> static struct mac_cache_table mac_cache;
> @@ -81,6 +87,82 @@ static inline size_t fwd_mac_cache_bucket_idx(const struct ctx *c,
> return ((size_t)h) & (nbuckets - 1);
> }
>
> +/**
> + * fwd_mac_cacheq_append_tail() - Unlink entry from LRU queue
> + * @c: Execution context
> + */
> +static void fwd_mac_cacheq_append_tail(struct mac_cache_entry *e)
> +{
> + struct mac_cache_table *t = &mac_cache;
> +
> + e->qnext = NULL;
> + e->qprev = t->qtail;
> + if (t->qtail)
> + t->qtail->qnext = e;
> + else
> + t->qhead = e;
> + t->qtail = e;
> +}
> +
> +/**
> + * fwd_mac_cacheq_unlink() - Unlink entry from LRU queue
> + * @c: Execution context
> + */
> +static void fwd_mac_cacheq_unlink(struct mac_cache_entry *e)
> +{
> + struct mac_cache_table *t = &mac_cache;
> +
> + if (e->qprev)
> + e->qprev->qnext = e->qnext;
> + else
> + t->qhead = e->qnext;
> + if (e->qnext)
> + e->qnext->qprev = e->qprev;
> + else
> + t->qtail = e->qprev;
> + e->qprev = e->qnext = NULL;
> +}
> +
> +/**
> + * fwd_mac_cacheq_move_to_tail() - Move entry to tail of LRU queue
> + * @c: Execution context
> + */
> +static void fwd_mac_cacheq_move_to_tail(struct mac_cache_entry *e)
> +{
> + struct mac_cache_table *t = &mac_cache;
> +
> + if (t->qtail == e)
> + return;
> +
> + fwd_mac_cacheq_unlink(e);
> + fwd_mac_cacheq_append_tail(e);
> +}
> +
> +/**
> + * fwd_mac_cache_evict_one() - Remove entry from head of LRU queue
> + * @c: Execution context
> + */
> +static void fwd_mac_cache_evict_one(const struct ctx *c)
> +{
> + struct mac_cache_entry **pp, *e, *cursor;
> + struct mac_cache_table *t = &mac_cache;
> + size_t idx;
> +
> + e = t->qhead;
> + if (!e)
> + return;
> +
> + idx = fwd_mac_cache_bucket_idx(c, &e->key, t->nbuckets);
> + for (pp = &t->buckets[idx]; ((cursor = *pp)); pp = &cursor->next) {
> + if (cursor != e)
> + continue;
> + *pp = cursor->next;
> + break;
> + }
> + free(e);
> + t->size--;
> +}
> +
> /**
> * timespec_before() - Check the relation between two pints in time
> * @a: Point in time to be tested
> @@ -179,6 +261,11 @@ static struct mac_cache_entry *fwd_mac_cache_add(const struct ctx *c,
> e->next = t->buckets[idx];
> t->buckets[idx] = e;
>
> + /* If needed, delete least recently used entry before adding new one */
> + if (t->size == MAC_CACHE_BUCKETS)
> + fwd_mac_cache_evict_one(c);
> + fwd_mac_cacheq_append_tail(e);
> +
> return e;
> }
>
> @@ -213,6 +300,9 @@ bool fwd_neigh_mac_get(const struct ctx *c, const union inany_addr *addr,
> mac_entry_set_expiry(e, MAC_CACHE_RENEWAL);
> }
>
> + /* Actively used entries must not be evicted from cache */
> + fwd_mac_cacheq_move_to_tail(e);
> +
> if (ret) {
> memcpy(mac, e->mac, ETH_ALEN);
> return true;
> @@ -235,7 +325,9 @@ int fwd_mac_cache_init(void)
>
> t->nbuckets = MAC_CACHE_BUCKETS;
> t->buckets = calloc(t->nbuckets, sizeof(*t->buckets));
> + t->qhead = t->qtail = NULL;
> t->size = 0;
> +
> return t->buckets ? 0 : -1;
> }
>
> --
> 2.50.1
>
--
David Gibson (he or they) | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you, not the other way
| around.
http://www.ozlabs.org/~dgibson
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]
^ permalink raw reply [flat|nested] 24+ messages in thread
* [PATCH v5 04/10] arp/ndp: respond with true MAC address of LAN local remote hosts
2025-09-06 2:11 [PATCH v5 00/10] Use true MAC address of LAN local remote hosts Jon Maloy
` (2 preceding siblings ...)
2025-09-06 2:11 ` [PATCH v5 03/10] fwd: Add entries of ARP/NDP cache table to a FIFO/LRU queue Jon Maloy
@ 2025-09-06 2:11 ` Jon Maloy
2025-09-08 3:04 ` David Gibson
2025-09-06 2:11 ` [PATCH v5 05/10] flow: add MAC address of LAN local remote hosts to flow Jon Maloy
` (5 subsequent siblings)
9 siblings, 1 reply; 24+ messages in thread
From: Jon Maloy @ 2025-09-06 2:11 UTC (permalink / raw)
To: sbrivio, dgibson, david, jmaloy, passt-dev
When we receive an ARP request or NDP neigbor solicitation over
the tap interface for a host on the local network segment attached
to the template interface, we respond with that host's real MAC
address.
Signed-off-by: Jon Maloy <jmaloy@redhat.com>
---
v3: - Added helper function to find out if a remote ip address is subject
to NAT. This filters out local host addresses which should be
presented with the passt/pasta local MAC address 9a:55:9a:55:9a:55 even
though it is on the local segment.
- Adapted to the change in nl_mac_get() function, so that we now consider
only the template interface when checking the ARP/NDP table.
v4: - Moved NAT check into the function nat_outbound() to obtain more
precise criteria for when NAT is used. We may in theory
have NAT even if original and translated addresses are equal, and
we want to catch this case.
- I chose to keep the wrapper funtion inany_nat(), but moved it to
fwd.h/fwd.c and renamed it to fwd_inany_nat().
v5: - Simplified criteria for when we do ARP/NDP lookup. Now, we
just try with the potentially translated address after an
attempted NAT check.
- Using the new ARP/NDP cache table instead of using netlink
directly.
---
arp.c | 9 ++++++++-
fwd.c | 5 +++--
fwd.h | 2 ++
inany.c | 1 +
ndp.c | 8 ++++++++
5 files changed, 22 insertions(+), 3 deletions(-)
diff --git a/arp.c b/arp.c
index 44677ad..8bff505 100644
--- a/arp.c
+++ b/arp.c
@@ -69,6 +69,7 @@ static bool ignore_arp(const struct ctx *c,
*/
int arp(const struct ctx *c, struct iov_tail *data)
{
+ union inany_addr tgt, tgt_nat;
struct {
struct ethhdr eh;
struct arphdr ah;
@@ -80,6 +81,7 @@ int arp(const struct ctx *c, struct iov_tail *data)
const struct ethhdr *eh;
const struct arphdr *ah;
const struct arpmsg *am;
+ uint8_t omac[ETH_ALEN];
eh = IOV_REMOVE_HEADER(data, eh_storage);
ah = IOV_REMOVE_HEADER(data, ah_storage);
@@ -102,8 +104,13 @@ int arp(const struct ctx *c, struct iov_tail *data)
resp.ah.ar_hln = ah->ar_hln;
resp.ah.ar_pln = ah->ar_pln;
+ /* MAC address to return */
+ inany_from_af(&tgt, AF_INET, am->tip);
+ nat_outbound(c, &tgt, &tgt_nat);
+ fwd_neigh_mac_get(c, &tgt_nat, omac);
+
/* ARP message */
- memcpy(resp.am.sha, c->our_tap_mac, sizeof(resp.am.sha));
+ memcpy(resp.am.sha, omac, sizeof(resp.am.sha));
memcpy(resp.am.sip, am->tip, sizeof(resp.am.sip));
memcpy(resp.am.tha, am->sha, sizeof(resp.am.tha));
memcpy(resp.am.tip, am->sip, sizeof(resp.am.tip));
diff --git a/fwd.c b/fwd.c
index ab49dba..664e167 100644
--- a/fwd.c
+++ b/fwd.c
@@ -630,8 +630,8 @@ static bool fwd_guest_accessible(const struct ctx *c,
* Only handles translations that depend *only* on the address. Anything
* related to specific ports or flows is handled elsewhere.
*/
-static void nat_outbound(const struct ctx *c, const union inany_addr *addr,
- union inany_addr *translated)
+void nat_outbound(const struct ctx *c, const union inany_addr *addr,
+ union inany_addr *translated)
{
if (inany_equals4(addr, &c->ip4.map_host_loopback))
*translated = inany_loopback4;
@@ -643,6 +643,7 @@ static void nat_outbound(const struct ctx *c, const union inany_addr *addr,
translated->a6 = c->ip6.addr;
else
*translated = *addr;
+
}
/**
diff --git a/fwd.h b/fwd.h
index 80da4b1..bf1705b 100644
--- a/fwd.h
+++ b/fwd.h
@@ -57,6 +57,8 @@ void fwd_scan_ports_init(struct ctx *c);
bool nat_inbound(const struct ctx *c, const union inany_addr *addr,
union inany_addr *translated);
+void nat_outbound(const struct ctx *c, const union inany_addr *addr,
+ union inany_addr *translated);
uint8_t fwd_nat_from_tap(const struct ctx *c, uint8_t proto,
const struct flowside *ini, struct flowside *tgt);
uint8_t fwd_nat_from_splice(const struct ctx *c, uint8_t proto,
diff --git a/inany.c b/inany.c
index 65a39f9..7680439 100644
--- a/inany.c
+++ b/inany.c
@@ -16,6 +16,7 @@
#include "ip.h"
#include "siphash.h"
#include "inany.h"
+#include "fwd.h"
const union inany_addr inany_loopback4 = INANY_INIT4(IN4ADDR_LOOPBACK_INIT);
const union inany_addr inany_any4 = INANY_INIT4(IN4ADDR_ANY_INIT);
diff --git a/ndp.c b/ndp.c
index eb090cd..820c556 100644
--- a/ndp.c
+++ b/ndp.c
@@ -196,6 +196,7 @@ static void ndp_send(const struct ctx *c, const struct in6_addr *dst,
static void ndp_na(const struct ctx *c, const struct in6_addr *dst,
const struct in6_addr *addr)
{
+ union inany_addr tgt, tgt_nat;
struct ndp_na na = {
.ih = {
.icmp6_type = NA,
@@ -215,6 +216,13 @@ static void ndp_na(const struct ctx *c, const struct in6_addr *dst,
memcpy(na.target_l2_addr.mac, c->our_tap_mac, ETH_ALEN);
+ /* Respond with true MAC address if remote host's address or
+ * NAT translated address can be found in NDP table.
+ */
+ inany_from_af(&tgt, AF_INET6, addr);
+ nat_outbound(c, &tgt, &tgt_nat);
+ fwd_neigh_mac_get(c, &tgt_nat, na.target_l2_addr.mac);
+
ndp_send(c, dst, &na, sizeof(na));
}
--
@@ -196,6 +196,7 @@ static void ndp_send(const struct ctx *c, const struct in6_addr *dst,
static void ndp_na(const struct ctx *c, const struct in6_addr *dst,
const struct in6_addr *addr)
{
+ union inany_addr tgt, tgt_nat;
struct ndp_na na = {
.ih = {
.icmp6_type = NA,
@@ -215,6 +216,13 @@ static void ndp_na(const struct ctx *c, const struct in6_addr *dst,
memcpy(na.target_l2_addr.mac, c->our_tap_mac, ETH_ALEN);
+ /* Respond with true MAC address if remote host's address or
+ * NAT translated address can be found in NDP table.
+ */
+ inany_from_af(&tgt, AF_INET6, addr);
+ nat_outbound(c, &tgt, &tgt_nat);
+ fwd_neigh_mac_get(c, &tgt_nat, na.target_l2_addr.mac);
+
ndp_send(c, dst, &na, sizeof(na));
}
--
2.50.1
^ permalink raw reply related [flat|nested] 24+ messages in thread
* Re: [PATCH v5 04/10] arp/ndp: respond with true MAC address of LAN local remote hosts
2025-09-06 2:11 ` [PATCH v5 04/10] arp/ndp: respond with true MAC address of LAN local remote hosts Jon Maloy
@ 2025-09-08 3:04 ` David Gibson
0 siblings, 0 replies; 24+ messages in thread
From: David Gibson @ 2025-09-08 3:04 UTC (permalink / raw)
To: Jon Maloy; +Cc: sbrivio, dgibson, passt-dev
[-- Attachment #1: Type: text/plain, Size: 5979 bytes --]
On Fri, Sep 05, 2025 at 10:11:48PM -0400, Jon Maloy wrote:
> When we receive an ARP request or NDP neigbor solicitation over
> the tap interface for a host on the local network segment attached
> to the template interface, we respond with that host's real MAC
> address.
>
> Signed-off-by: Jon Maloy <jmaloy@redhat.com>
LGTM, apart from small nits.
>
> ---
> v3: - Added helper function to find out if a remote ip address is subject
> to NAT. This filters out local host addresses which should be
> presented with the passt/pasta local MAC address 9a:55:9a:55:9a:55 even
> though it is on the local segment.
> - Adapted to the change in nl_mac_get() function, so that we now consider
> only the template interface when checking the ARP/NDP table.
> v4: - Moved NAT check into the function nat_outbound() to obtain more
> precise criteria for when NAT is used. We may in theory
> have NAT even if original and translated addresses are equal, and
> we want to catch this case.
> - I chose to keep the wrapper funtion inany_nat(), but moved it to
> fwd.h/fwd.c and renamed it to fwd_inany_nat().
> v5: - Simplified criteria for when we do ARP/NDP lookup. Now, we
> just try with the potentially translated address after an
> attempted NAT check.
> - Using the new ARP/NDP cache table instead of using netlink
> directly.
> ---
> arp.c | 9 ++++++++-
> fwd.c | 5 +++--
> fwd.h | 2 ++
> inany.c | 1 +
> ndp.c | 8 ++++++++
> 5 files changed, 22 insertions(+), 3 deletions(-)
>
> diff --git a/arp.c b/arp.c
> index 44677ad..8bff505 100644
> --- a/arp.c
> +++ b/arp.c
> @@ -69,6 +69,7 @@ static bool ignore_arp(const struct ctx *c,
> */
> int arp(const struct ctx *c, struct iov_tail *data)
> {
> + union inany_addr tgt, tgt_nat;
> struct {
> struct ethhdr eh;
> struct arphdr ah;
> @@ -80,6 +81,7 @@ int arp(const struct ctx *c, struct iov_tail *data)
> const struct ethhdr *eh;
> const struct arphdr *ah;
> const struct arpmsg *am;
> + uint8_t omac[ETH_ALEN];
Maybe tap_omac to disambiguate. Or else put it straight into
resp.am.sha, similar to how you do for IPv6.
> eh = IOV_REMOVE_HEADER(data, eh_storage);
> ah = IOV_REMOVE_HEADER(data, ah_storage);
> @@ -102,8 +104,13 @@ int arp(const struct ctx *c, struct iov_tail *data)
> resp.ah.ar_hln = ah->ar_hln;
> resp.ah.ar_pln = ah->ar_pln;
>
> + /* MAC address to return */
> + inany_from_af(&tgt, AF_INET, am->tip);
> + nat_outbound(c, &tgt, &tgt_nat);
> + fwd_neigh_mac_get(c, &tgt_nat, omac);
> +
> /* ARP message */
> - memcpy(resp.am.sha, c->our_tap_mac, sizeof(resp.am.sha));
> + memcpy(resp.am.sha, omac, sizeof(resp.am.sha));
> memcpy(resp.am.sip, am->tip, sizeof(resp.am.sip));
> memcpy(resp.am.tha, am->sha, sizeof(resp.am.tha));
> memcpy(resp.am.tip, am->sip, sizeof(resp.am.tip));
> diff --git a/fwd.c b/fwd.c
> index ab49dba..664e167 100644
> --- a/fwd.c
> +++ b/fwd.c
> @@ -630,8 +630,8 @@ static bool fwd_guest_accessible(const struct ctx *c,
> * Only handles translations that depend *only* on the address. Anything
> * related to specific ports or flows is handled elsewhere.
> */
> -static void nat_outbound(const struct ctx *c, const union inany_addr *addr,
> - union inany_addr *translated)
> +void nat_outbound(const struct ctx *c, const union inany_addr *addr,
> + union inany_addr *translated)
Since this is now exported, it should become fwd_nat_outbound() to
namespace it properly.
> {
> if (inany_equals4(addr, &c->ip4.map_host_loopback))
> *translated = inany_loopback4;
> @@ -643,6 +643,7 @@ static void nat_outbound(const struct ctx *c, const union inany_addr *addr,
> translated->a6 = c->ip6.addr;
> else
> *translated = *addr;
> +
Spurious whitespace.
> }
>
> /**
> diff --git a/fwd.h b/fwd.h
> index 80da4b1..bf1705b 100644
> --- a/fwd.h
> +++ b/fwd.h
> @@ -57,6 +57,8 @@ void fwd_scan_ports_init(struct ctx *c);
>
> bool nat_inbound(const struct ctx *c, const union inany_addr *addr,
> union inany_addr *translated);
> +void nat_outbound(const struct ctx *c, const union inany_addr *addr,
> + union inany_addr *translated);
> uint8_t fwd_nat_from_tap(const struct ctx *c, uint8_t proto,
> const struct flowside *ini, struct flowside *tgt);
> uint8_t fwd_nat_from_splice(const struct ctx *c, uint8_t proto,
> diff --git a/inany.c b/inany.c
> index 65a39f9..7680439 100644
> --- a/inany.c
> +++ b/inany.c
> @@ -16,6 +16,7 @@
> #include "ip.h"
> #include "siphash.h"
> #include "inany.h"
> +#include "fwd.h"
>
> const union inany_addr inany_loopback4 = INANY_INIT4(IN4ADDR_LOOPBACK_INIT);
> const union inany_addr inany_any4 = INANY_INIT4(IN4ADDR_ANY_INIT);
> diff --git a/ndp.c b/ndp.c
> index eb090cd..820c556 100644
> --- a/ndp.c
> +++ b/ndp.c
> @@ -196,6 +196,7 @@ static void ndp_send(const struct ctx *c, const struct in6_addr *dst,
> static void ndp_na(const struct ctx *c, const struct in6_addr *dst,
> const struct in6_addr *addr)
> {
> + union inany_addr tgt, tgt_nat;
> struct ndp_na na = {
> .ih = {
> .icmp6_type = NA,
> @@ -215,6 +216,13 @@ static void ndp_na(const struct ctx *c, const struct in6_addr *dst,
>
> memcpy(na.target_l2_addr.mac, c->our_tap_mac, ETH_ALEN);
>
> + /* Respond with true MAC address if remote host's address or
> + * NAT translated address can be found in NDP table.
> + */
> + inany_from_af(&tgt, AF_INET6, addr);
> + nat_outbound(c, &tgt, &tgt_nat);
> + fwd_neigh_mac_get(c, &tgt_nat, na.target_l2_addr.mac);
> +
> ndp_send(c, dst, &na, sizeof(na));
> }
>
> --
> 2.50.1
>
--
David Gibson (he or they) | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you, not the other way
| around.
http://www.ozlabs.org/~dgibson
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]
^ permalink raw reply [flat|nested] 24+ messages in thread
* [PATCH v5 05/10] flow: add MAC address of LAN local remote hosts to flow
2025-09-06 2:11 [PATCH v5 00/10] Use true MAC address of LAN local remote hosts Jon Maloy
` (3 preceding siblings ...)
2025-09-06 2:11 ` [PATCH v5 04/10] arp/ndp: respond with true MAC address of LAN local remote hosts Jon Maloy
@ 2025-09-06 2:11 ` Jon Maloy
2025-09-08 3:07 ` David Gibson
2025-09-06 2:11 ` [PATCH v5 06/10] udp: forward external source MAC address through tap interface Jon Maloy
` (4 subsequent siblings)
9 siblings, 1 reply; 24+ messages in thread
From: Jon Maloy @ 2025-09-06 2:11 UTC (permalink / raw)
To: sbrivio, dgibson, david, jmaloy, passt-dev
When communicating with remote hosts on the local network, some guest
applications want to see the real MAC address of that host instead
of PASST/PASTA's own tap address. The flow_common structure is a
convenient location for storing that address, so we do that in this
commit.
Note that we don´t add actual usage of this address here, that will
be done in later commits.
Signed-off-by: Jon Maloy <jmaloy@redhat.com>
---
v3: - Moved the remote host macaddress from struct flowside to
struct flow_common. I chose to call it 'omac' as suggested
by David, although in my understanding the correct name would be
'emac'. (In general I find the address naming scheme confusing.)
- Adapted to new signature of function nl_mac_get(), now passing
it the index of the template interface.
v4: - Renamed flow_commeon->omac to flow_common->tap_omac to make is
role in the code clearer
v5: - Modified the criteria for ARP/NDP table lookup like in the
previous commits.
- Removed the PIF_TAP lookup case, as David suggested, and did
instead give the flow->tap_omac field a value marking it as
non-initialized.
- Calling the cache table instead of netlink for ARP/NDP lookup.
- Unconditionally using the potentially translated IP address
in the lookup, instead of only if NAT really was applied.
---
flow.c | 2 ++
flow.h | 2 ++
2 files changed, 4 insertions(+)
diff --git a/flow.c b/flow.c
index feefda3..afef916 100644
--- a/flow.c
+++ b/flow.c
@@ -449,6 +449,7 @@ struct flowside *flow_target(const struct ctx *c, union flow *flow,
switch (f->pif[INISIDE]) {
case PIF_TAP:
+ memcpy(f->tap_omac, undefined_mac, sizeof(f->tap_omac));
tgtpif = fwd_nat_from_tap(c, proto, ini, tgt);
break;
@@ -458,6 +459,7 @@ struct flowside *flow_target(const struct ctx *c, union flow *flow,
case PIF_HOST:
tgtpif = fwd_nat_from_host(c, proto, ini, tgt);
+ fwd_neigh_mac_get(c, &ini->eaddr, f->tap_omac);
break;
default:
diff --git a/flow.h b/flow.h
index cac618a..f342895 100644
--- a/flow.h
+++ b/flow.h
@@ -177,6 +177,7 @@ int flowside_connect(const struct ctx *c, int s,
* @type: Type of packet flow
* @pif[]: Interface for each side of the flow
* @side[]: Information for each side of the flow
+ * @tap_omac: MAC address of remote endpoint as seen from the guest
*/
struct flow_common {
#ifdef __GNUC__
@@ -192,6 +193,7 @@ struct flow_common {
#endif
uint8_t pif[SIDES];
struct flowside side[SIDES];
+ uint8_t tap_omac[6];
};
#define FLOW_INDEX_BITS 17 /* 128k - 1 */
--
@@ -177,6 +177,7 @@ int flowside_connect(const struct ctx *c, int s,
* @type: Type of packet flow
* @pif[]: Interface for each side of the flow
* @side[]: Information for each side of the flow
+ * @tap_omac: MAC address of remote endpoint as seen from the guest
*/
struct flow_common {
#ifdef __GNUC__
@@ -192,6 +193,7 @@ struct flow_common {
#endif
uint8_t pif[SIDES];
struct flowside side[SIDES];
+ uint8_t tap_omac[6];
};
#define FLOW_INDEX_BITS 17 /* 128k - 1 */
--
2.50.1
^ permalink raw reply related [flat|nested] 24+ messages in thread
* Re: [PATCH v5 05/10] flow: add MAC address of LAN local remote hosts to flow
2025-09-06 2:11 ` [PATCH v5 05/10] flow: add MAC address of LAN local remote hosts to flow Jon Maloy
@ 2025-09-08 3:07 ` David Gibson
0 siblings, 0 replies; 24+ messages in thread
From: David Gibson @ 2025-09-08 3:07 UTC (permalink / raw)
To: Jon Maloy; +Cc: sbrivio, dgibson, passt-dev
[-- Attachment #1: Type: text/plain, Size: 3203 bytes --]
On Fri, Sep 05, 2025 at 10:11:49PM -0400, Jon Maloy wrote:
> When communicating with remote hosts on the local network, some guest
> applications want to see the real MAC address of that host instead
> of PASST/PASTA's own tap address. The flow_common structure is a
> convenient location for storing that address, so we do that in this
> commit.
>
> Note that we don´t add actual usage of this address here, that will
> be done in later commits.
>
> Signed-off-by: Jon Maloy <jmaloy@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
>
> ---
> v3: - Moved the remote host macaddress from struct flowside to
> struct flow_common. I chose to call it 'omac' as suggested
> by David, although in my understanding the correct name would be
> 'emac'. (In general I find the address naming scheme confusing.)
> - Adapted to new signature of function nl_mac_get(), now passing
> it the index of the template interface.
> v4: - Renamed flow_commeon->omac to flow_common->tap_omac to make is
> role in the code clearer
> v5: - Modified the criteria for ARP/NDP table lookup like in the
> previous commits.
> - Removed the PIF_TAP lookup case, as David suggested, and did
> instead give the flow->tap_omac field a value marking it as
> non-initialized.
> - Calling the cache table instead of netlink for ARP/NDP lookup.
> - Unconditionally using the potentially translated IP address
> in the lookup, instead of only if NAT really was applied.
Although it looks like that last point belongs on a different patch.
> ---
> flow.c | 2 ++
> flow.h | 2 ++
> 2 files changed, 4 insertions(+)
>
> diff --git a/flow.c b/flow.c
> index feefda3..afef916 100644
> --- a/flow.c
> +++ b/flow.c
> @@ -449,6 +449,7 @@ struct flowside *flow_target(const struct ctx *c, union flow *flow,
>
> switch (f->pif[INISIDE]) {
> case PIF_TAP:
> + memcpy(f->tap_omac, undefined_mac, sizeof(f->tap_omac));
> tgtpif = fwd_nat_from_tap(c, proto, ini, tgt);
> break;
>
> @@ -458,6 +459,7 @@ struct flowside *flow_target(const struct ctx *c, union flow *flow,
>
> case PIF_HOST:
> tgtpif = fwd_nat_from_host(c, proto, ini, tgt);
> + fwd_neigh_mac_get(c, &ini->eaddr, f->tap_omac);
> break;
>
> default:
> diff --git a/flow.h b/flow.h
> index cac618a..f342895 100644
> --- a/flow.h
> +++ b/flow.h
> @@ -177,6 +177,7 @@ int flowside_connect(const struct ctx *c, int s,
> * @type: Type of packet flow
> * @pif[]: Interface for each side of the flow
> * @side[]: Information for each side of the flow
> + * @tap_omac: MAC address of remote endpoint as seen from the guest
> */
> struct flow_common {
> #ifdef __GNUC__
> @@ -192,6 +193,7 @@ struct flow_common {
> #endif
> uint8_t pif[SIDES];
> struct flowside side[SIDES];
> + uint8_t tap_omac[6];
> };
>
> #define FLOW_INDEX_BITS 17 /* 128k - 1 */
> --
> 2.50.1
>
--
David Gibson (he or they) | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you, not the other way
| around.
http://www.ozlabs.org/~dgibson
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]
^ permalink raw reply [flat|nested] 24+ messages in thread
* [PATCH v5 06/10] udp: forward external source MAC address through tap interface
2025-09-06 2:11 [PATCH v5 00/10] Use true MAC address of LAN local remote hosts Jon Maloy
` (4 preceding siblings ...)
2025-09-06 2:11 ` [PATCH v5 05/10] flow: add MAC address of LAN local remote hosts to flow Jon Maloy
@ 2025-09-06 2:11 ` Jon Maloy
2025-09-08 3:13 ` David Gibson
2025-09-06 2:11 ` [PATCH v5 07/10] tcp: " Jon Maloy
` (3 subsequent siblings)
9 siblings, 1 reply; 24+ messages in thread
From: Jon Maloy @ 2025-09-06 2:11 UTC (permalink / raw)
To: sbrivio, dgibson, david, jmaloy, passt-dev
We forward the incoming MAC address through the tap interface when
receiving incoming packets from network local hosts.
This is a part of the solution to bug
https://bugs.passt.top/show_bug.cgi?id=120
Signed-off-by: Jon Maloy <jmaloy@redhat.com>
---
v3: - Adapted to the move of external MAC address from struct flowside
to struct flow_common
v4: - Changed signature of udp_tap_prepare() to take a MAC address
instead of a flow.
- Eliminated initialization of MAC source address in all frames,
since those now are set per send occasion anyway.
v5: - Added lookup in ARP/NDP table on incoming messages in
case flow->tap_omac wasn't initialized at flow creation,
i.e., the flow was initiated from the guest.
---
passt.c | 2 +-
udp.c | 45 +++++++++++++++++++++++++--------------------
udp.h | 2 +-
3 files changed, 27 insertions(+), 22 deletions(-)
diff --git a/passt.c b/passt.c
index a4ec115..2a28e20 100644
--- a/passt.c
+++ b/passt.c
@@ -154,7 +154,7 @@ static void timer_init(struct ctx *c, const struct timespec *now)
void proto_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s)
{
tcp_update_l2_buf(eth_d, eth_s);
- udp_update_l2_buf(eth_d, eth_s);
+ udp_update_l2_buf(eth_d);
}
/**
diff --git a/udp.c b/udp.c
index 86585b7..7d98845 100644
--- a/udp.c
+++ b/udp.c
@@ -133,11 +133,8 @@ static int udp_splice_init[IP_VERSIONS][NUM_PORTS];
/* UDP header and data for inbound messages */
static struct udp_payload_t udp_payload[UDP_MAX_FRAMES];
-/* Ethernet header for IPv4 frames */
-static struct ethhdr udp4_eth_hdr;
-
-/* Ethernet header for IPv6 frames */
-static struct ethhdr udp6_eth_hdr;
+/* Ethernet headers for IPv4 and IPv6 frames */
+static struct ethhdr udp_eth_hdr[UDP_MAX_FRAMES];
/**
* struct udp_meta_t - Pre-cooked headers for UDP packets
@@ -210,12 +207,13 @@ void udp_portmap_clear(void)
/**
* udp_update_l2_buf() - Update L2 buffers with Ethernet and IPv4 addresses
* @eth_d: Ethernet destination address, NULL if unchanged
- * @eth_s: Ethernet source address, NULL if unchanged
*/
-void udp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s)
+void udp_update_l2_buf(const unsigned char *eth_d)
{
- eth_update_mac(&udp4_eth_hdr, eth_d, eth_s);
- eth_update_mac(&udp6_eth_hdr, eth_d, eth_s);
+ int i;
+
+ for (i = 0; i < UDP_MAX_FRAMES; i++)
+ eth_update_mac(&udp_eth_hdr[i], eth_d, NULL);
}
/**
@@ -238,6 +236,7 @@ static void udp_iov_init_one(const struct ctx *c, size_t i)
*siov = IOV_OF_LVALUE(payload->data);
+ tiov[UDP_IOV_ETH] = IOV_OF_LVALUE(udp_eth_hdr[i]);
tiov[UDP_IOV_TAP] = tap_hdr_iov(c, &meta->taph);
tiov[UDP_IOV_PAYLOAD].iov_base = payload;
@@ -253,9 +252,6 @@ static void udp_iov_init(const struct ctx *c)
{
size_t i;
- udp4_eth_hdr.h_proto = htons_constant(ETH_P_IP);
- udp6_eth_hdr.h_proto = htons_constant(ETH_P_IPV6);
-
for (i = 0; i < UDP_MAX_FRAMES; i++)
udp_iov_init_one(c, i);
}
@@ -352,31 +348,34 @@ size_t udp_update_hdr6(struct ipv6hdr *ip6h, struct udp_payload_t *bp,
* udp_tap_prepare() - Convert one datagram into a tap frame
* @mmh: Receiving mmsghdr array
* @idx: Index of the datagram to prepare
+ * @tap_omac: MAC address of remote endpoint as seen from the guest
* @toside: Flowside for destination side
* @no_udp_csum: Do not set UDP checksum
*/
static void udp_tap_prepare(const struct mmsghdr *mmh,
- unsigned idx, const struct flowside *toside,
+ unsigned int idx,
+ const uint8_t *tap_omac,
+ const struct flowside *toside,
bool no_udp_csum)
{
struct iovec (*tap_iov)[UDP_NUM_IOVS] = &udp_l2_iov[idx];
struct udp_payload_t *bp = &udp_payload[idx];
struct udp_meta_t *bm = &udp_meta[idx];
+ struct ethhdr *eh = (*tap_iov)[UDP_IOV_ETH].iov_base;
size_t l4len;
+ eth_update_mac(eh, NULL, tap_omac);
if (!inany_v4(&toside->eaddr) || !inany_v4(&toside->oaddr)) {
l4len = udp_update_hdr6(&bm->ip6h, bp, toside,
mmh[idx].msg_len, no_udp_csum);
- tap_hdr_update(&bm->taph, l4len + sizeof(bm->ip6h) +
- sizeof(udp6_eth_hdr));
- (*tap_iov)[UDP_IOV_ETH] = IOV_OF_LVALUE(udp6_eth_hdr);
+ tap_hdr_update(&bm->taph, l4len + sizeof(bm->ip6h) + ETH_HLEN);
+ eh->h_proto = htons_constant(ETH_P_IPV6);
(*tap_iov)[UDP_IOV_IP] = IOV_OF_LVALUE(bm->ip6h);
} else {
l4len = udp_update_hdr4(&bm->ip4h, bp, toside,
mmh[idx].msg_len, no_udp_csum);
- tap_hdr_update(&bm->taph, l4len + sizeof(bm->ip4h) +
- sizeof(udp4_eth_hdr));
- (*tap_iov)[UDP_IOV_ETH] = IOV_OF_LVALUE(udp4_eth_hdr);
+ tap_hdr_update(&bm->taph, l4len + sizeof(bm->ip4h) + ETH_HLEN);
+ eh->h_proto = htons_constant(ETH_P_IP);
(*tap_iov)[UDP_IOV_IP] = IOV_OF_LVALUE(bm->ip4h);
}
(*tap_iov)[UDP_IOV_PAYLOAD].iov_len = l4len;
@@ -801,13 +800,19 @@ static void udp_buf_sock_to_tap(const struct ctx *c, int s, int n,
flow_sidx_t tosidx)
{
const struct flowside *toside = flowside_at_sidx(tosidx);
+ struct udp_flow *uflow = udp_at_sidx(tosidx);
+ uint8_t *omac = uflow->f.tap_omac;
int i;
if ((n = udp_sock_recv(c, s, udp_mh_recv, n)) <= 0)
return;
+ /* Make one attempt to find true MAC address in ARP/NDP table */
+ if (mac_undefined(omac))
+ fwd_neigh_mac_get(c, &toside->oaddr, omac);
+
for (i = 0; i < n; i++)
- udp_tap_prepare(udp_mh_recv, i, toside, false);
+ udp_tap_prepare(udp_mh_recv, i, omac, toside, false);
tap_send_frames(c, &udp_l2_iov[0][0], UDP_NUM_IOVS, n);
}
diff --git a/udp.h b/udp.h
index 8f8531a..dd6e5ad 100644
--- a/udp.h
+++ b/udp.h
@@ -21,7 +21,7 @@ int udp_sock_init(const struct ctx *c, int ns, const union inany_addr *addr,
const char *ifname, in_port_t port);
int udp_init(struct ctx *c);
void udp_timer(struct ctx *c, const struct timespec *now);
-void udp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s);
+void udp_update_l2_buf(const unsigned char *eth_d);
/**
* union udp_listen_epoll_ref - epoll reference for "listening" UDP sockets
--
@@ -21,7 +21,7 @@ int udp_sock_init(const struct ctx *c, int ns, const union inany_addr *addr,
const char *ifname, in_port_t port);
int udp_init(struct ctx *c);
void udp_timer(struct ctx *c, const struct timespec *now);
-void udp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s);
+void udp_update_l2_buf(const unsigned char *eth_d);
/**
* union udp_listen_epoll_ref - epoll reference for "listening" UDP sockets
--
2.50.1
^ permalink raw reply related [flat|nested] 24+ messages in thread
* Re: [PATCH v5 06/10] udp: forward external source MAC address through tap interface
2025-09-06 2:11 ` [PATCH v5 06/10] udp: forward external source MAC address through tap interface Jon Maloy
@ 2025-09-08 3:13 ` David Gibson
0 siblings, 0 replies; 24+ messages in thread
From: David Gibson @ 2025-09-08 3:13 UTC (permalink / raw)
To: Jon Maloy; +Cc: sbrivio, dgibson, passt-dev
[-- Attachment #1: Type: text/plain, Size: 6905 bytes --]
On Fri, Sep 05, 2025 at 10:11:50PM -0400, Jon Maloy wrote:
> We forward the incoming MAC address through the tap interface when
> receiving incoming packets from network local hosts.
>
> This is a part of the solution to bug
> https://bugs.passt.top/show_bug.cgi?id=120
>
> Signed-off-by: Jon Maloy <jmaloy@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Modulo any changes that are needed because of changes in the earlier
patches.
> ---
> v3: - Adapted to the move of external MAC address from struct flowside
> to struct flow_common
> v4: - Changed signature of udp_tap_prepare() to take a MAC address
> instead of a flow.
> - Eliminated initialization of MAC source address in all frames,
> since those now are set per send occasion anyway.
> v5: - Added lookup in ARP/NDP table on incoming messages in
> case flow->tap_omac wasn't initialized at flow creation,
> i.e., the flow was initiated from the guest.
> ---
> passt.c | 2 +-
> udp.c | 45 +++++++++++++++++++++++++--------------------
> udp.h | 2 +-
> 3 files changed, 27 insertions(+), 22 deletions(-)
>
> diff --git a/passt.c b/passt.c
> index a4ec115..2a28e20 100644
> --- a/passt.c
> +++ b/passt.c
> @@ -154,7 +154,7 @@ static void timer_init(struct ctx *c, const struct timespec *now)
> void proto_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s)
> {
> tcp_update_l2_buf(eth_d, eth_s);
> - udp_update_l2_buf(eth_d, eth_s);
> + udp_update_l2_buf(eth_d);
> }
>
> /**
> diff --git a/udp.c b/udp.c
> index 86585b7..7d98845 100644
> --- a/udp.c
> +++ b/udp.c
> @@ -133,11 +133,8 @@ static int udp_splice_init[IP_VERSIONS][NUM_PORTS];
> /* UDP header and data for inbound messages */
> static struct udp_payload_t udp_payload[UDP_MAX_FRAMES];
>
> -/* Ethernet header for IPv4 frames */
> -static struct ethhdr udp4_eth_hdr;
> -
> -/* Ethernet header for IPv6 frames */
> -static struct ethhdr udp6_eth_hdr;
> +/* Ethernet headers for IPv4 and IPv6 frames */
> +static struct ethhdr udp_eth_hdr[UDP_MAX_FRAMES];
>
> /**
> * struct udp_meta_t - Pre-cooked headers for UDP packets
> @@ -210,12 +207,13 @@ void udp_portmap_clear(void)
> /**
> * udp_update_l2_buf() - Update L2 buffers with Ethernet and IPv4 addresses
> * @eth_d: Ethernet destination address, NULL if unchanged
> - * @eth_s: Ethernet source address, NULL if unchanged
> */
> -void udp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s)
> +void udp_update_l2_buf(const unsigned char *eth_d)
> {
> - eth_update_mac(&udp4_eth_hdr, eth_d, eth_s);
> - eth_update_mac(&udp6_eth_hdr, eth_d, eth_s);
> + int i;
> +
> + for (i = 0; i < UDP_MAX_FRAMES; i++)
> + eth_update_mac(&udp_eth_hdr[i], eth_d, NULL);
> }
>
> /**
> @@ -238,6 +236,7 @@ static void udp_iov_init_one(const struct ctx *c, size_t i)
>
> *siov = IOV_OF_LVALUE(payload->data);
>
> + tiov[UDP_IOV_ETH] = IOV_OF_LVALUE(udp_eth_hdr[i]);
> tiov[UDP_IOV_TAP] = tap_hdr_iov(c, &meta->taph);
> tiov[UDP_IOV_PAYLOAD].iov_base = payload;
>
> @@ -253,9 +252,6 @@ static void udp_iov_init(const struct ctx *c)
> {
> size_t i;
>
> - udp4_eth_hdr.h_proto = htons_constant(ETH_P_IP);
> - udp6_eth_hdr.h_proto = htons_constant(ETH_P_IPV6);
> -
> for (i = 0; i < UDP_MAX_FRAMES; i++)
> udp_iov_init_one(c, i);
> }
> @@ -352,31 +348,34 @@ size_t udp_update_hdr6(struct ipv6hdr *ip6h, struct udp_payload_t *bp,
> * udp_tap_prepare() - Convert one datagram into a tap frame
> * @mmh: Receiving mmsghdr array
> * @idx: Index of the datagram to prepare
> + * @tap_omac: MAC address of remote endpoint as seen from the guest
> * @toside: Flowside for destination side
> * @no_udp_csum: Do not set UDP checksum
> */
> static void udp_tap_prepare(const struct mmsghdr *mmh,
> - unsigned idx, const struct flowside *toside,
> + unsigned int idx,
> + const uint8_t *tap_omac,
> + const struct flowside *toside,
> bool no_udp_csum)
> {
> struct iovec (*tap_iov)[UDP_NUM_IOVS] = &udp_l2_iov[idx];
> struct udp_payload_t *bp = &udp_payload[idx];
> struct udp_meta_t *bm = &udp_meta[idx];
> + struct ethhdr *eh = (*tap_iov)[UDP_IOV_ETH].iov_base;
> size_t l4len;
>
> + eth_update_mac(eh, NULL, tap_omac);
> if (!inany_v4(&toside->eaddr) || !inany_v4(&toside->oaddr)) {
> l4len = udp_update_hdr6(&bm->ip6h, bp, toside,
> mmh[idx].msg_len, no_udp_csum);
> - tap_hdr_update(&bm->taph, l4len + sizeof(bm->ip6h) +
> - sizeof(udp6_eth_hdr));
> - (*tap_iov)[UDP_IOV_ETH] = IOV_OF_LVALUE(udp6_eth_hdr);
> + tap_hdr_update(&bm->taph, l4len + sizeof(bm->ip6h) + ETH_HLEN);
> + eh->h_proto = htons_constant(ETH_P_IPV6);
> (*tap_iov)[UDP_IOV_IP] = IOV_OF_LVALUE(bm->ip6h);
> } else {
> l4len = udp_update_hdr4(&bm->ip4h, bp, toside,
> mmh[idx].msg_len, no_udp_csum);
> - tap_hdr_update(&bm->taph, l4len + sizeof(bm->ip4h) +
> - sizeof(udp4_eth_hdr));
> - (*tap_iov)[UDP_IOV_ETH] = IOV_OF_LVALUE(udp4_eth_hdr);
> + tap_hdr_update(&bm->taph, l4len + sizeof(bm->ip4h) + ETH_HLEN);
> + eh->h_proto = htons_constant(ETH_P_IP);
> (*tap_iov)[UDP_IOV_IP] = IOV_OF_LVALUE(bm->ip4h);
> }
> (*tap_iov)[UDP_IOV_PAYLOAD].iov_len = l4len;
> @@ -801,13 +800,19 @@ static void udp_buf_sock_to_tap(const struct ctx *c, int s, int n,
> flow_sidx_t tosidx)
> {
> const struct flowside *toside = flowside_at_sidx(tosidx);
> + struct udp_flow *uflow = udp_at_sidx(tosidx);
> + uint8_t *omac = uflow->f.tap_omac;
> int i;
>
> if ((n = udp_sock_recv(c, s, udp_mh_recv, n)) <= 0)
> return;
>
> + /* Make one attempt to find true MAC address in ARP/NDP table */
> + if (mac_undefined(omac))
> + fwd_neigh_mac_get(c, &toside->oaddr, omac);
> +
> for (i = 0; i < n; i++)
> - udp_tap_prepare(udp_mh_recv, i, toside, false);
> + udp_tap_prepare(udp_mh_recv, i, omac, toside, false);
>
> tap_send_frames(c, &udp_l2_iov[0][0], UDP_NUM_IOVS, n);
> }
> diff --git a/udp.h b/udp.h
> index 8f8531a..dd6e5ad 100644
> --- a/udp.h
> +++ b/udp.h
> @@ -21,7 +21,7 @@ int udp_sock_init(const struct ctx *c, int ns, const union inany_addr *addr,
> const char *ifname, in_port_t port);
> int udp_init(struct ctx *c);
> void udp_timer(struct ctx *c, const struct timespec *now);
> -void udp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s);
> +void udp_update_l2_buf(const unsigned char *eth_d);
>
> /**
> * union udp_listen_epoll_ref - epoll reference for "listening" UDP sockets
> --
> 2.50.1
>
--
David Gibson (he or they) | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you, not the other way
| around.
http://www.ozlabs.org/~dgibson
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]
^ permalink raw reply [flat|nested] 24+ messages in thread
* [PATCH v5 07/10] tcp: forward external source MAC address through tap interface
2025-09-06 2:11 [PATCH v5 00/10] Use true MAC address of LAN local remote hosts Jon Maloy
` (5 preceding siblings ...)
2025-09-06 2:11 ` [PATCH v5 06/10] udp: forward external source MAC address through tap interface Jon Maloy
@ 2025-09-06 2:11 ` Jon Maloy
2025-09-08 3:18 ` David Gibson
2025-09-06 2:11 ` [PATCH v5 08/10] tap: change signature of function tap_push_l2h() Jon Maloy
` (2 subsequent siblings)
9 siblings, 1 reply; 24+ messages in thread
From: Jon Maloy @ 2025-09-06 2:11 UTC (permalink / raw)
To: sbrivio, dgibson, david, jmaloy, passt-dev
We forward the incoming mac address through the tap interface when
receiving incoming packets from network local hosts.
This is a part of the solution to bug
https://bugs.passt.top/show_bug.cgi?id=120
Signed-off-by: Jon Maloy <jmaloy@redhat.com>
---
v3: - Adapted to the signature change in nl_mac_get() function, so that
we now consider only the template interface when checking the
ARP/NDP table.
v4: - Adapted to previous name changes in this series
v5: - Added lookup in ARP/NDP cache and/or table on incoming messages
in case flow->tap_omac wasn't initialized at flow creation,
i.e., the flow was initiated from the guest.
---
passt.c | 7 +++----
passt.h | 3 +--
pasta.c | 2 +-
tap.c | 2 +-
tcp.c | 13 +++++++++++--
tcp.h | 2 +-
tcp_buf.c | 37 +++++++++++++++++--------------------
tcp_internal.h | 4 ++--
tcp_vu.c | 5 ++---
9 files changed, 39 insertions(+), 36 deletions(-)
diff --git a/passt.c b/passt.c
index 2a28e20..adf7b19 100644
--- a/passt.c
+++ b/passt.c
@@ -149,11 +149,10 @@ static void timer_init(struct ctx *c, const struct timespec *now)
/**
* proto_update_l2_buf() - Update scatter-gather L2 buffers in protocol handlers
* @eth_d: Ethernet destination address, NULL if unchanged
- * @eth_s: Ethernet source address, NULL if unchanged
*/
-void proto_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s)
+void proto_update_l2_buf(const unsigned char *eth_d)
{
- tcp_update_l2_buf(eth_d, eth_s);
+ tcp_update_l2_buf(eth_d);
udp_update_l2_buf(eth_d);
}
@@ -256,7 +255,7 @@ int main(int argc, char **argv)
if ((!c.no_udp && udp_init(&c)) || (!c.no_tcp && tcp_init(&c)))
_exit(EXIT_FAILURE);
- proto_update_l2_buf(c.guest_mac, c.our_tap_mac);
+ proto_update_l2_buf(c.guest_mac);
if (c.ifi4 && !c.no_dhcp)
dhcp_init();
diff --git a/passt.h b/passt.h
index 4cfd6eb..2c5b3e1 100644
--- a/passt.h
+++ b/passt.h
@@ -324,7 +324,6 @@ struct ctx {
bool migrate_exit;
};
-void proto_update_l2_buf(const unsigned char *eth_d,
- const unsigned char *eth_s);
+void proto_update_l2_buf(const unsigned char *eth_d);
#endif /* PASST_H */
diff --git a/pasta.c b/pasta.c
index 687406b..a42cfd8 100644
--- a/pasta.c
+++ b/pasta.c
@@ -411,7 +411,7 @@ void pasta_ns_conf(struct ctx *c)
}
}
- proto_update_l2_buf(c->guest_mac, NULL);
+ proto_update_l2_buf(c->guest_mac);
}
/**
diff --git a/tap.c b/tap.c
index 7ba6399..74557e1 100644
--- a/tap.c
+++ b/tap.c
@@ -1097,7 +1097,7 @@ void tap_add_packet(struct ctx *c, struct iov_tail *data,
if (memcmp(c->guest_mac, eh->h_source, ETH_ALEN)) {
memcpy(c->guest_mac, eh->h_source, ETH_ALEN);
- proto_update_l2_buf(c->guest_mac, NULL);
+ proto_update_l2_buf(c->guest_mac);
}
switch (ntohs(eh->h_proto)) {
diff --git a/tcp.c b/tcp.c
index dba5fdc..23b45b8 100644
--- a/tcp.c
+++ b/tcp.c
@@ -919,6 +919,7 @@ static void tcp_fill_header(struct tcphdr *th,
/**
* tcp_fill_headers() - Fill 802.3, IP, TCP headers
+ * @c: Execution context
* @conn: Connection pointer
* @taph: tap backend specific header
* @ip4h: Pointer to IPv4 header, or NULL
@@ -929,14 +930,15 @@ static void tcp_fill_header(struct tcphdr *th,
* @seq: Sequence number for this segment
* @no_tcp_csum: Do not set TCP checksum
*/
-void tcp_fill_headers(const struct tcp_tap_conn *conn,
- struct tap_hdr *taph,
+void tcp_fill_headers(const struct ctx *c, struct tcp_tap_conn *conn,
+ struct tap_hdr *taph, struct ethhdr *eh,
struct iphdr *ip4h, struct ipv6hdr *ip6h,
struct tcphdr *th, struct iov_tail *payload,
const uint16_t *ip4_check, uint32_t seq, bool no_tcp_csum)
{
const struct flowside *tapside = TAPFLOW(conn);
size_t l4len = iov_tail_size(payload) + sizeof(*th);
+ uint8_t *omac = conn->f.tap_omac;
size_t l3len = l4len;
uint32_t psum = 0;
@@ -962,6 +964,7 @@ void tcp_fill_headers(const struct tcp_tap_conn *conn,
psum = proto_ipv4_header_psum(l4len, IPPROTO_TCP,
*src4, *dst4);
}
+ eh->h_proto = htons_constant(ETH_P_IP);
}
if (ip6h) {
@@ -982,8 +985,14 @@ void tcp_fill_headers(const struct tcp_tap_conn *conn,
&ip6h->saddr,
&ip6h->daddr);
}
+ eh->h_proto = htons_constant(ETH_P_IPV6);
}
+ /* Make one attempt to find true MAC address in ARP/NDP table */
+ if (mac_undefined(omac))
+ fwd_neigh_mac_get(c, &tapside->oaddr, omac);
+ eth_update_mac(eh, NULL, omac);
+
tcp_fill_header(th, conn, seq);
if (no_tcp_csum)
diff --git a/tcp.h b/tcp.h
index 234a803..c1b8385 100644
--- a/tcp.h
+++ b/tcp.h
@@ -24,7 +24,7 @@ int tcp_init(struct ctx *c);
void tcp_timer(struct ctx *c, const struct timespec *now);
void tcp_defer_handler(struct ctx *c);
-void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s);
+void tcp_update_l2_buf(const unsigned char *eth_d);
extern bool peek_offset_cap;
diff --git a/tcp_buf.c b/tcp_buf.c
index bc898de..7d8746e 100644
--- a/tcp_buf.c
+++ b/tcp_buf.c
@@ -40,8 +40,7 @@
/* Static buffers */
/* Ethernet header for IPv4 and IPv6 frames */
-static struct ethhdr tcp4_eth_src;
-static struct ethhdr tcp6_eth_src;
+static struct ethhdr tcp_eth_hdr[TCP_FRAMES_MEM];
static struct tap_hdr tcp_payload_tap_hdr[TCP_FRAMES_MEM];
@@ -67,12 +66,13 @@ static struct iovec tcp_l2_iov[TCP_FRAMES_MEM][TCP_NUM_IOVS];
/**
* tcp_update_l2_buf() - Update Ethernet header buffers with addresses
* @eth_d: Ethernet destination address, NULL if unchanged
- * @eth_s: Ethernet source address, NULL if unchanged
*/
-void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s)
+void tcp_update_l2_buf(const unsigned char *eth_d)
{
- eth_update_mac(&tcp4_eth_src, eth_d, eth_s);
- eth_update_mac(&tcp6_eth_src, eth_d, eth_s);
+ int i;
+
+ for (i = 0; i < TCP_FRAMES_MEM; i++)
+ eth_update_mac(&tcp_eth_hdr[i], eth_d, NULL);
}
/**
@@ -85,9 +85,6 @@ void tcp_sock_iov_init(const struct ctx *c)
struct iphdr iph = L2_BUF_IP4_INIT(IPPROTO_TCP);
int i;
- tcp6_eth_src.h_proto = htons_constant(ETH_P_IPV6);
- tcp4_eth_src.h_proto = htons_constant(ETH_P_IP);
-
for (i = 0; i < ARRAY_SIZE(tcp_payload); i++) {
tcp6_payload_ip[i] = ip6;
tcp4_payload_ip[i] = iph;
@@ -149,13 +146,15 @@ void tcp_payload_flush(const struct ctx *c)
/**
* tcp_l2_buf_fill_headers() - Fill 802.3, IP, TCP headers in pre-cooked buffers
+ * @c: Execution context
* @conn: Connection pointer
* @iov: Pointer to an array of iovec of TCP pre-cooked buffers
* @check: Checksum, if already known
* @seq: Sequence number for this segment
* @no_tcp_csum: Do not set TCP checksum
*/
-static void tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn,
+static void tcp_l2_buf_fill_headers(const struct ctx *c,
+ struct tcp_tap_conn *conn,
struct iovec *iov, const uint16_t *check,
uint32_t seq, bool no_tcp_csum)
{
@@ -164,6 +163,7 @@ static void tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn,
struct tap_hdr *taph = iov[TCP_IOV_TAP].iov_base;
const struct flowside *tapside = TAPFLOW(conn);
const struct in_addr *a4 = inany_v4(&tapside->oaddr);
+ struct ethhdr *eh = iov[TCP_IOV_ETH].iov_base;
struct ipv6hdr *ip6h = NULL;
struct iphdr *ip4h = NULL;
@@ -172,7 +172,7 @@ static void tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn,
else
ip6h = iov[TCP_IOV_IP].iov_base;
- tcp_fill_headers(conn, taph, ip4h, ip6h, th, &tail,
+ tcp_fill_headers(c, conn, taph, eh, ip4h, ip6h, th, &tail,
check, seq, no_tcp_csum);
}
@@ -194,14 +194,12 @@ int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
int ret;
iov = tcp_l2_iov[tcp_payload_used];
- if (CONN_V4(conn)) {
+ if (CONN_V4(conn))
iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_payload_ip[tcp_payload_used]);
- iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src;
- } else {
+ else
iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_payload_ip[tcp_payload_used]);
- iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src;
- }
+ iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp_eth_hdr[tcp_payload_used]);
payload = iov[TCP_IOV_PAYLOAD].iov_base;
seq = conn->seq_to_tap;
ret = tcp_prepare_flags(c, conn, flags, &payload->th,
@@ -212,7 +210,7 @@ int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
tcp_payload_used++;
l4len = optlen + sizeof(struct tcphdr);
iov[TCP_IOV_PAYLOAD].iov_len = l4len;
- tcp_l2_buf_fill_headers(conn, iov, NULL, seq, false);
+ tcp_l2_buf_fill_headers(c, conn, iov, NULL, seq, false);
if (flags & DUP_ACK) {
struct iovec *dup_iov = tcp_l2_iov[tcp_payload_used++];
@@ -259,11 +257,10 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
check = &iph->check;
}
iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_payload_ip[tcp_payload_used]);
- iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src;
} else if (CONN_V6(conn)) {
iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_payload_ip[tcp_payload_used]);
- iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src;
}
+ iov[TCP_IOV_ETH].iov_base = &tcp_eth_hdr[tcp_payload_used];
payload = iov[TCP_IOV_PAYLOAD].iov_base;
payload->th.th_off = sizeof(struct tcphdr) / 4;
payload->th.th_x2 = 0;
@@ -271,7 +268,7 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
payload->th.ack = 1;
payload->th.psh = push;
iov[TCP_IOV_PAYLOAD].iov_len = dlen + sizeof(struct tcphdr);
- tcp_l2_buf_fill_headers(conn, iov, check, seq, false);
+ tcp_l2_buf_fill_headers(c, conn, iov, check, seq, false);
if (++tcp_payload_used > TCP_FRAMES_MEM - 1)
tcp_payload_flush(c);
}
diff --git a/tcp_internal.h b/tcp_internal.h
index 36c6533..25f4cae 100644
--- a/tcp_internal.h
+++ b/tcp_internal.h
@@ -166,8 +166,8 @@ void tcp_rst_do(const struct ctx *c, struct tcp_tap_conn *conn);
struct tcp_info_linux;
-void tcp_fill_headers(const struct tcp_tap_conn *conn,
- struct tap_hdr *taph,
+void tcp_fill_headers(const struct ctx *c, struct tcp_tap_conn *conn,
+ struct tap_hdr *taph, struct ethhdr *eh,
struct iphdr *ip4h, struct ipv6hdr *ip6h,
struct tcphdr *th, struct iov_tail *payload,
const uint16_t *ip4_check, uint32_t seq, bool no_tcp_csum);
diff --git a/tcp_vu.c b/tcp_vu.c
index cb39bc2..c7e289d 100644
--- a/tcp_vu.c
+++ b/tcp_vu.c
@@ -135,7 +135,7 @@ int tcp_vu_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
flags_elem[0].in_sg[0].iov_len = hdrlen + optlen;
payload = IOV_TAIL(flags_elem[0].in_sg, 1, hdrlen);
- tcp_fill_headers(conn, NULL, ip4h, ip6h, th, &payload,
+ tcp_fill_headers(c, conn, NULL, eh, ip4h, ip6h, th, &payload,
NULL, seq, !*c->pcap);
if (*c->pcap) {
@@ -315,7 +315,6 @@ static void tcp_vu_prepare(const struct ctx *c, struct tcp_tap_conn *conn,
eh = vu_eth(base);
memcpy(eh->h_dest, c->guest_mac, sizeof(eh->h_dest));
- memcpy(eh->h_source, c->our_tap_mac, sizeof(eh->h_source));
/* initialize header */
@@ -339,7 +338,7 @@ static void tcp_vu_prepare(const struct ctx *c, struct tcp_tap_conn *conn,
th->ack = 1;
th->psh = push;
- tcp_fill_headers(conn, NULL, ip4h, ip6h, th, &payload,
+ tcp_fill_headers(c, conn, NULL, eh, ip4h, ip6h, th, &payload,
*check, conn->seq_to_tap, no_tcp_csum);
if (ip4h)
*check = &ip4h->check;
--
@@ -135,7 +135,7 @@ int tcp_vu_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
flags_elem[0].in_sg[0].iov_len = hdrlen + optlen;
payload = IOV_TAIL(flags_elem[0].in_sg, 1, hdrlen);
- tcp_fill_headers(conn, NULL, ip4h, ip6h, th, &payload,
+ tcp_fill_headers(c, conn, NULL, eh, ip4h, ip6h, th, &payload,
NULL, seq, !*c->pcap);
if (*c->pcap) {
@@ -315,7 +315,6 @@ static void tcp_vu_prepare(const struct ctx *c, struct tcp_tap_conn *conn,
eh = vu_eth(base);
memcpy(eh->h_dest, c->guest_mac, sizeof(eh->h_dest));
- memcpy(eh->h_source, c->our_tap_mac, sizeof(eh->h_source));
/* initialize header */
@@ -339,7 +338,7 @@ static void tcp_vu_prepare(const struct ctx *c, struct tcp_tap_conn *conn,
th->ack = 1;
th->psh = push;
- tcp_fill_headers(conn, NULL, ip4h, ip6h, th, &payload,
+ tcp_fill_headers(c, conn, NULL, eh, ip4h, ip6h, th, &payload,
*check, conn->seq_to_tap, no_tcp_csum);
if (ip4h)
*check = &ip4h->check;
--
2.50.1
^ permalink raw reply related [flat|nested] 24+ messages in thread
* Re: [PATCH v5 07/10] tcp: forward external source MAC address through tap interface
2025-09-06 2:11 ` [PATCH v5 07/10] tcp: " Jon Maloy
@ 2025-09-08 3:18 ` David Gibson
0 siblings, 0 replies; 24+ messages in thread
From: David Gibson @ 2025-09-08 3:18 UTC (permalink / raw)
To: Jon Maloy; +Cc: sbrivio, dgibson, passt-dev
[-- Attachment #1: Type: text/plain, Size: 12794 bytes --]
On Fri, Sep 05, 2025 at 10:11:51PM -0400, Jon Maloy wrote:
> We forward the incoming mac address through the tap interface when
> receiving incoming packets from network local hosts.
>
> This is a part of the solution to bug
> https://bugs.passt.top/show_bug.cgi?id=120
>
> Signed-off-by: Jon Maloy <jmaloy@redhat.com>
LGTM, one observation below.
>
> ---
> v3: - Adapted to the signature change in nl_mac_get() function, so that
> we now consider only the template interface when checking the
> ARP/NDP table.
> v4: - Adapted to previous name changes in this series
> v5: - Added lookup in ARP/NDP cache and/or table on incoming messages
> in case flow->tap_omac wasn't initialized at flow creation,
> i.e., the flow was initiated from the guest.
> ---
> passt.c | 7 +++----
> passt.h | 3 +--
> pasta.c | 2 +-
> tap.c | 2 +-
> tcp.c | 13 +++++++++++--
> tcp.h | 2 +-
> tcp_buf.c | 37 +++++++++++++++++--------------------
> tcp_internal.h | 4 ++--
> tcp_vu.c | 5 ++---
> 9 files changed, 39 insertions(+), 36 deletions(-)
>
> diff --git a/passt.c b/passt.c
> index 2a28e20..adf7b19 100644
> --- a/passt.c
> +++ b/passt.c
> @@ -149,11 +149,10 @@ static void timer_init(struct ctx *c, const struct timespec *now)
> /**
> * proto_update_l2_buf() - Update scatter-gather L2 buffers in protocol handlers
> * @eth_d: Ethernet destination address, NULL if unchanged
> - * @eth_s: Ethernet source address, NULL if unchanged
> */
> -void proto_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s)
> +void proto_update_l2_buf(const unsigned char *eth_d)
> {
> - tcp_update_l2_buf(eth_d, eth_s);
> + tcp_update_l2_buf(eth_d);
> udp_update_l2_buf(eth_d);
> }
>
> @@ -256,7 +255,7 @@ int main(int argc, char **argv)
> if ((!c.no_udp && udp_init(&c)) || (!c.no_tcp && tcp_init(&c)))
> _exit(EXIT_FAILURE);
>
> - proto_update_l2_buf(c.guest_mac, c.our_tap_mac);
> + proto_update_l2_buf(c.guest_mac);
>
> if (c.ifi4 && !c.no_dhcp)
> dhcp_init();
> diff --git a/passt.h b/passt.h
> index 4cfd6eb..2c5b3e1 100644
> --- a/passt.h
> +++ b/passt.h
> @@ -324,7 +324,6 @@ struct ctx {
> bool migrate_exit;
> };
>
> -void proto_update_l2_buf(const unsigned char *eth_d,
> - const unsigned char *eth_s);
> +void proto_update_l2_buf(const unsigned char *eth_d);
>
> #endif /* PASST_H */
> diff --git a/pasta.c b/pasta.c
> index 687406b..a42cfd8 100644
> --- a/pasta.c
> +++ b/pasta.c
> @@ -411,7 +411,7 @@ void pasta_ns_conf(struct ctx *c)
> }
> }
>
> - proto_update_l2_buf(c->guest_mac, NULL);
> + proto_update_l2_buf(c->guest_mac);
> }
>
> /**
> diff --git a/tap.c b/tap.c
> index 7ba6399..74557e1 100644
> --- a/tap.c
> +++ b/tap.c
> @@ -1097,7 +1097,7 @@ void tap_add_packet(struct ctx *c, struct iov_tail *data,
>
> if (memcmp(c->guest_mac, eh->h_source, ETH_ALEN)) {
> memcpy(c->guest_mac, eh->h_source, ETH_ALEN);
> - proto_update_l2_buf(c->guest_mac, NULL);
> + proto_update_l2_buf(c->guest_mac);
> }
>
> switch (ntohs(eh->h_proto)) {
> diff --git a/tcp.c b/tcp.c
> index dba5fdc..23b45b8 100644
> --- a/tcp.c
> +++ b/tcp.c
> @@ -919,6 +919,7 @@ static void tcp_fill_header(struct tcphdr *th,
>
> /**
> * tcp_fill_headers() - Fill 802.3, IP, TCP headers
> + * @c: Execution context
> * @conn: Connection pointer
> * @taph: tap backend specific header
> * @ip4h: Pointer to IPv4 header, or NULL
> @@ -929,14 +930,15 @@ static void tcp_fill_header(struct tcphdr *th,
> * @seq: Sequence number for this segment
> * @no_tcp_csum: Do not set TCP checksum
> */
> -void tcp_fill_headers(const struct tcp_tap_conn *conn,
> - struct tap_hdr *taph,
> +void tcp_fill_headers(const struct ctx *c, struct tcp_tap_conn *conn,
> + struct tap_hdr *taph, struct ethhdr *eh,
> struct iphdr *ip4h, struct ipv6hdr *ip6h,
> struct tcphdr *th, struct iov_tail *payload,
> const uint16_t *ip4_check, uint32_t seq, bool no_tcp_csum)
> {
> const struct flowside *tapside = TAPFLOW(conn);
> size_t l4len = iov_tail_size(payload) + sizeof(*th);
> + uint8_t *omac = conn->f.tap_omac;
> size_t l3len = l4len;
> uint32_t psum = 0;
>
> @@ -962,6 +964,7 @@ void tcp_fill_headers(const struct tcp_tap_conn *conn,
> psum = proto_ipv4_header_psum(l4len, IPPROTO_TCP,
> *src4, *dst4);
> }
> + eh->h_proto = htons_constant(ETH_P_IP);
> }
>
> if (ip6h) {
> @@ -982,8 +985,14 @@ void tcp_fill_headers(const struct tcp_tap_conn *conn,
> &ip6h->saddr,
> &ip6h->daddr);
> }
> + eh->h_proto = htons_constant(ETH_P_IPV6);
> }
>
> + /* Make one attempt to find true MAC address in ARP/NDP table */
> + if (mac_undefined(omac))
> + fwd_neigh_mac_get(c, &tapside->oaddr, omac);
> + eth_update_mac(eh, NULL, omac);
It might be nice to do this in tcp_{buf,vu}_data_from_sock() rather
than here. It does mean we duplicate it in two places, but avoids
calling mac_undefined() for every packet.
> +
> tcp_fill_header(th, conn, seq);
>
> if (no_tcp_csum)
> diff --git a/tcp.h b/tcp.h
> index 234a803..c1b8385 100644
> --- a/tcp.h
> +++ b/tcp.h
> @@ -24,7 +24,7 @@ int tcp_init(struct ctx *c);
> void tcp_timer(struct ctx *c, const struct timespec *now);
> void tcp_defer_handler(struct ctx *c);
>
> -void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s);
> +void tcp_update_l2_buf(const unsigned char *eth_d);
>
> extern bool peek_offset_cap;
>
> diff --git a/tcp_buf.c b/tcp_buf.c
> index bc898de..7d8746e 100644
> --- a/tcp_buf.c
> +++ b/tcp_buf.c
> @@ -40,8 +40,7 @@
> /* Static buffers */
>
> /* Ethernet header for IPv4 and IPv6 frames */
> -static struct ethhdr tcp4_eth_src;
> -static struct ethhdr tcp6_eth_src;
> +static struct ethhdr tcp_eth_hdr[TCP_FRAMES_MEM];
>
> static struct tap_hdr tcp_payload_tap_hdr[TCP_FRAMES_MEM];
>
> @@ -67,12 +66,13 @@ static struct iovec tcp_l2_iov[TCP_FRAMES_MEM][TCP_NUM_IOVS];
> /**
> * tcp_update_l2_buf() - Update Ethernet header buffers with addresses
> * @eth_d: Ethernet destination address, NULL if unchanged
> - * @eth_s: Ethernet source address, NULL if unchanged
> */
> -void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s)
> +void tcp_update_l2_buf(const unsigned char *eth_d)
> {
> - eth_update_mac(&tcp4_eth_src, eth_d, eth_s);
> - eth_update_mac(&tcp6_eth_src, eth_d, eth_s);
> + int i;
> +
> + for (i = 0; i < TCP_FRAMES_MEM; i++)
> + eth_update_mac(&tcp_eth_hdr[i], eth_d, NULL);
> }
>
> /**
> @@ -85,9 +85,6 @@ void tcp_sock_iov_init(const struct ctx *c)
> struct iphdr iph = L2_BUF_IP4_INIT(IPPROTO_TCP);
> int i;
>
> - tcp6_eth_src.h_proto = htons_constant(ETH_P_IPV6);
> - tcp4_eth_src.h_proto = htons_constant(ETH_P_IP);
> -
> for (i = 0; i < ARRAY_SIZE(tcp_payload); i++) {
> tcp6_payload_ip[i] = ip6;
> tcp4_payload_ip[i] = iph;
> @@ -149,13 +146,15 @@ void tcp_payload_flush(const struct ctx *c)
>
> /**
> * tcp_l2_buf_fill_headers() - Fill 802.3, IP, TCP headers in pre-cooked buffers
> + * @c: Execution context
> * @conn: Connection pointer
> * @iov: Pointer to an array of iovec of TCP pre-cooked buffers
> * @check: Checksum, if already known
> * @seq: Sequence number for this segment
> * @no_tcp_csum: Do not set TCP checksum
> */
> -static void tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn,
> +static void tcp_l2_buf_fill_headers(const struct ctx *c,
> + struct tcp_tap_conn *conn,
> struct iovec *iov, const uint16_t *check,
> uint32_t seq, bool no_tcp_csum)
> {
> @@ -164,6 +163,7 @@ static void tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn,
> struct tap_hdr *taph = iov[TCP_IOV_TAP].iov_base;
> const struct flowside *tapside = TAPFLOW(conn);
> const struct in_addr *a4 = inany_v4(&tapside->oaddr);
> + struct ethhdr *eh = iov[TCP_IOV_ETH].iov_base;
> struct ipv6hdr *ip6h = NULL;
> struct iphdr *ip4h = NULL;
>
> @@ -172,7 +172,7 @@ static void tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn,
> else
> ip6h = iov[TCP_IOV_IP].iov_base;
>
> - tcp_fill_headers(conn, taph, ip4h, ip6h, th, &tail,
> + tcp_fill_headers(c, conn, taph, eh, ip4h, ip6h, th, &tail,
> check, seq, no_tcp_csum);
> }
>
> @@ -194,14 +194,12 @@ int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
> int ret;
>
> iov = tcp_l2_iov[tcp_payload_used];
> - if (CONN_V4(conn)) {
> + if (CONN_V4(conn))
> iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_payload_ip[tcp_payload_used]);
> - iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src;
> - } else {
> + else
> iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_payload_ip[tcp_payload_used]);
> - iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src;
> - }
>
> + iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp_eth_hdr[tcp_payload_used]);
> payload = iov[TCP_IOV_PAYLOAD].iov_base;
> seq = conn->seq_to_tap;
> ret = tcp_prepare_flags(c, conn, flags, &payload->th,
> @@ -212,7 +210,7 @@ int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
> tcp_payload_used++;
> l4len = optlen + sizeof(struct tcphdr);
> iov[TCP_IOV_PAYLOAD].iov_len = l4len;
> - tcp_l2_buf_fill_headers(conn, iov, NULL, seq, false);
> + tcp_l2_buf_fill_headers(c, conn, iov, NULL, seq, false);
>
> if (flags & DUP_ACK) {
> struct iovec *dup_iov = tcp_l2_iov[tcp_payload_used++];
> @@ -259,11 +257,10 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
> check = &iph->check;
> }
> iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_payload_ip[tcp_payload_used]);
> - iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src;
> } else if (CONN_V6(conn)) {
> iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_payload_ip[tcp_payload_used]);
> - iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src;
> }
> + iov[TCP_IOV_ETH].iov_base = &tcp_eth_hdr[tcp_payload_used];
> payload = iov[TCP_IOV_PAYLOAD].iov_base;
> payload->th.th_off = sizeof(struct tcphdr) / 4;
> payload->th.th_x2 = 0;
> @@ -271,7 +268,7 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
> payload->th.ack = 1;
> payload->th.psh = push;
> iov[TCP_IOV_PAYLOAD].iov_len = dlen + sizeof(struct tcphdr);
> - tcp_l2_buf_fill_headers(conn, iov, check, seq, false);
> + tcp_l2_buf_fill_headers(c, conn, iov, check, seq, false);
> if (++tcp_payload_used > TCP_FRAMES_MEM - 1)
> tcp_payload_flush(c);
> }
> diff --git a/tcp_internal.h b/tcp_internal.h
> index 36c6533..25f4cae 100644
> --- a/tcp_internal.h
> +++ b/tcp_internal.h
> @@ -166,8 +166,8 @@ void tcp_rst_do(const struct ctx *c, struct tcp_tap_conn *conn);
>
> struct tcp_info_linux;
>
> -void tcp_fill_headers(const struct tcp_tap_conn *conn,
> - struct tap_hdr *taph,
> +void tcp_fill_headers(const struct ctx *c, struct tcp_tap_conn *conn,
> + struct tap_hdr *taph, struct ethhdr *eh,
> struct iphdr *ip4h, struct ipv6hdr *ip6h,
> struct tcphdr *th, struct iov_tail *payload,
> const uint16_t *ip4_check, uint32_t seq, bool no_tcp_csum);
> diff --git a/tcp_vu.c b/tcp_vu.c
> index cb39bc2..c7e289d 100644
> --- a/tcp_vu.c
> +++ b/tcp_vu.c
> @@ -135,7 +135,7 @@ int tcp_vu_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
> flags_elem[0].in_sg[0].iov_len = hdrlen + optlen;
> payload = IOV_TAIL(flags_elem[0].in_sg, 1, hdrlen);
>
> - tcp_fill_headers(conn, NULL, ip4h, ip6h, th, &payload,
> + tcp_fill_headers(c, conn, NULL, eh, ip4h, ip6h, th, &payload,
> NULL, seq, !*c->pcap);
>
> if (*c->pcap) {
> @@ -315,7 +315,6 @@ static void tcp_vu_prepare(const struct ctx *c, struct tcp_tap_conn *conn,
> eh = vu_eth(base);
>
> memcpy(eh->h_dest, c->guest_mac, sizeof(eh->h_dest));
> - memcpy(eh->h_source, c->our_tap_mac, sizeof(eh->h_source));
>
> /* initialize header */
>
> @@ -339,7 +338,7 @@ static void tcp_vu_prepare(const struct ctx *c, struct tcp_tap_conn *conn,
> th->ack = 1;
> th->psh = push;
>
> - tcp_fill_headers(conn, NULL, ip4h, ip6h, th, &payload,
> + tcp_fill_headers(c, conn, NULL, eh, ip4h, ip6h, th, &payload,
> *check, conn->seq_to_tap, no_tcp_csum);
> if (ip4h)
> *check = &ip4h->check;
> --
> 2.50.1
>
--
David Gibson (he or they) | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you, not the other way
| around.
http://www.ozlabs.org/~dgibson
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]
^ permalink raw reply [flat|nested] 24+ messages in thread
* [PATCH v5 08/10] tap: change signature of function tap_push_l2h()
2025-09-06 2:11 [PATCH v5 00/10] Use true MAC address of LAN local remote hosts Jon Maloy
` (6 preceding siblings ...)
2025-09-06 2:11 ` [PATCH v5 07/10] tcp: " Jon Maloy
@ 2025-09-06 2:11 ` Jon Maloy
2025-09-08 3:21 ` David Gibson
2025-09-06 2:11 ` [PATCH v5 09/10] tcp: make tcp_rst_no_conn() respond with correct MAC address Jon Maloy
2025-09-06 2:11 ` [PATCH v5 10/10] icmp: let icmp use mac address from flowside structure Jon Maloy
9 siblings, 1 reply; 24+ messages in thread
From: Jon Maloy @ 2025-09-06 2:11 UTC (permalink / raw)
To: sbrivio, dgibson, david, jmaloy, passt-dev
In the following commits it must be possible for the callers of
function tap_push_l2h() to specify which source MAC address should
be added to the ethernet header sent over the tap interface. As
a preparation, we now add a new argument to that function, still
without actually using it.
Signed-off-by: Jon Maloy <jmaloy@redhat.com>
---
v3: -Improved comment for src_mac argument, as suggested by Stefano.
v4: -Re-added comment in tap.c and remove redundant argument check
in tap_push_l2()
v5: -Ensured that MAC argument to tap_push_l2h() never is NULL.
---
tap.c | 15 +++++++++------
tap.h | 3 ++-
tcp.c | 5 +++--
3 files changed, 14 insertions(+), 9 deletions(-)
diff --git a/tap.c b/tap.c
index 74557e1..05429aa 100644
--- a/tap.c
+++ b/tap.c
@@ -171,17 +171,20 @@ const struct in6_addr *tap_ip6_daddr(const struct ctx *c,
* tap_push_l2h() - Build an L2 header for an inbound packet
* @c: Execution context
* @buf: Buffer address at which to generate header
+ * @src_mac: MAC address to be used as source for message.
* @proto: Ethernet protocol number for L3
*
* Return: pointer at which to write the packet's payload
*/
-void *tap_push_l2h(const struct ctx *c, void *buf, uint16_t proto)
+void *tap_push_l2h(const struct ctx *c, void *buf,
+ const void *src_mac, uint16_t proto)
{
struct ethhdr *eh = (struct ethhdr *)buf;
/* TODO: ARP table lookup */
+
memcpy(eh->h_dest, c->guest_mac, ETH_ALEN);
- memcpy(eh->h_source, c->our_tap_mac, ETH_ALEN);
+ memcpy(eh->h_source, src_mac, ETH_ALEN);
eh->h_proto = ntohs(proto);
return eh + 1;
}
@@ -261,7 +264,7 @@ void tap_udp4_send(const struct ctx *c, struct in_addr src, in_port_t sport,
{
size_t l4len = dlen + sizeof(struct udphdr);
char buf[USHRT_MAX];
- struct iphdr *ip4h = tap_push_l2h(c, buf, ETH_P_IP);
+ struct iphdr *ip4h = tap_push_l2h(c, buf, c->our_tap_mac, ETH_P_IP);
struct udphdr *uh = tap_push_ip4h(ip4h, src, dst, l4len, IPPROTO_UDP);
char *data = tap_push_uh4(uh, src, sport, dst, dport, in, dlen);
@@ -281,7 +284,7 @@ void tap_icmp4_send(const struct ctx *c, struct in_addr src, struct in_addr dst,
const void *in, size_t l4len)
{
char buf[USHRT_MAX];
- struct iphdr *ip4h = tap_push_l2h(c, buf, ETH_P_IP);
+ struct iphdr *ip4h = tap_push_l2h(c, buf, c->our_tap_mac, ETH_P_IP);
struct icmphdr *icmp4h = tap_push_ip4h(ip4h, src, dst,
l4len, IPPROTO_ICMP);
@@ -367,7 +370,7 @@ void tap_udp6_send(const struct ctx *c,
{
size_t l4len = dlen + sizeof(struct udphdr);
char buf[USHRT_MAX];
- struct ipv6hdr *ip6h = tap_push_l2h(c, buf, ETH_P_IPV6);
+ struct ipv6hdr *ip6h = tap_push_l2h(c, buf, c->our_tap_mac, ETH_P_IPV6);
struct udphdr *uh = tap_push_ip6h(ip6h, src, dst,
l4len, IPPROTO_UDP, flow);
char *data = tap_push_uh6(uh, src, sport, dst, dport, in, dlen);
@@ -389,7 +392,7 @@ void tap_icmp6_send(const struct ctx *c,
const void *in, size_t l4len)
{
char buf[USHRT_MAX];
- struct ipv6hdr *ip6h = tap_push_l2h(c, buf, ETH_P_IPV6);
+ struct ipv6hdr *ip6h = tap_push_l2h(c, buf, c->our_tap_mac, ETH_P_IPV6);
struct icmp6hdr *icmp6h = tap_push_ip6h(ip6h, src, dst, l4len,
IPPROTO_ICMPV6, 0);
diff --git a/tap.h b/tap.h
index 21db4d2..02f7761 100644
--- a/tap.h
+++ b/tap.h
@@ -70,7 +70,8 @@ static inline void tap_hdr_update(struct tap_hdr *thdr, size_t l2len)
}
unsigned long tap_l2_max_len(const struct ctx *c);
-void *tap_push_l2h(const struct ctx *c, void *buf, uint16_t proto);
+void *tap_push_l2h(const struct ctx *c, void *buf,
+ const void *src_mac, uint16_t proto);
void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src,
struct in_addr dst, size_t l4len, uint8_t proto);
void *tap_push_uh4(struct udphdr *uh, struct in_addr src, in_port_t sport,
diff --git a/tcp.c b/tcp.c
index 23b45b8..383654c 100644
--- a/tcp.c
+++ b/tcp.c
@@ -1922,7 +1922,8 @@ static void tcp_rst_no_conn(const struct ctx *c, int af,
return;
if (af == AF_INET) {
- struct iphdr *ip4h = tap_push_l2h(c, buf, ETH_P_IP);
+ struct iphdr *ip4h = tap_push_l2h(c, buf, c->our_tap_mac,
+ ETH_P_IP);
const struct in_addr *rst_src = daddr;
const struct in_addr *rst_dst = saddr;
@@ -1932,7 +1933,7 @@ static void tcp_rst_no_conn(const struct ctx *c, int af,
*rst_src, *rst_dst);
} else {
- struct ipv6hdr *ip6h = tap_push_l2h(c, buf, ETH_P_IPV6);
+ struct ipv6hdr *ip6h = tap_push_l2h(c, buf, c->our_tap_mac, ETH_P_IPV6);
const struct in6_addr *rst_src = daddr;
const struct in6_addr *rst_dst = saddr;
--
@@ -1922,7 +1922,8 @@ static void tcp_rst_no_conn(const struct ctx *c, int af,
return;
if (af == AF_INET) {
- struct iphdr *ip4h = tap_push_l2h(c, buf, ETH_P_IP);
+ struct iphdr *ip4h = tap_push_l2h(c, buf, c->our_tap_mac,
+ ETH_P_IP);
const struct in_addr *rst_src = daddr;
const struct in_addr *rst_dst = saddr;
@@ -1932,7 +1933,7 @@ static void tcp_rst_no_conn(const struct ctx *c, int af,
*rst_src, *rst_dst);
} else {
- struct ipv6hdr *ip6h = tap_push_l2h(c, buf, ETH_P_IPV6);
+ struct ipv6hdr *ip6h = tap_push_l2h(c, buf, c->our_tap_mac, ETH_P_IPV6);
const struct in6_addr *rst_src = daddr;
const struct in6_addr *rst_dst = saddr;
--
2.50.1
^ permalink raw reply related [flat|nested] 24+ messages in thread
* Re: [PATCH v5 08/10] tap: change signature of function tap_push_l2h()
2025-09-06 2:11 ` [PATCH v5 08/10] tap: change signature of function tap_push_l2h() Jon Maloy
@ 2025-09-08 3:21 ` David Gibson
0 siblings, 0 replies; 24+ messages in thread
From: David Gibson @ 2025-09-08 3:21 UTC (permalink / raw)
To: Jon Maloy; +Cc: sbrivio, dgibson, passt-dev
[-- Attachment #1: Type: text/plain, Size: 5630 bytes --]
On Fri, Sep 05, 2025 at 10:11:52PM -0400, Jon Maloy wrote:
> In the following commits it must be possible for the callers of
> function tap_push_l2h() to specify which source MAC address should
> be added to the ethernet header sent over the tap interface. As
> a preparation, we now add a new argument to that function, still
> without actually using it.
>
> Signed-off-by: Jon Maloy <jmaloy@redhat.com>
LGTM.
> ---
> v3: -Improved comment for src_mac argument, as suggested by Stefano.
> v4: -Re-added comment in tap.c and remove redundant argument check
> in tap_push_l2()
> v5: -Ensured that MAC argument to tap_push_l2h() never is NULL.
> ---
> tap.c | 15 +++++++++------
> tap.h | 3 ++-
> tcp.c | 5 +++--
> 3 files changed, 14 insertions(+), 9 deletions(-)
>
> diff --git a/tap.c b/tap.c
> index 74557e1..05429aa 100644
> --- a/tap.c
> +++ b/tap.c
> @@ -171,17 +171,20 @@ const struct in6_addr *tap_ip6_daddr(const struct ctx *c,
> * tap_push_l2h() - Build an L2 header for an inbound packet
> * @c: Execution context
> * @buf: Buffer address at which to generate header
> + * @src_mac: MAC address to be used as source for message.
> * @proto: Ethernet protocol number for L3
> *
> * Return: pointer at which to write the packet's payload
> */
> -void *tap_push_l2h(const struct ctx *c, void *buf, uint16_t proto)
> +void *tap_push_l2h(const struct ctx *c, void *buf,
> + const void *src_mac, uint16_t proto)
> {
> struct ethhdr *eh = (struct ethhdr *)buf;
>
> /* TODO: ARP table lookup */
Kind of pre-existing, but the new context of this series makes this
comment a bit misleading. We now *have* an ARP table of sorts, but it
wouldn't help here - our new ARP table is host side, but this is guest
side. What this comment is talking about is not so much an ARP
_table_ lookup as an actual ARP lookup on the tap interface (which
would involve a caching table, but also sending actual packets).
> +
Spurious whitespace change.
> memcpy(eh->h_dest, c->guest_mac, ETH_ALEN);
> - memcpy(eh->h_source, c->our_tap_mac, ETH_ALEN);
> + memcpy(eh->h_source, src_mac, ETH_ALEN);
> eh->h_proto = ntohs(proto);
> return eh + 1;
> }
> @@ -261,7 +264,7 @@ void tap_udp4_send(const struct ctx *c, struct in_addr src, in_port_t sport,
> {
> size_t l4len = dlen + sizeof(struct udphdr);
> char buf[USHRT_MAX];
> - struct iphdr *ip4h = tap_push_l2h(c, buf, ETH_P_IP);
> + struct iphdr *ip4h = tap_push_l2h(c, buf, c->our_tap_mac, ETH_P_IP);
> struct udphdr *uh = tap_push_ip4h(ip4h, src, dst, l4len, IPPROTO_UDP);
> char *data = tap_push_uh4(uh, src, sport, dst, dport, in, dlen);
>
> @@ -281,7 +284,7 @@ void tap_icmp4_send(const struct ctx *c, struct in_addr src, struct in_addr dst,
> const void *in, size_t l4len)
> {
> char buf[USHRT_MAX];
> - struct iphdr *ip4h = tap_push_l2h(c, buf, ETH_P_IP);
> + struct iphdr *ip4h = tap_push_l2h(c, buf, c->our_tap_mac, ETH_P_IP);
> struct icmphdr *icmp4h = tap_push_ip4h(ip4h, src, dst,
> l4len, IPPROTO_ICMP);
>
> @@ -367,7 +370,7 @@ void tap_udp6_send(const struct ctx *c,
> {
> size_t l4len = dlen + sizeof(struct udphdr);
> char buf[USHRT_MAX];
> - struct ipv6hdr *ip6h = tap_push_l2h(c, buf, ETH_P_IPV6);
> + struct ipv6hdr *ip6h = tap_push_l2h(c, buf, c->our_tap_mac, ETH_P_IPV6);
> struct udphdr *uh = tap_push_ip6h(ip6h, src, dst,
> l4len, IPPROTO_UDP, flow);
> char *data = tap_push_uh6(uh, src, sport, dst, dport, in, dlen);
> @@ -389,7 +392,7 @@ void tap_icmp6_send(const struct ctx *c,
> const void *in, size_t l4len)
> {
> char buf[USHRT_MAX];
> - struct ipv6hdr *ip6h = tap_push_l2h(c, buf, ETH_P_IPV6);
> + struct ipv6hdr *ip6h = tap_push_l2h(c, buf, c->our_tap_mac, ETH_P_IPV6);
> struct icmp6hdr *icmp6h = tap_push_ip6h(ip6h, src, dst, l4len,
> IPPROTO_ICMPV6, 0);
>
> diff --git a/tap.h b/tap.h
> index 21db4d2..02f7761 100644
> --- a/tap.h
> +++ b/tap.h
> @@ -70,7 +70,8 @@ static inline void tap_hdr_update(struct tap_hdr *thdr, size_t l2len)
> }
>
> unsigned long tap_l2_max_len(const struct ctx *c);
> -void *tap_push_l2h(const struct ctx *c, void *buf, uint16_t proto);
> +void *tap_push_l2h(const struct ctx *c, void *buf,
> + const void *src_mac, uint16_t proto);
> void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src,
> struct in_addr dst, size_t l4len, uint8_t proto);
> void *tap_push_uh4(struct udphdr *uh, struct in_addr src, in_port_t sport,
> diff --git a/tcp.c b/tcp.c
> index 23b45b8..383654c 100644
> --- a/tcp.c
> +++ b/tcp.c
> @@ -1922,7 +1922,8 @@ static void tcp_rst_no_conn(const struct ctx *c, int af,
> return;
>
> if (af == AF_INET) {
> - struct iphdr *ip4h = tap_push_l2h(c, buf, ETH_P_IP);
> + struct iphdr *ip4h = tap_push_l2h(c, buf, c->our_tap_mac,
> + ETH_P_IP);
> const struct in_addr *rst_src = daddr;
> const struct in_addr *rst_dst = saddr;
>
> @@ -1932,7 +1933,7 @@ static void tcp_rst_no_conn(const struct ctx *c, int af,
> *rst_src, *rst_dst);
>
> } else {
> - struct ipv6hdr *ip6h = tap_push_l2h(c, buf, ETH_P_IPV6);
> + struct ipv6hdr *ip6h = tap_push_l2h(c, buf, c->our_tap_mac, ETH_P_IPV6);
> const struct in6_addr *rst_src = daddr;
> const struct in6_addr *rst_dst = saddr;
>
> --
> 2.50.1
>
--
David Gibson (he or they) | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you, not the other way
| around.
http://www.ozlabs.org/~dgibson
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]
^ permalink raw reply [flat|nested] 24+ messages in thread
* [PATCH v5 09/10] tcp: make tcp_rst_no_conn() respond with correct MAC address
2025-09-06 2:11 [PATCH v5 00/10] Use true MAC address of LAN local remote hosts Jon Maloy
` (7 preceding siblings ...)
2025-09-06 2:11 ` [PATCH v5 08/10] tap: change signature of function tap_push_l2h() Jon Maloy
@ 2025-09-06 2:11 ` Jon Maloy
2025-09-08 3:29 ` David Gibson
2025-09-06 2:11 ` [PATCH v5 10/10] icmp: let icmp use mac address from flowside structure Jon Maloy
9 siblings, 1 reply; 24+ messages in thread
From: Jon Maloy @ 2025-09-06 2:11 UTC (permalink / raw)
To: sbrivio, dgibson, david, jmaloy, passt-dev
tcp_rst_no_conn() needs to identify and specify which source MAC
address to use when sending an RST to the guest. This is because
it doesn't have access to any flow structure where this address
could be fetched.
Signed-off-by: Jon Maloy <jmaloy@redhat.com>
---
v3: - Adapted to the signature change in nl_mac_get() function, so that
we now consider only the template interface when checking the
ARP/NDP table.
v4: - Adapted to previous name changes in this series
v5: - Eliminated use of function fwd_iany_nat().
- Instead using the translation result of an attempted NAT lookup.
---
tcp.c | 14 +++++++++++---
1 file changed, 11 insertions(+), 3 deletions(-)
diff --git a/tcp.c b/tcp.c
index 383654c..54e75bb 100644
--- a/tcp.c
+++ b/tcp.c
@@ -1912,6 +1912,8 @@ static void tcp_rst_no_conn(const struct ctx *c, int af,
const struct tcphdr *th, size_t l4len)
{
struct iov_tail payload = IOV_TAIL(NULL, 0, 0);
+ unsigned char src_mac[ETH_ALEN];
+ union inany_addr tgt, tgt_nat;
struct tcphdr *rsth;
char buf[USHRT_MAX];
uint32_t psum = 0;
@@ -1921,9 +1923,15 @@ static void tcp_rst_no_conn(const struct ctx *c, int af,
if (th->rst)
return;
+ /* Try to use true MAC address if remote host's address or
+ * NAT translated address can be found in ARP/NDP table.
+ */
+ inany_from_af(&tgt, af, daddr);
+ nat_outbound(c, &tgt, &tgt_nat);
+ fwd_neigh_mac_get(c, &tgt_nat, src_mac);
+
if (af == AF_INET) {
- struct iphdr *ip4h = tap_push_l2h(c, buf, c->our_tap_mac,
- ETH_P_IP);
+ struct iphdr *ip4h = tap_push_l2h(c, buf, src_mac, ETH_P_IP);
const struct in_addr *rst_src = daddr;
const struct in_addr *rst_dst = saddr;
@@ -1933,7 +1941,7 @@ static void tcp_rst_no_conn(const struct ctx *c, int af,
*rst_src, *rst_dst);
} else {
- struct ipv6hdr *ip6h = tap_push_l2h(c, buf, c->our_tap_mac, ETH_P_IPV6);
+ struct ipv6hdr *ip6h = tap_push_l2h(c, buf, src_mac, ETH_P_IPV6);
const struct in6_addr *rst_src = daddr;
const struct in6_addr *rst_dst = saddr;
--
@@ -1912,6 +1912,8 @@ static void tcp_rst_no_conn(const struct ctx *c, int af,
const struct tcphdr *th, size_t l4len)
{
struct iov_tail payload = IOV_TAIL(NULL, 0, 0);
+ unsigned char src_mac[ETH_ALEN];
+ union inany_addr tgt, tgt_nat;
struct tcphdr *rsth;
char buf[USHRT_MAX];
uint32_t psum = 0;
@@ -1921,9 +1923,15 @@ static void tcp_rst_no_conn(const struct ctx *c, int af,
if (th->rst)
return;
+ /* Try to use true MAC address if remote host's address or
+ * NAT translated address can be found in ARP/NDP table.
+ */
+ inany_from_af(&tgt, af, daddr);
+ nat_outbound(c, &tgt, &tgt_nat);
+ fwd_neigh_mac_get(c, &tgt_nat, src_mac);
+
if (af == AF_INET) {
- struct iphdr *ip4h = tap_push_l2h(c, buf, c->our_tap_mac,
- ETH_P_IP);
+ struct iphdr *ip4h = tap_push_l2h(c, buf, src_mac, ETH_P_IP);
const struct in_addr *rst_src = daddr;
const struct in_addr *rst_dst = saddr;
@@ -1933,7 +1941,7 @@ static void tcp_rst_no_conn(const struct ctx *c, int af,
*rst_src, *rst_dst);
} else {
- struct ipv6hdr *ip6h = tap_push_l2h(c, buf, c->our_tap_mac, ETH_P_IPV6);
+ struct ipv6hdr *ip6h = tap_push_l2h(c, buf, src_mac, ETH_P_IPV6);
const struct in6_addr *rst_src = daddr;
const struct in6_addr *rst_dst = saddr;
--
2.50.1
^ permalink raw reply related [flat|nested] 24+ messages in thread
* Re: [PATCH v5 09/10] tcp: make tcp_rst_no_conn() respond with correct MAC address
2025-09-06 2:11 ` [PATCH v5 09/10] tcp: make tcp_rst_no_conn() respond with correct MAC address Jon Maloy
@ 2025-09-08 3:29 ` David Gibson
0 siblings, 0 replies; 24+ messages in thread
From: David Gibson @ 2025-09-08 3:29 UTC (permalink / raw)
To: Jon Maloy; +Cc: sbrivio, dgibson, passt-dev
[-- Attachment #1: Type: text/plain, Size: 3296 bytes --]
On Fri, Sep 05, 2025 at 10:11:53PM -0400, Jon Maloy wrote:
> tcp_rst_no_conn() needs to identify and specify which source MAC
> address to use when sending an RST to the guest. This is because
> it doesn't have access to any flow structure where this address
> could be fetched.
>
> Signed-off-by: Jon Maloy <jmaloy@redhat.com>
>
> ---
> v3: - Adapted to the signature change in nl_mac_get() function, so that
> we now consider only the template interface when checking the
> ARP/NDP table.
> v4: - Adapted to previous name changes in this series
> v5: - Eliminated use of function fwd_iany_nat().
> - Instead using the translation result of an attempted NAT lookup.
> ---
> tcp.c | 14 +++++++++++---
> 1 file changed, 11 insertions(+), 3 deletions(-)
>
> diff --git a/tcp.c b/tcp.c
> index 383654c..54e75bb 100644
> --- a/tcp.c
> +++ b/tcp.c
> @@ -1912,6 +1912,8 @@ static void tcp_rst_no_conn(const struct ctx *c, int af,
> const struct tcphdr *th, size_t l4len)
> {
> struct iov_tail payload = IOV_TAIL(NULL, 0, 0);
> + unsigned char src_mac[ETH_ALEN];
> + union inany_addr tgt, tgt_nat;
'tgt' is not a good name here. 'tgt' typically refers to the "target"
(non initiating) side of a flow. But in this instance there is no flow.
> struct tcphdr *rsth;
> char buf[USHRT_MAX];
> uint32_t psum = 0;
> @@ -1921,9 +1923,15 @@ static void tcp_rst_no_conn(const struct ctx *c, int af,
> if (th->rst)
> return;
>
> + /* Try to use true MAC address if remote host's address or
> + * NAT translated address can be found in ARP/NDP table.
> + */
> + inany_from_af(&tgt, af, daddr);
> + nat_outbound(c, &tgt, &tgt_nat);
> + fwd_neigh_mac_get(c, &tgt_nat, src_mac);
I'm not convinced we actually want to do a MAC lookup in this case.
In this case the guest has send us a packet that looks bogus. The RST
is explicitly something we're synthesizing, not something that's
_actually_ from the peer. We have to lie about the IP address or the
guest won't attach this RST to the right connection, but I don't see
that we need to lie about the MAC address as well - especially since
it has a cost to do so.
Or to look at it another way, if we get here the guest's idea of what
the surrounding network is already out of sync with passt's idea. So
trying to preserve that idea by faking the MAC is kind of pointless.
> if (af == AF_INET) {
> - struct iphdr *ip4h = tap_push_l2h(c, buf, c->our_tap_mac,
> - ETH_P_IP);
> + struct iphdr *ip4h = tap_push_l2h(c, buf, src_mac, ETH_P_IP);
> const struct in_addr *rst_src = daddr;
> const struct in_addr *rst_dst = saddr;
>
> @@ -1933,7 +1941,7 @@ static void tcp_rst_no_conn(const struct ctx *c, int af,
> *rst_src, *rst_dst);
>
> } else {
> - struct ipv6hdr *ip6h = tap_push_l2h(c, buf, c->our_tap_mac, ETH_P_IPV6);
> + struct ipv6hdr *ip6h = tap_push_l2h(c, buf, src_mac, ETH_P_IPV6);
> const struct in6_addr *rst_src = daddr;
> const struct in6_addr *rst_dst = saddr;
>
> --
> 2.50.1
>
--
David Gibson (he or they) | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you, not the other way
| around.
http://www.ozlabs.org/~dgibson
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]
^ permalink raw reply [flat|nested] 24+ messages in thread
* [PATCH v5 10/10] icmp: let icmp use mac address from flowside structure
2025-09-06 2:11 [PATCH v5 00/10] Use true MAC address of LAN local remote hosts Jon Maloy
` (8 preceding siblings ...)
2025-09-06 2:11 ` [PATCH v5 09/10] tcp: make tcp_rst_no_conn() respond with correct MAC address Jon Maloy
@ 2025-09-06 2:11 ` Jon Maloy
2025-09-08 3:35 ` David Gibson
9 siblings, 1 reply; 24+ messages in thread
From: Jon Maloy @ 2025-09-06 2:11 UTC (permalink / raw)
To: sbrivio, dgibson, david, jmaloy, passt-dev
Even ICMP needs to be updated to use the external MAC address instead
of just the own tap address when applicable. We do that here.
Signed-off-by: Jon Maloy <jmaloy@redhat.com>
---
v3: - Adapted to the move of external MAC address from struct flowside
to struct flow_common
v4: - Adapted to name changes in previous commits in this series
v5: - Added conditional lookup in ARP/NDP if the flow's tap_omac is
undefined
---
icmp.c | 8 ++++++--
ndp.c | 2 +-
tap.c | 10 ++++++----
tap.h | 4 ++--
udp.c | 12 ++++++++----
5 files changed, 23 insertions(+), 13 deletions(-)
diff --git a/icmp.c b/icmp.c
index 6dffafb..2d5bf0c 100644
--- a/icmp.c
+++ b/icmp.c
@@ -125,17 +125,21 @@ void icmp_sock_handler(const struct ctx *c, union epoll_ref ref)
flow_dbg(pingf, "echo reply to tap, ID: %"PRIu16", seq: %"PRIu16,
ini->eport, seq);
+ /* Try to find true MAC address in ARP/NDP table if needed */
+ if (mac_undefined(pingf->f.tap_omac))
+ fwd_neigh_mac_get(c, &ini->oaddr, pingf->f.tap_omac);
+
if (pingf->f.type == FLOW_PING4) {
const struct in_addr *saddr = inany_v4(&ini->oaddr);
const struct in_addr *daddr = inany_v4(&ini->eaddr);
ASSERT(saddr && daddr); /* Must have IPv4 addresses */
- tap_icmp4_send(c, *saddr, *daddr, buf, n);
+ tap_icmp4_send(c, *saddr, *daddr, buf, pingf->f.tap_omac, n);
} else if (pingf->f.type == FLOW_PING6) {
const struct in6_addr *saddr = &ini->oaddr.a6;
const struct in6_addr *daddr = &ini->eaddr.a6;
- tap_icmp6_send(c, saddr, daddr, buf, n);
+ tap_icmp6_send(c, saddr, daddr, buf, pingf->f.tap_omac, n);
}
return;
diff --git a/ndp.c b/ndp.c
index 820c556..e0e8938 100644
--- a/ndp.c
+++ b/ndp.c
@@ -184,7 +184,7 @@ static void ndp_send(const struct ctx *c, const struct in6_addr *dst,
{
const struct in6_addr *src = &c->ip6.our_tap_ll;
- tap_icmp6_send(c, src, dst, buf, l4len);
+ tap_icmp6_send(c, src, dst, buf, c->our_tap_mac, l4len);
}
/**
diff --git a/tap.c b/tap.c
index 05429aa..6443812 100644
--- a/tap.c
+++ b/tap.c
@@ -278,13 +278,14 @@ void tap_udp4_send(const struct ctx *c, struct in_addr src, in_port_t sport,
* @src: IPv4 source address
* @dst: IPv4 destination address
* @in: ICMP packet, including ICMP header
+ * @src_mac: MAC address to be used as source for message
* @l4len: ICMP packet length, including ICMP header
*/
void tap_icmp4_send(const struct ctx *c, struct in_addr src, struct in_addr dst,
- const void *in, size_t l4len)
+ const void *in, const void *src_mac, size_t l4len)
{
char buf[USHRT_MAX];
- struct iphdr *ip4h = tap_push_l2h(c, buf, c->our_tap_mac, ETH_P_IP);
+ struct iphdr *ip4h = tap_push_l2h(c, buf, src_mac, ETH_P_IP);
struct icmphdr *icmp4h = tap_push_ip4h(ip4h, src, dst,
l4len, IPPROTO_ICMP);
@@ -385,14 +386,15 @@ void tap_udp6_send(const struct ctx *c,
* @src: IPv6 source address
* @dst: IPv6 destination address
* @in: ICMP packet, including ICMP header
+ * @src_mac: MAC address to be used as source for message
* @l4len: ICMP packet length, including ICMP header
*/
void tap_icmp6_send(const struct ctx *c,
const struct in6_addr *src, const struct in6_addr *dst,
- const void *in, size_t l4len)
+ const void *in, const void *src_mac, size_t l4len)
{
char buf[USHRT_MAX];
- struct ipv6hdr *ip6h = tap_push_l2h(c, buf, c->our_tap_mac, ETH_P_IPV6);
+ struct ipv6hdr *ip6h = tap_push_l2h(c, buf, src_mac, ETH_P_IPV6);
struct icmp6hdr *icmp6h = tap_push_ip6h(ip6h, src, dst, l4len,
IPPROTO_ICMPV6, 0);
diff --git a/tap.h b/tap.h
index 02f7761..1864173 100644
--- a/tap.h
+++ b/tap.h
@@ -91,7 +91,7 @@ void tap_udp4_send(const struct ctx *c, struct in_addr src, in_port_t sport,
struct in_addr dst, in_port_t dport,
const void *in, size_t dlen);
void tap_icmp4_send(const struct ctx *c, struct in_addr src, struct in_addr dst,
- const void *in, size_t l4len);
+ const void *in, const void *src_mac, size_t l4len);
const struct in6_addr *tap_ip6_daddr(const struct ctx *c,
const struct in6_addr *src);
void *tap_push_ip6h(struct ipv6hdr *ip6h,
@@ -103,7 +103,7 @@ void tap_udp6_send(const struct ctx *c,
uint32_t flow, void *in, size_t dlen);
void tap_icmp6_send(const struct ctx *c,
const struct in6_addr *src, const struct in6_addr *dst,
- const void *in, size_t l4len);
+ const void *in, const void *src_mac, size_t l4len);
void tap_send_single(const struct ctx *c, const void *data, size_t l2len);
size_t tap_send_frames(const struct ctx *c, const struct iovec *iov,
size_t bufs_per_frame, size_t nframes);
diff --git a/udp.c b/udp.c
index 7d98845..26f8c22 100644
--- a/udp.c
+++ b/udp.c
@@ -385,6 +385,7 @@ static void udp_tap_prepare(const struct mmsghdr *mmh,
* udp_send_tap_icmp4() - Construct and send ICMPv4 to local peer
* @c: Execution context
* @ee: Extended error descriptor
+ * @uflow: UDP flow
* @toside: Destination side of flow
* @saddr: Address of ICMP generating node
* @in: First bytes (max 8) of original UDP message body
@@ -392,6 +393,7 @@ static void udp_tap_prepare(const struct mmsghdr *mmh,
*/
static void udp_send_tap_icmp4(const struct ctx *c,
const struct sock_extended_err *ee,
+ const struct udp_flow *uflow,
const struct flowside *toside,
struct in_addr saddr,
const void *in, size_t dlen)
@@ -421,7 +423,7 @@ static void udp_send_tap_icmp4(const struct ctx *c,
tap_push_uh4(&msg.uh, eaddr, eport, oaddr, oport, in, dlen);
memcpy(&msg.data, in, dlen);
- tap_icmp4_send(c, saddr, eaddr, &msg, msglen);
+ tap_icmp4_send(c, saddr, eaddr, &msg, uflow->f.tap_omac, msglen);
}
@@ -429,6 +431,7 @@ static void udp_send_tap_icmp4(const struct ctx *c,
* udp_send_tap_icmp6() - Construct and send ICMPv6 to local peer
* @c: Execution context
* @ee: Extended error descriptor
+ * @uflow: UDP flow
* @toside: Destination side of flow
* @saddr: Address of ICMP generating node
* @in: First bytes (max 1232) of original UDP message body
@@ -437,6 +440,7 @@ static void udp_send_tap_icmp4(const struct ctx *c,
*/
static void udp_send_tap_icmp6(const struct ctx *c,
const struct sock_extended_err *ee,
+ const struct udp_flow *uflow,
const struct flowside *toside,
const struct in6_addr *saddr,
void *in, size_t dlen, uint32_t flow)
@@ -466,7 +470,7 @@ static void udp_send_tap_icmp6(const struct ctx *c,
tap_push_uh6(&msg.uh, eaddr, eport, oaddr, oport, in, dlen);
memcpy(&msg.data, in, dlen);
- tap_icmp6_send(c, saddr, eaddr, &msg, msglen);
+ tap_icmp6_send(c, saddr, eaddr, &msg, uflow->f.tap_omac, msglen);
}
/**
@@ -626,12 +630,12 @@ static int udp_sock_recverr(const struct ctx *c, int s, flow_sidx_t sidx,
if (hdr->cmsg_level == IPPROTO_IP &&
(o4 = inany_v4(&otap)) && inany_v4(&toside->eaddr)) {
dlen = MIN(dlen, ICMP4_MAX_DLEN);
- udp_send_tap_icmp4(c, ee, toside, *o4, data, dlen);
+ udp_send_tap_icmp4(c, ee, uflow, toside, *o4, data, dlen);
return 1;
}
if (hdr->cmsg_level == IPPROTO_IPV6 && !inany_v4(&toside->eaddr)) {
- udp_send_tap_icmp6(c, ee, toside, &otap.a6, data, dlen,
+ udp_send_tap_icmp6(c, ee, uflow, toside, &otap.a6, data, dlen,
FLOW_IDX(uflow));
return 1;
}
--
@@ -385,6 +385,7 @@ static void udp_tap_prepare(const struct mmsghdr *mmh,
* udp_send_tap_icmp4() - Construct and send ICMPv4 to local peer
* @c: Execution context
* @ee: Extended error descriptor
+ * @uflow: UDP flow
* @toside: Destination side of flow
* @saddr: Address of ICMP generating node
* @in: First bytes (max 8) of original UDP message body
@@ -392,6 +393,7 @@ static void udp_tap_prepare(const struct mmsghdr *mmh,
*/
static void udp_send_tap_icmp4(const struct ctx *c,
const struct sock_extended_err *ee,
+ const struct udp_flow *uflow,
const struct flowside *toside,
struct in_addr saddr,
const void *in, size_t dlen)
@@ -421,7 +423,7 @@ static void udp_send_tap_icmp4(const struct ctx *c,
tap_push_uh4(&msg.uh, eaddr, eport, oaddr, oport, in, dlen);
memcpy(&msg.data, in, dlen);
- tap_icmp4_send(c, saddr, eaddr, &msg, msglen);
+ tap_icmp4_send(c, saddr, eaddr, &msg, uflow->f.tap_omac, msglen);
}
@@ -429,6 +431,7 @@ static void udp_send_tap_icmp4(const struct ctx *c,
* udp_send_tap_icmp6() - Construct and send ICMPv6 to local peer
* @c: Execution context
* @ee: Extended error descriptor
+ * @uflow: UDP flow
* @toside: Destination side of flow
* @saddr: Address of ICMP generating node
* @in: First bytes (max 1232) of original UDP message body
@@ -437,6 +440,7 @@ static void udp_send_tap_icmp4(const struct ctx *c,
*/
static void udp_send_tap_icmp6(const struct ctx *c,
const struct sock_extended_err *ee,
+ const struct udp_flow *uflow,
const struct flowside *toside,
const struct in6_addr *saddr,
void *in, size_t dlen, uint32_t flow)
@@ -466,7 +470,7 @@ static void udp_send_tap_icmp6(const struct ctx *c,
tap_push_uh6(&msg.uh, eaddr, eport, oaddr, oport, in, dlen);
memcpy(&msg.data, in, dlen);
- tap_icmp6_send(c, saddr, eaddr, &msg, msglen);
+ tap_icmp6_send(c, saddr, eaddr, &msg, uflow->f.tap_omac, msglen);
}
/**
@@ -626,12 +630,12 @@ static int udp_sock_recverr(const struct ctx *c, int s, flow_sidx_t sidx,
if (hdr->cmsg_level == IPPROTO_IP &&
(o4 = inany_v4(&otap)) && inany_v4(&toside->eaddr)) {
dlen = MIN(dlen, ICMP4_MAX_DLEN);
- udp_send_tap_icmp4(c, ee, toside, *o4, data, dlen);
+ udp_send_tap_icmp4(c, ee, uflow, toside, *o4, data, dlen);
return 1;
}
if (hdr->cmsg_level == IPPROTO_IPV6 && !inany_v4(&toside->eaddr)) {
- udp_send_tap_icmp6(c, ee, toside, &otap.a6, data, dlen,
+ udp_send_tap_icmp6(c, ee, uflow, toside, &otap.a6, data, dlen,
FLOW_IDX(uflow));
return 1;
}
--
2.50.1
^ permalink raw reply related [flat|nested] 24+ messages in thread
* Re: [PATCH v5 10/10] icmp: let icmp use mac address from flowside structure
2025-09-06 2:11 ` [PATCH v5 10/10] icmp: let icmp use mac address from flowside structure Jon Maloy
@ 2025-09-08 3:35 ` David Gibson
0 siblings, 0 replies; 24+ messages in thread
From: David Gibson @ 2025-09-08 3:35 UTC (permalink / raw)
To: Jon Maloy; +Cc: sbrivio, dgibson, passt-dev
[-- Attachment #1: Type: text/plain, Size: 8514 bytes --]
On Fri, Sep 05, 2025 at 10:11:54PM -0400, Jon Maloy wrote:
> Even ICMP needs to be updated to use the external MAC address instead
> of just the own tap address when applicable. We do that here.
>
> Signed-off-by: Jon Maloy <jmaloy@redhat.com>
>
> ---
> v3: - Adapted to the move of external MAC address from struct flowside
> to struct flow_common
> v4: - Adapted to name changes in previous commits in this series
> v5: - Added conditional lookup in ARP/NDP if the flow's tap_omac is
> undefined
> ---
> icmp.c | 8 ++++++--
> ndp.c | 2 +-
> tap.c | 10 ++++++----
> tap.h | 4 ++--
> udp.c | 12 ++++++++----
> 5 files changed, 23 insertions(+), 13 deletions(-)
>
> diff --git a/icmp.c b/icmp.c
> index 6dffafb..2d5bf0c 100644
> --- a/icmp.c
> +++ b/icmp.c
> @@ -125,17 +125,21 @@ void icmp_sock_handler(const struct ctx *c, union epoll_ref ref)
> flow_dbg(pingf, "echo reply to tap, ID: %"PRIu16", seq: %"PRIu16,
> ini->eport, seq);
>
> + /* Try to find true MAC address in ARP/NDP table if needed */
> + if (mac_undefined(pingf->f.tap_omac))
> + fwd_neigh_mac_get(c, &ini->oaddr, pingf->f.tap_omac);
> +
> if (pingf->f.type == FLOW_PING4) {
> const struct in_addr *saddr = inany_v4(&ini->oaddr);
> const struct in_addr *daddr = inany_v4(&ini->eaddr);
>
> ASSERT(saddr && daddr); /* Must have IPv4 addresses */
> - tap_icmp4_send(c, *saddr, *daddr, buf, n);
> + tap_icmp4_send(c, *saddr, *daddr, buf, pingf->f.tap_omac, n);
> } else if (pingf->f.type == FLOW_PING6) {
> const struct in6_addr *saddr = &ini->oaddr.a6;
> const struct in6_addr *daddr = &ini->eaddr.a6;
>
> - tap_icmp6_send(c, saddr, daddr, buf, n);
> + tap_icmp6_send(c, saddr, daddr, buf, pingf->f.tap_omac, n);
> }
> return;
>
> diff --git a/ndp.c b/ndp.c
> index 820c556..e0e8938 100644
> --- a/ndp.c
> +++ b/ndp.c
> @@ -184,7 +184,7 @@ static void ndp_send(const struct ctx *c, const struct in6_addr *dst,
> {
> const struct in6_addr *src = &c->ip6.our_tap_ll;
>
> - tap_icmp6_send(c, src, dst, buf, l4len);
> + tap_icmp6_send(c, src, dst, buf, c->our_tap_mac, l4len);
> }
>
> /**
> diff --git a/tap.c b/tap.c
> index 05429aa..6443812 100644
> --- a/tap.c
> +++ b/tap.c
> @@ -278,13 +278,14 @@ void tap_udp4_send(const struct ctx *c, struct in_addr src, in_port_t sport,
> * @src: IPv4 source address
> * @dst: IPv4 destination address
> * @in: ICMP packet, including ICMP header
> + * @src_mac: MAC address to be used as source for message
> * @l4len: ICMP packet length, including ICMP header
> */
> void tap_icmp4_send(const struct ctx *c, struct in_addr src, struct in_addr dst,
> - const void *in, size_t l4len)
> + const void *in, const void *src_mac, size_t l4len)
> {
> char buf[USHRT_MAX];
> - struct iphdr *ip4h = tap_push_l2h(c, buf, c->our_tap_mac, ETH_P_IP);
> + struct iphdr *ip4h = tap_push_l2h(c, buf, src_mac, ETH_P_IP);
> struct icmphdr *icmp4h = tap_push_ip4h(ip4h, src, dst,
> l4len, IPPROTO_ICMP);
>
> @@ -385,14 +386,15 @@ void tap_udp6_send(const struct ctx *c,
> * @src: IPv6 source address
> * @dst: IPv6 destination address
> * @in: ICMP packet, including ICMP header
> + * @src_mac: MAC address to be used as source for message
> * @l4len: ICMP packet length, including ICMP header
> */
> void tap_icmp6_send(const struct ctx *c,
> const struct in6_addr *src, const struct in6_addr *dst,
> - const void *in, size_t l4len)
> + const void *in, const void *src_mac, size_t l4len)
> {
> char buf[USHRT_MAX];
> - struct ipv6hdr *ip6h = tap_push_l2h(c, buf, c->our_tap_mac, ETH_P_IPV6);
> + struct ipv6hdr *ip6h = tap_push_l2h(c, buf, src_mac, ETH_P_IPV6);
> struct icmp6hdr *icmp6h = tap_push_ip6h(ip6h, src, dst, l4len,
> IPPROTO_ICMPV6, 0);
>
> diff --git a/tap.h b/tap.h
> index 02f7761..1864173 100644
> --- a/tap.h
> +++ b/tap.h
> @@ -91,7 +91,7 @@ void tap_udp4_send(const struct ctx *c, struct in_addr src, in_port_t sport,
> struct in_addr dst, in_port_t dport,
> const void *in, size_t dlen);
> void tap_icmp4_send(const struct ctx *c, struct in_addr src, struct in_addr dst,
> - const void *in, size_t l4len);
> + const void *in, const void *src_mac, size_t l4len);
> const struct in6_addr *tap_ip6_daddr(const struct ctx *c,
> const struct in6_addr *src);
> void *tap_push_ip6h(struct ipv6hdr *ip6h,
> @@ -103,7 +103,7 @@ void tap_udp6_send(const struct ctx *c,
> uint32_t flow, void *in, size_t dlen);
> void tap_icmp6_send(const struct ctx *c,
> const struct in6_addr *src, const struct in6_addr *dst,
> - const void *in, size_t l4len);
> + const void *in, const void *src_mac, size_t l4len);
> void tap_send_single(const struct ctx *c, const void *data, size_t l2len);
> size_t tap_send_frames(const struct ctx *c, const struct iovec *iov,
> size_t bufs_per_frame, size_t nframes);
> diff --git a/udp.c b/udp.c
> index 7d98845..26f8c22 100644
> --- a/udp.c
> +++ b/udp.c
> @@ -385,6 +385,7 @@ static void udp_tap_prepare(const struct mmsghdr *mmh,
> * udp_send_tap_icmp4() - Construct and send ICMPv4 to local peer
> * @c: Execution context
> * @ee: Extended error descriptor
> + * @uflow: UDP flow
> * @toside: Destination side of flow
> * @saddr: Address of ICMP generating node
> * @in: First bytes (max 8) of original UDP message body
> @@ -392,6 +393,7 @@ static void udp_tap_prepare(const struct mmsghdr *mmh,
> */
> static void udp_send_tap_icmp4(const struct ctx *c,
> const struct sock_extended_err *ee,
> + const struct udp_flow *uflow,
> const struct flowside *toside,
> struct in_addr saddr,
> const void *in, size_t dlen)
> @@ -421,7 +423,7 @@ static void udp_send_tap_icmp4(const struct ctx *c,
> tap_push_uh4(&msg.uh, eaddr, eport, oaddr, oport, in, dlen);
> memcpy(&msg.data, in, dlen);
>
> - tap_icmp4_send(c, saddr, eaddr, &msg, msglen);
> + tap_icmp4_send(c, saddr, eaddr, &msg, uflow->f.tap_omac, msglen);
As I noted on an earlier version, this is only correct if the ICMP is
coming from the peer, not an intermediate host. You'd need to look up
the MAC based on saddr, which could be different from the peer
address.
Or... I suspect it's probably good enough to just use our_tap_mac in
the case that (saddr != peer address).
> }
>
>
> @@ -429,6 +431,7 @@ static void udp_send_tap_icmp4(const struct ctx *c,
> * udp_send_tap_icmp6() - Construct and send ICMPv6 to local peer
> * @c: Execution context
> * @ee: Extended error descriptor
> + * @uflow: UDP flow
> * @toside: Destination side of flow
> * @saddr: Address of ICMP generating node
> * @in: First bytes (max 1232) of original UDP message body
> @@ -437,6 +440,7 @@ static void udp_send_tap_icmp4(const struct ctx *c,
> */
> static void udp_send_tap_icmp6(const struct ctx *c,
> const struct sock_extended_err *ee,
> + const struct udp_flow *uflow,
> const struct flowside *toside,
> const struct in6_addr *saddr,
> void *in, size_t dlen, uint32_t flow)
> @@ -466,7 +470,7 @@ static void udp_send_tap_icmp6(const struct ctx *c,
> tap_push_uh6(&msg.uh, eaddr, eport, oaddr, oport, in, dlen);
> memcpy(&msg.data, in, dlen);
>
> - tap_icmp6_send(c, saddr, eaddr, &msg, msglen);
> + tap_icmp6_send(c, saddr, eaddr, &msg, uflow->f.tap_omac, msglen);
Same thing here.
> }
>
> /**
> @@ -626,12 +630,12 @@ static int udp_sock_recverr(const struct ctx *c, int s, flow_sidx_t sidx,
> if (hdr->cmsg_level == IPPROTO_IP &&
> (o4 = inany_v4(&otap)) && inany_v4(&toside->eaddr)) {
> dlen = MIN(dlen, ICMP4_MAX_DLEN);
> - udp_send_tap_icmp4(c, ee, toside, *o4, data, dlen);
> + udp_send_tap_icmp4(c, ee, uflow, toside, *o4, data, dlen);
> return 1;
> }
>
> if (hdr->cmsg_level == IPPROTO_IPV6 && !inany_v4(&toside->eaddr)) {
> - udp_send_tap_icmp6(c, ee, toside, &otap.a6, data, dlen,
> + udp_send_tap_icmp6(c, ee, uflow, toside, &otap.a6, data, dlen,
> FLOW_IDX(uflow));
> return 1;
> }
> --
> 2.50.1
>
--
David Gibson (he or they) | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you, not the other way
| around.
http://www.ozlabs.org/~dgibson
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]
^ permalink raw reply [flat|nested] 24+ messages in thread