* [PATCH v11 1/9] netlink: add subsciption on changes in NDP/ARP table
2025-09-27 19:25 [PATCH v11 0/9] Use true MAC address of LAN local remote hosts Jon Maloy
@ 2025-09-27 19:25 ` Jon Maloy
2025-09-29 4:29 ` David Gibson
2025-09-29 23:58 ` Stefano Brivio
2025-09-27 19:25 ` [PATCH v11 2/9] fwd: Add cache table for ARP/NDP contents Jon Maloy
` (7 subsequent siblings)
8 siblings, 2 replies; 27+ messages in thread
From: Jon Maloy @ 2025-09-27 19:25 UTC (permalink / raw)
To: sbrivio, dgibson, david, jmaloy, passt-dev
The solution to bug https://bugs.passt.top/show_bug.cgi?id=120
requires the ability to translate from an IP address to its
corresponding MAC address in cases where those are present in
the ARP or NDP tables.
To keep track of the contents of these tables we add a netlink
based neighbour subscription feature.
Signed-off-by: Jon Maloy <jmaloy@redhat.com>
---
v3: - Added an attribute contianing NDA_DST to sent message, so
that we let the kernel do the filtering of the IP address
and return only one entry.
- Added interface index to the call signature. Since the only
interface we know is the template interface, this limits
the number of hosts that will be seen as 'network segment
local' from a PASST viewpoint.
v4: - Made loop independent of attribute order.
- Ignoring L2 addresses which are not of size ETH_ALEN.
v5: - Changed return value of new function, so caller can know if
a MAC address really was found.
v6: - Removed warning printout which had ended up in the wrong
commit.
v8: - Changed to neighbour event subscription model
- netlink: arp/ndp table subscription
v10:- Updated according to David's latest comments on v8
- Added functionaly where we initially read current
state of ARP/NDP tables
---
epoll_type.h | 2 +
netlink.c | 180 +++++++++++++++++++++++++++++++++++++++++++++++++++
netlink.h | 4 ++
passt.c | 8 +++
4 files changed, 194 insertions(+)
diff --git a/epoll_type.h b/epoll_type.h
index 12ac59b..a90ffb6 100644
--- a/epoll_type.h
+++ b/epoll_type.h
@@ -44,6 +44,8 @@ enum epoll_type {
EPOLL_TYPE_REPAIR_LISTEN,
/* TCP_REPAIR helper socket */
EPOLL_TYPE_REPAIR,
+ /* Netlink neighbour subscription socket */
+ EPOLL_TYPE_NL_NEIGH,
EPOLL_NUM_TYPES,
};
diff --git a/netlink.c b/netlink.c
index c436780..0dc4e9d 100644
--- a/netlink.c
+++ b/netlink.c
@@ -26,6 +26,7 @@
#include <arpa/inet.h>
#include <netinet/in.h>
#include <netinet/if_ether.h>
+#include <net/if_arp.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
@@ -40,6 +41,10 @@
#define RTNH_NEXT_AND_DEC(rtnh, attrlen) \
((attrlen) -= RTNH_ALIGN((rtnh)->rtnh_len), RTNH_NEXT(rtnh))
+/* Convenience macro borrowed from kernel */
+#define NUD_VALID (NUD_PERMANENT | NUD_NOARP | \
+ NUD_REACHABLE | NUD_PROBE | NUD_STALE)
+
/* Netlink expects a buffer of at least 8kiB or the system page size,
* whichever is larger. 32kiB is recommended for more efficient.
* Since the largest page size on any remotely common Linux setup is
@@ -53,6 +58,7 @@
int nl_sock = -1;
int nl_sock_ns = -1;
static int nl_seq = 1;
+static int nl_sock_neigh = -1;
/**
* nl_sock_init_do() - Set up netlink sockets in init or target namespace
@@ -1103,3 +1109,177 @@ int nl_link_set_flags(int s, unsigned int ifi,
return nl_do(s, &req, RTM_NEWLINK, 0, sizeof(req));
}
+
+/**
+ * nl_neigh_msg_read() - Interpret a neigbour state message from netlink
+ *
+ */
+static void nl_neigh_msg_read(const struct ctx *c, struct nlmsghdr *nh)
+{
+ struct ndmsg *ndm = NLMSG_DATA(nh);
+ struct rtattr *rta = (struct rtattr *)(ndm + 1);
+ size_t na = NLMSG_PAYLOAD(nh, sizeof(*ndm));
+ char ip_str[INET6_ADDRSTRLEN];
+ char mac_str[ETH_ADDRSTRLEN];
+ const uint8_t *lladdr = NULL;
+ const void *dst = NULL;
+ size_t lladdr_len = 0;
+ uint8_t mac[ETH_ALEN];
+ union inany_addr addr;
+ size_t dstlen = 0;
+
+ if (nh->nlmsg_type == NLMSG_DONE)
+ return;
+
+ if (nh->nlmsg_type == NLMSG_ERROR) {
+ warn("NLMSG type error");
+ return;
+ }
+
+ if (nh->nlmsg_type != RTM_NEWNEIGH &&
+ nh->nlmsg_type != RTM_DELNEIGH)
+ return;
+
+ for (; RTA_OK(rta, na); rta = RTA_NEXT(rta, na)) {
+ switch (rta->rta_type) {
+ case NDA_DST:
+ dst = RTA_DATA(rta);
+ dstlen = RTA_PAYLOAD(rta);
+ break;
+ case NDA_LLADDR:
+ lladdr = RTA_DATA(rta);
+ lladdr_len = RTA_PAYLOAD(rta);
+ break;
+ default:
+ break;
+ }
+ }
+
+ if (!dst)
+ return;
+
+ if (!lladdr || lladdr_len != ETH_ALEN)
+ return;
+
+ if (ndm->ndm_type != ARPHRD_ETHER)
+ return;
+
+ memcpy(mac, lladdr, ETH_ALEN);
+ eth_ntop(mac, mac_str, sizeof(mac_str));
+
+ if (ndm->ndm_family == AF_INET &&
+ (dstlen != sizeof(struct in_addr) ||
+ ndm->ndm_ifindex != c->ifi4))
+ return;
+
+ if (ndm->ndm_family == AF_INET6 &&
+ (dstlen != sizeof(struct in6_addr) ||
+ ndm->ndm_ifindex != c->ifi6))
+ return;
+
+ if (ndm->ndm_family == AF_INET)
+ addr = inany_from_v4(*(struct in_addr *) dst);
+ else if (ndm->ndm_family == AF_INET6)
+ addr.a6 = *(struct in6_addr *) dst;
+ else
+ return;
+
+ inet_ntop(ndm->ndm_family, dst, ip_str, sizeof(ip_str));
+
+ if (nh->nlmsg_type == RTM_NEWNEIGH &&
+ ndm->ndm_state & NUD_VALID) {
+ trace("neigh table update: %s / %s", ip_str, mac_str);
+ } else {
+ trace("neigh table delete: %s / %s", ip_str, mac_str);
+ }
+}
+
+/**
+ * nl_neigh_subscr_sync() - Read current contents ARP/NDP tables
+ *
+ */
+static void nl_neigh_subscr_sync(const struct ctx *c, int proto, int ifi)
+{
+ struct {
+ struct nlmsghdr nlh;
+ struct ndmsg ndm;
+ } req = { 0 };
+ struct nlmsghdr *nh;
+ char buf[NLBUFSIZ];
+ ssize_t status;
+ uint32_t seq;
+
+ req.ndm.ndm_family = proto;
+ req.ndm.ndm_ifindex = ifi;
+ seq = nl_send(nl_sock_neigh, &req, RTM_GETNEIGH, NLM_F_DUMP, sizeof(req));
+ nl_foreach_oftype(nh, status, nl_sock_neigh, buf, seq, RTM_NEWNEIGH)
+ nl_neigh_msg_read(c, nh);
+}
+
+/**
+ * nl_neigh_subscr_handler() - Non-blocking drain of pending neighbour updates
+ * @c: Execution context
+ */
+void nl_neigh_subscr_handler(struct ctx *c)
+{
+ struct nlmsghdr *nh;
+ char buf[NLBUFSIZ];
+ ssize_t n;
+
+ for (;;) {
+ n = recv(nl_sock_neigh, buf, sizeof(buf), MSG_DONTWAIT);
+ if (n < 0) {
+ if (errno != EAGAIN)
+ warn_perror("Failed to read from netlink sock");
+ return;
+ }
+ nh = (struct nlmsghdr *)buf;
+ for (; NLMSG_OK(nh, n); nh = NLMSG_NEXT(nh, n))
+ nl_neigh_msg_read(c, nh);
+ }
+}
+
+/**
+ * nl_neigh_subscr_init() - Open netlink sock and subscribe to neighbour events
+ *
+ * Return: 0 on success, -1 on failure
+ */
+int nl_neigh_subscr_init(struct ctx *c)
+{
+ union epoll_ref ref = {
+ .type = EPOLL_TYPE_NL_NEIGH
+ };
+ struct epoll_event ev = {
+ .events = EPOLLIN
+ };
+ struct sockaddr_nl addr = {
+ .nl_family = AF_NETLINK,
+ .nl_groups = RTMGRP_NEIGH,
+ };
+
+ if (nl_sock_neigh >= 0)
+ return 0;
+
+ nl_sock_neigh = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, NETLINK_ROUTE);
+ if (nl_sock_neigh < 0)
+ return -1;
+
+ if (bind(nl_sock_neigh, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
+ close(nl_sock_neigh);
+ nl_sock_neigh = -1;
+ return -1;
+ }
+
+ ref.fd = nl_sock_neigh;
+ ev.data.u64 = ref.u64;
+ if (epoll_ctl(c->epollfd, EPOLL_CTL_ADD, nl_sock_neigh, &ev) == -1) {
+ close(nl_sock_neigh);
+ nl_sock_neigh = -1;
+ return -1;
+ }
+
+ nl_neigh_subscr_sync(c, AF_INET, c->ifi4);
+ nl_neigh_subscr_sync(c, AF_INET6, c->ifi6);
+
+ return 0;
+}
diff --git a/netlink.h b/netlink.h
index b51e99c..a7d3506 100644
--- a/netlink.h
+++ b/netlink.h
@@ -17,6 +17,8 @@ int nl_route_dup(int s_src, unsigned int ifi_src,
int s_dst, unsigned int ifi_dst, sa_family_t af);
int nl_addr_get(int s, unsigned int ifi, sa_family_t af,
void *addr, int *prefix_len, void *addr_l);
+bool nl_neigh_mac_get(int s, const union inany_addr *addr, int ifi,
+ unsigned char *mac);
int nl_addr_set(int s, unsigned int ifi, sa_family_t af,
const void *addr, int prefix_len);
int nl_addr_get_ll(int s, unsigned int ifi, struct in6_addr *addr);
@@ -28,5 +30,7 @@ int nl_link_set_mac(int s, unsigned int ifi, const void *mac);
int nl_link_set_mtu(int s, unsigned int ifi, int mtu);
int nl_link_set_flags(int s, unsigned int ifi,
unsigned int set, unsigned int change);
+int nl_neigh_subscr_init(struct ctx *c);
+void nl_neigh_subscr_handler(struct ctx *c);
#endif /* NETLINK_H */
diff --git a/passt.c b/passt.c
index 31fbb75..e20bbad 100644
--- a/passt.c
+++ b/passt.c
@@ -53,6 +53,7 @@
#include "vu_common.h"
#include "migrate.h"
#include "repair.h"
+#include "netlink.h"
#define EPOLL_EVENTS 8
@@ -79,6 +80,7 @@ char *epoll_type_str[] = {
[EPOLL_TYPE_VHOST_KICK] = "vhost-user kick socket",
[EPOLL_TYPE_REPAIR_LISTEN] = "TCP_REPAIR helper listening socket",
[EPOLL_TYPE_REPAIR] = "TCP_REPAIR helper socket",
+ [EPOLL_TYPE_NL_NEIGH] = "netlink neighbour subscription socket",
};
static_assert(ARRAY_SIZE(epoll_type_str) == EPOLL_NUM_TYPES,
"epoll_type_str[] doesn't match enum epoll_type");
@@ -322,6 +324,9 @@ int main(int argc, char **argv)
pcap_init(&c);
+ if (nl_neigh_subscr_init(&c) < 0)
+ warn("Failed to subscribe to RTMGRP_NEIGH");
+
if (!c.foreground) {
if ((devnull_fd = open("/dev/null", O_RDWR | O_CLOEXEC)) < 0)
die_perror("Failed to open /dev/null");
@@ -414,6 +419,9 @@ loop:
case EPOLL_TYPE_REPAIR:
repair_handler(&c, eventmask);
break;
+ case EPOLL_TYPE_NL_NEIGH:
+ nl_neigh_subscr_handler(&c);
+ break;
default:
/* Can't happen */
ASSERT(0);
--
2.50.1
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH v11 1/9] netlink: add subsciption on changes in NDP/ARP table
2025-09-27 19:25 ` [PATCH v11 1/9] netlink: add subsciption on changes in NDP/ARP table Jon Maloy
@ 2025-09-29 4:29 ` David Gibson
2025-09-29 23:58 ` Stefano Brivio
1 sibling, 0 replies; 27+ messages in thread
From: David Gibson @ 2025-09-29 4:29 UTC (permalink / raw)
To: Jon Maloy; +Cc: sbrivio, dgibson, passt-dev
[-- Attachment #1: Type: text/plain, Size: 11508 bytes --]
On Sat, Sep 27, 2025 at 03:25:14PM -0400, Jon Maloy wrote:
> The solution to bug https://bugs.passt.top/show_bug.cgi?id=120
> requires the ability to translate from an IP address to its
> corresponding MAC address in cases where those are present in
> the ARP or NDP tables.
>
> To keep track of the contents of these tables we add a netlink
> based neighbour subscription feature.
>
> Signed-off-by: Jon Maloy <jmaloy@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Although there are some suggestions for polish below.
>
> ---
> v3: - Added an attribute contianing NDA_DST to sent message, so
> that we let the kernel do the filtering of the IP address
> and return only one entry.
> - Added interface index to the call signature. Since the only
> interface we know is the template interface, this limits
> the number of hosts that will be seen as 'network segment
> local' from a PASST viewpoint.
> v4: - Made loop independent of attribute order.
> - Ignoring L2 addresses which are not of size ETH_ALEN.
> v5: - Changed return value of new function, so caller can know if
> a MAC address really was found.
> v6: - Removed warning printout which had ended up in the wrong
> commit.
> v8: - Changed to neighbour event subscription model
> - netlink: arp/ndp table subscription
> v10:- Updated according to David's latest comments on v8
> - Added functionaly where we initially read current
> state of ARP/NDP tables
> ---
> epoll_type.h | 2 +
> netlink.c | 180 +++++++++++++++++++++++++++++++++++++++++++++++++++
> netlink.h | 4 ++
> passt.c | 8 +++
> 4 files changed, 194 insertions(+)
>
> diff --git a/epoll_type.h b/epoll_type.h
> index 12ac59b..a90ffb6 100644
> --- a/epoll_type.h
> +++ b/epoll_type.h
> @@ -44,6 +44,8 @@ enum epoll_type {
> EPOLL_TYPE_REPAIR_LISTEN,
> /* TCP_REPAIR helper socket */
> EPOLL_TYPE_REPAIR,
> + /* Netlink neighbour subscription socket */
> + EPOLL_TYPE_NL_NEIGH,
>
> EPOLL_NUM_TYPES,
> };
> diff --git a/netlink.c b/netlink.c
> index c436780..0dc4e9d 100644
> --- a/netlink.c
> +++ b/netlink.c
> @@ -26,6 +26,7 @@
> #include <arpa/inet.h>
> #include <netinet/in.h>
> #include <netinet/if_ether.h>
> +#include <net/if_arp.h>
>
> #include <linux/netlink.h>
> #include <linux/rtnetlink.h>
> @@ -40,6 +41,10 @@
> #define RTNH_NEXT_AND_DEC(rtnh, attrlen) \
> ((attrlen) -= RTNH_ALIGN((rtnh)->rtnh_len), RTNH_NEXT(rtnh))
>
> +/* Convenience macro borrowed from kernel */
> +#define NUD_VALID (NUD_PERMANENT | NUD_NOARP | \
> + NUD_REACHABLE | NUD_PROBE | NUD_STALE)
> +
> /* Netlink expects a buffer of at least 8kiB or the system page size,
> * whichever is larger. 32kiB is recommended for more efficient.
> * Since the largest page size on any remotely common Linux setup is
> @@ -53,6 +58,7 @@
> int nl_sock = -1;
> int nl_sock_ns = -1;
> static int nl_seq = 1;
> +static int nl_sock_neigh = -1;
>
> /**
> * nl_sock_init_do() - Set up netlink sockets in init or target namespace
> @@ -1103,3 +1109,177 @@ int nl_link_set_flags(int s, unsigned int ifi,
>
> return nl_do(s, &req, RTM_NEWLINK, 0, sizeof(req));
> }
> +
> +/**
> + * nl_neigh_msg_read() - Interpret a neigbour state message from netlink
> + *
> + */
> +static void nl_neigh_msg_read(const struct ctx *c, struct nlmsghdr *nh)
> +{
> + struct ndmsg *ndm = NLMSG_DATA(nh);
> + struct rtattr *rta = (struct rtattr *)(ndm + 1);
> + size_t na = NLMSG_PAYLOAD(nh, sizeof(*ndm));
> + char ip_str[INET6_ADDRSTRLEN];
> + char mac_str[ETH_ADDRSTRLEN];
> + const uint8_t *lladdr = NULL;
> + const void *dst = NULL;
> + size_t lladdr_len = 0;
> + uint8_t mac[ETH_ALEN];
> + union inany_addr addr;
> + size_t dstlen = 0;
> +
> + if (nh->nlmsg_type == NLMSG_DONE)
> + return;
> +
> + if (nh->nlmsg_type == NLMSG_ERROR) {
> + warn("NLMSG type error");
This message could be a bit more helpful, e.g. saying that it's on the
neighbour watching socket. There's also an errno in the message (see
nl_status()).
> + return;
> + }
> +
> + if (nh->nlmsg_type != RTM_NEWNEIGH &&
> + nh->nlmsg_type != RTM_DELNEIGH)
> + return;
> +
> + for (; RTA_OK(rta, na); rta = RTA_NEXT(rta, na)) {
> + switch (rta->rta_type) {
> + case NDA_DST:
> + dst = RTA_DATA(rta);
> + dstlen = RTA_PAYLOAD(rta);
> + break;
> + case NDA_LLADDR:
> + lladdr = RTA_DATA(rta);
> + lladdr_len = RTA_PAYLOAD(rta);
> + break;
> + default:
> + break;
> + }
> + }
> +
> + if (!dst)
> + return;
> +
> + if (!lladdr || lladdr_len != ETH_ALEN)
> + return;
> +
> + if (ndm->ndm_type != ARPHRD_ETHER)
> + return;
> +
> + memcpy(mac, lladdr, ETH_ALEN);
> + eth_ntop(mac, mac_str, sizeof(mac_str));
It would be nice to avoid the stringification if we're not printing
the trace message, but that is kind of awkward to do.
> + if (ndm->ndm_family == AF_INET &&
> + (dstlen != sizeof(struct in_addr) ||
> + ndm->ndm_ifindex != c->ifi4))
> + return;
> +
> + if (ndm->ndm_family == AF_INET6 &&
> + (dstlen != sizeof(struct in6_addr) ||
> + ndm->ndm_ifindex != c->ifi6))
> + return;
Hmm. These are basically correct, but make me uncomfortable, because
they're checking for two very different conditions. ifindex != ifi is
basically an expected thing; we get a message for an interface we
don't care about, ignore it and carry on.
The other condition - addr length doesn't match addr family means the
kernel has done something really unexpected. There's not much we can
do but ignore it (maybe log) and carry on, but lumping it together
with the expected cases seems misleading to me.
> +
> + if (ndm->ndm_family == AF_INET)
> + addr = inany_from_v4(*(struct in_addr *) dst);
> + else if (ndm->ndm_family == AF_INET6)
> + addr.a6 = *(struct in6_addr *) dst;
> + else
> + return;
You can use inany_from_af(&addr, ndm->ndm_family, dst)
You will need to exclude non-IP addresses beforehand, though.
> +
> + inet_ntop(ndm->ndm_family, dst, ip_str, sizeof(ip_str));
> +
> + if (nh->nlmsg_type == RTM_NEWNEIGH &&
> + ndm->ndm_state & NUD_VALID) {
> + trace("neigh table update: %s / %s", ip_str, mac_str);
> + } else {
> + trace("neigh table delete: %s / %s", ip_str, mac_str);
> + }
> +}
> +
> +/**
> + * nl_neigh_subscr_sync() - Read current contents ARP/NDP tables
> + *
> + */
> +static void nl_neigh_subscr_sync(const struct ctx *c, int proto, int ifi)
> +{
> + struct {
> + struct nlmsghdr nlh;
> + struct ndmsg ndm;
> + } req = { 0 };
> + struct nlmsghdr *nh;
> + char buf[NLBUFSIZ];
> + ssize_t status;
> + uint32_t seq;
> +
> + req.ndm.ndm_family = proto;
> + req.ndm.ndm_ifindex = ifi;
> + seq = nl_send(nl_sock_neigh, &req, RTM_GETNEIGH, NLM_F_DUMP, sizeof(req));
> + nl_foreach_oftype(nh, status, nl_sock_neigh, buf, seq, RTM_NEWNEIGH)
> + nl_neigh_msg_read(c, nh);
Should probably check status afterwards, and log an error if necessary.
> +}
> +
> +/**
> + * nl_neigh_subscr_handler() - Non-blocking drain of pending neighbour updates
> + * @c: Execution context
> + */
> +void nl_neigh_subscr_handler(struct ctx *c)
> +{
> + struct nlmsghdr *nh;
> + char buf[NLBUFSIZ];
> + ssize_t n;
> +
> + for (;;) {
> + n = recv(nl_sock_neigh, buf, sizeof(buf), MSG_DONTWAIT);
> + if (n < 0) {
> + if (errno != EAGAIN)
> + warn_perror("Failed to read from netlink sock");
> + return;
> + }
> + nh = (struct nlmsghdr *)buf;
> + for (; NLMSG_OK(nh, n); nh = NLMSG_NEXT(nh, n))
> + nl_neigh_msg_read(c, nh);
> + }
> +}
> +
> +/**
> + * nl_neigh_subscr_init() - Open netlink sock and subscribe to neighbour events
> + *
> + * Return: 0 on success, -1 on failure
> + */
> +int nl_neigh_subscr_init(struct ctx *c)
> +{
> + union epoll_ref ref = {
> + .type = EPOLL_TYPE_NL_NEIGH
> + };
> + struct epoll_event ev = {
> + .events = EPOLLIN
> + };
> + struct sockaddr_nl addr = {
> + .nl_family = AF_NETLINK,
> + .nl_groups = RTMGRP_NEIGH,
> + };
> +
> + if (nl_sock_neigh >= 0)
> + return 0;
> +
> + nl_sock_neigh = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, NETLINK_ROUTE);
> + if (nl_sock_neigh < 0)
> + return -1;
> +
> + if (bind(nl_sock_neigh, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
> + close(nl_sock_neigh);
> + nl_sock_neigh = -1;
> + return -1;
> + }
> +
> + ref.fd = nl_sock_neigh;
> + ev.data.u64 = ref.u64;
> + if (epoll_ctl(c->epollfd, EPOLL_CTL_ADD, nl_sock_neigh, &ev) == -1) {
> + close(nl_sock_neigh);
> + nl_sock_neigh = -1;
> + return -1;
> + }
> +
> + nl_neigh_subscr_sync(c, AF_INET, c->ifi4);
> + nl_neigh_subscr_sync(c, AF_INET6, c->ifi6);
> +
> + return 0;
> +}
> diff --git a/netlink.h b/netlink.h
> index b51e99c..a7d3506 100644
> --- a/netlink.h
> +++ b/netlink.h
> @@ -17,6 +17,8 @@ int nl_route_dup(int s_src, unsigned int ifi_src,
> int s_dst, unsigned int ifi_dst, sa_family_t af);
> int nl_addr_get(int s, unsigned int ifi, sa_family_t af,
> void *addr, int *prefix_len, void *addr_l);
> +bool nl_neigh_mac_get(int s, const union inany_addr *addr, int ifi,
> + unsigned char *mac);
> int nl_addr_set(int s, unsigned int ifi, sa_family_t af,
> const void *addr, int prefix_len);
> int nl_addr_get_ll(int s, unsigned int ifi, struct in6_addr *addr);
> @@ -28,5 +30,7 @@ int nl_link_set_mac(int s, unsigned int ifi, const void *mac);
> int nl_link_set_mtu(int s, unsigned int ifi, int mtu);
> int nl_link_set_flags(int s, unsigned int ifi,
> unsigned int set, unsigned int change);
> +int nl_neigh_subscr_init(struct ctx *c);
> +void nl_neigh_subscr_handler(struct ctx *c);
>
> #endif /* NETLINK_H */
> diff --git a/passt.c b/passt.c
> index 31fbb75..e20bbad 100644
> --- a/passt.c
> +++ b/passt.c
> @@ -53,6 +53,7 @@
> #include "vu_common.h"
> #include "migrate.h"
> #include "repair.h"
> +#include "netlink.h"
>
> #define EPOLL_EVENTS 8
>
> @@ -79,6 +80,7 @@ char *epoll_type_str[] = {
> [EPOLL_TYPE_VHOST_KICK] = "vhost-user kick socket",
> [EPOLL_TYPE_REPAIR_LISTEN] = "TCP_REPAIR helper listening socket",
> [EPOLL_TYPE_REPAIR] = "TCP_REPAIR helper socket",
> + [EPOLL_TYPE_NL_NEIGH] = "netlink neighbour subscription socket",
> };
> static_assert(ARRAY_SIZE(epoll_type_str) == EPOLL_NUM_TYPES,
> "epoll_type_str[] doesn't match enum epoll_type");
> @@ -322,6 +324,9 @@ int main(int argc, char **argv)
>
> pcap_init(&c);
>
> + if (nl_neigh_subscr_init(&c) < 0)
> + warn("Failed to subscribe to RTMGRP_NEIGH");
I'd be inclined to put the error message inside
nl_neigh_subscr_init(), where it can be a bit more detailed as to what
went wrong where.
> +
> if (!c.foreground) {
> if ((devnull_fd = open("/dev/null", O_RDWR | O_CLOEXEC)) < 0)
> die_perror("Failed to open /dev/null");
> @@ -414,6 +419,9 @@ loop:
> case EPOLL_TYPE_REPAIR:
> repair_handler(&c, eventmask);
> break;
> + case EPOLL_TYPE_NL_NEIGH:
> + nl_neigh_subscr_handler(&c);
> + break;
> default:
> /* Can't happen */
> ASSERT(0);
> --
> 2.50.1
>
--
David Gibson (he or they) | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you, not the other way
| around.
http://www.ozlabs.org/~dgibson
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH v11 1/9] netlink: add subsciption on changes in NDP/ARP table
2025-09-27 19:25 ` [PATCH v11 1/9] netlink: add subsciption on changes in NDP/ARP table Jon Maloy
2025-09-29 4:29 ` David Gibson
@ 2025-09-29 23:58 ` Stefano Brivio
1 sibling, 0 replies; 27+ messages in thread
From: Stefano Brivio @ 2025-09-29 23:58 UTC (permalink / raw)
To: Jon Maloy; +Cc: dgibson, david, passt-dev
On Sat, 27 Sep 2025 15:25:14 -0400
Jon Maloy <jmaloy@redhat.com> wrote:
> The solution to bug https://bugs.passt.top/show_bug.cgi?id=120
> requires the ability to translate from an IP address to its
> corresponding MAC address in cases where those are present in
> the ARP or NDP tables.
>
> To keep track of the contents of these tables we add a netlink
> based neighbour subscription feature.
>
> Signed-off-by: Jon Maloy <jmaloy@redhat.com>
>
> ---
> v3: - Added an attribute contianing NDA_DST to sent message, so
> that we let the kernel do the filtering of the IP address
> and return only one entry.
> - Added interface index to the call signature. Since the only
> interface we know is the template interface, this limits
> the number of hosts that will be seen as 'network segment
> local' from a PASST viewpoint.
> v4: - Made loop independent of attribute order.
> - Ignoring L2 addresses which are not of size ETH_ALEN.
> v5: - Changed return value of new function, so caller can know if
> a MAC address really was found.
> v6: - Removed warning printout which had ended up in the wrong
> commit.
> v8: - Changed to neighbour event subscription model
> - netlink: arp/ndp table subscription
> v10:- Updated according to David's latest comments on v8
> - Added functionaly where we initially read current
> state of ARP/NDP tables
> ---
> epoll_type.h | 2 +
> netlink.c | 180 +++++++++++++++++++++++++++++++++++++++++++++++++++
> netlink.h | 4 ++
> passt.c | 8 +++
> 4 files changed, 194 insertions(+)
>
> diff --git a/epoll_type.h b/epoll_type.h
> index 12ac59b..a90ffb6 100644
> --- a/epoll_type.h
> +++ b/epoll_type.h
> @@ -44,6 +44,8 @@ enum epoll_type {
> EPOLL_TYPE_REPAIR_LISTEN,
> /* TCP_REPAIR helper socket */
> EPOLL_TYPE_REPAIR,
> + /* Netlink neighbour subscription socket */
> + EPOLL_TYPE_NL_NEIGH,
>
> EPOLL_NUM_TYPES,
> };
> diff --git a/netlink.c b/netlink.c
> index c436780..0dc4e9d 100644
> --- a/netlink.c
> +++ b/netlink.c
> @@ -26,6 +26,7 @@
> #include <arpa/inet.h>
> #include <netinet/in.h>
> #include <netinet/if_ether.h>
> +#include <net/if_arp.h>
>
> #include <linux/netlink.h>
> #include <linux/rtnetlink.h>
> @@ -40,6 +41,10 @@
> #define RTNH_NEXT_AND_DEC(rtnh, attrlen) \
> ((attrlen) -= RTNH_ALIGN((rtnh)->rtnh_len), RTNH_NEXT(rtnh))
>
> +/* Convenience macro borrowed from kernel */
> +#define NUD_VALID (NUD_PERMANENT | NUD_NOARP | \
> + NUD_REACHABLE | NUD_PROBE | NUD_STALE)
In general (see especially util.h) we do:
#define NUD_VALID \
(NUD_PERMANENT | NUD_NOARP | NUD_REACHABLE | NUD_PROBE | NUD_STALE)
...rationale: it was on one line, and now the most important part
remains on the same one line.
> +
> /* Netlink expects a buffer of at least 8kiB or the system page size,
> * whichever is larger. 32kiB is recommended for more efficient.
> * Since the largest page size on any remotely common Linux setup is
> @@ -53,6 +58,7 @@
> int nl_sock = -1;
> int nl_sock_ns = -1;
These two assignments are aligned for (debatable) eye candy. Given that
nl_seq is recycled for nl_sock_neigh as well, maybe nl_sock_neigh could
be moved here and you could add extra tabs to these two to align the -1
values again.
> static int nl_seq = 1;
> +static int nl_sock_neigh = -1;
>
> /**
> * nl_sock_init_do() - Set up netlink sockets in init or target namespace
> @@ -1103,3 +1109,177 @@ int nl_link_set_flags(int s, unsigned int ifi,
>
> return nl_do(s, &req, RTM_NEWLINK, 0, sizeof(req));
> }
> +
> +/**
> + * nl_neigh_msg_read() - Interpret a neigbour state message from netlink
The usual description of parameters is missing here.
> + *
> + */
> +static void nl_neigh_msg_read(const struct ctx *c, struct nlmsghdr *nh)
> +{
> + struct ndmsg *ndm = NLMSG_DATA(nh);
> + struct rtattr *rta = (struct rtattr *)(ndm + 1);
> + size_t na = NLMSG_PAYLOAD(nh, sizeof(*ndm));
> + char ip_str[INET6_ADDRSTRLEN];
> + char mac_str[ETH_ADDRSTRLEN];
> + const uint8_t *lladdr = NULL;
> + const void *dst = NULL;
> + size_t lladdr_len = 0;
> + uint8_t mac[ETH_ALEN];
> + union inany_addr addr;
> + size_t dstlen = 0;
> +
> + if (nh->nlmsg_type == NLMSG_DONE)
> + return;
> +
> + if (nh->nlmsg_type == NLMSG_ERROR) {
> + warn("NLMSG type error");
> + return;
> + }
> +
> + if (nh->nlmsg_type != RTM_NEWNEIGH &&
> + nh->nlmsg_type != RTM_DELNEIGH)
> + return;
> +
> + for (; RTA_OK(rta, na); rta = RTA_NEXT(rta, na)) {
> + switch (rta->rta_type) {
> + case NDA_DST:
> + dst = RTA_DATA(rta);
> + dstlen = RTA_PAYLOAD(rta);
> + break;
> + case NDA_LLADDR:
> + lladdr = RTA_DATA(rta);
> + lladdr_len = RTA_PAYLOAD(rta);
> + break;
> + default:
> + break;
> + }
> + }
> +
> + if (!dst)
> + return;
> +
> + if (!lladdr || lladdr_len != ETH_ALEN)
> + return;
> +
> + if (ndm->ndm_type != ARPHRD_ETHER)
> + return;
> +
> + memcpy(mac, lladdr, ETH_ALEN);
> + eth_ntop(mac, mac_str, sizeof(mac_str));
> +
> + if (ndm->ndm_family == AF_INET &&
> + (dstlen != sizeof(struct in_addr) ||
> + ndm->ndm_ifindex != c->ifi4))
> + return;
> +
> + if (ndm->ndm_family == AF_INET6 &&
> + (dstlen != sizeof(struct in6_addr) ||
> + ndm->ndm_ifindex != c->ifi6))
> + return;
> +
> + if (ndm->ndm_family == AF_INET)
> + addr = inany_from_v4(*(struct in_addr *) dst);
> + else if (ndm->ndm_family == AF_INET6)
> + addr.a6 = *(struct in6_addr *) dst;
We never add spaces between cast and variable name:
*(struct in6_addr *)dst;
> + else
> + return;
> +
> + inet_ntop(ndm->ndm_family, dst, ip_str, sizeof(ip_str));
> +
> + if (nh->nlmsg_type == RTM_NEWNEIGH &&
> + ndm->ndm_state & NUD_VALID) {
This fits on one line, and you can drop the curly brackets then.
> + trace("neigh table update: %s / %s", ip_str, mac_str);
> + } else {
> + trace("neigh table delete: %s / %s", ip_str, mac_str);
> + }
> +}
> +
> +/**
> + * nl_neigh_subscr_sync() - Read current contents ARP/NDP tables
...where "subscr" indicates the whole mechanism by referring to the
subscription socket, that is, some kind of synecdoche, but I wonder if
it's really helpful here. What about nl_neigh_sync()?
> + *
Usual description of arguments missing.
> + */
> +static void nl_neigh_subscr_sync(const struct ctx *c, int proto, int ifi)
> +{
> + struct {
> + struct nlmsghdr nlh;
> + struct ndmsg ndm;
> + } req = { 0 };
> + struct nlmsghdr *nh;
> + char buf[NLBUFSIZ];
> + ssize_t status;
> + uint32_t seq;
> +
> + req.ndm.ndm_family = proto;
> + req.ndm.ndm_ifindex = ifi;
You could initialise 'req' to those right away, see other functions in
netlink.c for examples (say, nl_route_get_def()).
> + seq = nl_send(nl_sock_neigh, &req, RTM_GETNEIGH, NLM_F_DUMP, sizeof(req));
> + nl_foreach_oftype(nh, status, nl_sock_neigh, buf, seq, RTM_NEWNEIGH)
> + nl_neigh_msg_read(c, nh);
> +}
> +
> +/**
> + * nl_neigh_subscr_handler() - Non-blocking drain of pending neighbour updates
This function name brings me to a different naming suggestion. Given
that you're handling notifications, what about
nl_neigh_notify_handler(), and nl_neigh_notify_init()? It's a bit
easier to read/write as it's not a truncated word.
> + * @c: Execution context
Mixed tab ^ and ^^^^^^ spaces.
> + */
> +void nl_neigh_subscr_handler(struct ctx *c)
Cppcheck (the guy from 'make cppcheck') says:
netlink.c:1225:42: style: Parameter 'c' can be declared as pointer to const [constParameterPointer]
void nl_neigh_subscr_handler(struct ctx *c)
and I think it actually makes sense.
> +{
> + struct nlmsghdr *nh;
> + char buf[NLBUFSIZ];
> + ssize_t n;
...it also says:
netlink.c:1227:19: style: The scope of the variable 'nh' can be reduced. [variableScope]
struct nlmsghdr *nh;
^
netlink.c:1229:10: style: The scope of the variable 'n' can be reduced. [variableScope]
ssize_t n;
^
> +
> + for (;;) {
> + n = recv(nl_sock_neigh, buf, sizeof(buf), MSG_DONTWAIT);
> + if (n < 0) {
> + if (errno != EAGAIN)
> + warn_perror("Failed to read from netlink sock");
> + return;
> + }
Can we ever get end-of-file for whatever reason? What about returning
on n <= 0 just to be on the safe side?
> + nh = (struct nlmsghdr *)buf;
> + for (; NLMSG_OK(nh, n); nh = NLMSG_NEXT(nh, n))
> + nl_neigh_msg_read(c, nh);
> + }
> +}
> +
> +/**
> + * nl_neigh_subscr_init() - Open netlink sock and subscribe to neighbour events
The fact we're opening the netlink socket here is kind of obvious. If
you just say we subscribe to neighbour events, it fits in 80 columns.
> + *
* @c: Execution context
> + * Return: 0 on success, -1 on failure
We're always getting something nice in errno for those, I would
actually suggest spending a moment right now to fetch that and return
as negative value so that we can use that in the caller if ever needed.
Should you do so, though, note that close() sets errno on failure.
> + */
> +int nl_neigh_subscr_init(struct ctx *c)
> +{
> + union epoll_ref ref = {
> + .type = EPOLL_TYPE_NL_NEIGH
> + };
> + struct epoll_event ev = {
> + .events = EPOLLIN
> + };
> + struct sockaddr_nl addr = {
> + .nl_family = AF_NETLINK,
> + .nl_groups = RTMGRP_NEIGH,
> + };
> +
> + if (nl_sock_neigh >= 0)
> + return 0;
> +
> + nl_sock_neigh = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, NETLINK_ROUTE);
> + if (nl_sock_neigh < 0)
Should we log a warn() message then? It might be relevant with LSMs
(AppArmor, SELinux, etc.).
> + return -1;
> +
> + if (bind(nl_sock_neigh, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
Same here.
> + close(nl_sock_neigh);
> + nl_sock_neigh = -1;
> + return -1;
> + }
> +
> + ref.fd = nl_sock_neigh;
Static Checkers Hate This One Trick:
netlink.c:1275:9: style: Redundant initialization for 'ref'. The initialized value is overwritten before it is read. [redundantInitialization]
ref.fd = nl_sock_neigh;
^
netlink.c:1251:22: note: ref is initialized
union epoll_ref ref = {
^
netlink.c:1275:9: note: ref is overwritten
ref.fd = nl_sock_neigh;
^
> + ev.data.u64 = ref.u64;
> + if (epoll_ctl(c->epollfd, EPOLL_CTL_ADD, nl_sock_neigh, &ev) == -1) {
> + close(nl_sock_neigh);
> + nl_sock_neigh = -1;
> + return -1;
> + }
> +
> + nl_neigh_subscr_sync(c, AF_INET, c->ifi4);
> + nl_neigh_subscr_sync(c, AF_INET6, c->ifi6);
> +
> + return 0;
> +}
> diff --git a/netlink.h b/netlink.h
> index b51e99c..a7d3506 100644
> --- a/netlink.h
> +++ b/netlink.h
> @@ -17,6 +17,8 @@ int nl_route_dup(int s_src, unsigned int ifi_src,
> int s_dst, unsigned int ifi_dst, sa_family_t af);
> int nl_addr_get(int s, unsigned int ifi, sa_family_t af,
> void *addr, int *prefix_len, void *addr_l);
> +bool nl_neigh_mac_get(int s, const union inany_addr *addr, int ifi,
> + unsigned char *mac);
> int nl_addr_set(int s, unsigned int ifi, sa_family_t af,
> const void *addr, int prefix_len);
> int nl_addr_get_ll(int s, unsigned int ifi, struct in6_addr *addr);
> @@ -28,5 +30,7 @@ int nl_link_set_mac(int s, unsigned int ifi, const void *mac);
> int nl_link_set_mtu(int s, unsigned int ifi, int mtu);
> int nl_link_set_flags(int s, unsigned int ifi,
> unsigned int set, unsigned int change);
> +int nl_neigh_subscr_init(struct ctx *c);
> +void nl_neigh_subscr_handler(struct ctx *c);
>
> #endif /* NETLINK_H */
> diff --git a/passt.c b/passt.c
> index 31fbb75..e20bbad 100644
> --- a/passt.c
> +++ b/passt.c
> @@ -53,6 +53,7 @@
> #include "vu_common.h"
> #include "migrate.h"
> #include "repair.h"
> +#include "netlink.h"
>
> #define EPOLL_EVENTS 8
>
> @@ -79,6 +80,7 @@ char *epoll_type_str[] = {
> [EPOLL_TYPE_VHOST_KICK] = "vhost-user kick socket",
> [EPOLL_TYPE_REPAIR_LISTEN] = "TCP_REPAIR helper listening socket",
> [EPOLL_TYPE_REPAIR] = "TCP_REPAIR helper socket",
> + [EPOLL_TYPE_NL_NEIGH] = "netlink neighbour subscription socket",
...or notification socket. I mean, you subscribe to events, but that
doesn't describe its main function.
> };
> static_assert(ARRAY_SIZE(epoll_type_str) == EPOLL_NUM_TYPES,
> "epoll_type_str[] doesn't match enum epoll_type");
> @@ -322,6 +324,9 @@ int main(int argc, char **argv)
>
> pcap_init(&c);
>
> + if (nl_neigh_subscr_init(&c) < 0)
> + warn("Failed to subscribe to RTMGRP_NEIGH");
> +
> if (!c.foreground) {
> if ((devnull_fd = open("/dev/null", O_RDWR | O_CLOEXEC)) < 0)
> die_perror("Failed to open /dev/null");
> @@ -414,6 +419,9 @@ loop:
> case EPOLL_TYPE_REPAIR:
> repair_handler(&c, eventmask);
> break;
> + case EPOLL_TYPE_NL_NEIGH:
> + nl_neigh_subscr_handler(&c);
> + break;
> default:
> /* Can't happen */
> ASSERT(0);
--
Stefano
^ permalink raw reply [flat|nested] 27+ messages in thread
* [PATCH v11 2/9] fwd: Add cache table for ARP/NDP contents
2025-09-27 19:25 [PATCH v11 0/9] Use true MAC address of LAN local remote hosts Jon Maloy
2025-09-27 19:25 ` [PATCH v11 1/9] netlink: add subsciption on changes in NDP/ARP table Jon Maloy
@ 2025-09-27 19:25 ` Jon Maloy
2025-09-29 5:56 ` David Gibson
2025-09-29 23:58 ` Stefano Brivio
2025-09-27 19:25 ` [PATCH v11 3/9] arp/ndp: send gratuitous ARP / unsolicitated NA when MAC cache entry added Jon Maloy
` (6 subsequent siblings)
8 siblings, 2 replies; 27+ messages in thread
From: Jon Maloy @ 2025-09-27 19:25 UTC (permalink / raw)
To: sbrivio, dgibson, david, jmaloy, passt-dev
We add a cache table to keep track of the contents of the kernel ARP
and NDP tables. The table is fed from the just introduced netlink based
neigbour subscription function. The new table eliminates the need for
explicit netlink calls to find a host's MAC address.
Signed-off-by: Jon Maloy <jmaloy@redhat.com>
---
v5: - Moved to earlier in series to reduce rebase conflicts
v6: - Sqashed the hash list commit and the FIFO/LRU queue commit
- Removed hash lookup. We now only use linear lookup in a
linked list
- Eliminated dynamic memory allocation.
- Ensured there is only one call to clock_gettime()
- Using MAC_ZERO instead of the previously dedicated definitions
v7: - NOW using MAC_ZERO where needed
- I am still using linear back-off for empty cache entries. Even
an incoming, flow-creating packet from a local host gives no
guarantee that its MAC address is in the ARP table, so we must
allow for a few new attempts at first possible occasions. Only
after several failed lookups can we conclude that we probably
never will succeed. Hence the back-off.
- Fixed a bug that David inadvertently made me aware of: I only
intended to set the initial expiry value to MAC_CACHE_RENEWAL
when an ARP/NDP table lookup was successful.
- Improved struct and function description comments.
v8: - Total re-design of table, adapting to the new, subscription
based way of updating it.
v9: - Catering for MAC address change for an existing host.
v10: - Changes according to feedback from David Gibson
---
conf.c | 1 +
fwd.c | 164 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
fwd.h | 7 +++
netlink.c | 2 +
4 files changed, 174 insertions(+)
diff --git a/conf.c b/conf.c
index 66b9e63..cfd8590 100644
--- a/conf.c
+++ b/conf.c
@@ -2133,6 +2133,7 @@ void conf(struct ctx *c, int argc, char **argv)
c->udp.fwd_out.mode = fwd_default;
fwd_scan_ports_init(c);
+ fwd_neigh_table_init();
if (!c->quiet)
conf_print(c);
diff --git a/fwd.c b/fwd.c
index 250cf56..2fd6cee 100644
--- a/fwd.c
+++ b/fwd.c
@@ -33,6 +33,170 @@ static in_port_t fwd_ephemeral_max = NUM_PORTS - 1;
#define PORT_RANGE_SYSCTL "/proc/sys/net/ipv4/ip_local_port_range"
+#define NEIGH_TABLE_SLOTS 1024 /* Must be power of two */
+#define NEIGH_TABLE_SIZE (NEIGH_TABLE_SLOTS / 2)
+
+/**
+ * neigh_table_entry - Entry in the ARP/NDP table
+ * @next: Next entry in slot or free list
+ * @addr: IP address of represented host
+ * @mac: MAC address of represented host
+ */
+struct neigh_table_entry {
+ struct neigh_table_entry *next;
+ union inany_addr addr;
+ uint8_t mac[ETH_ALEN];
+};
+
+/**
+ * neigh_table - Cache of ARP/NDP table contents
+ * @entries: Entries to be plugged into the hash slots when allocated
+ * @slots: Hash table slots
+ * @free: Linked list of unused entries
+ */
+struct neigh_table {
+ struct neigh_table_entry entries[NEIGH_TABLE_SIZE];
+ struct neigh_table_entry *slots[NEIGH_TABLE_SLOTS];
+ struct neigh_table_entry *free;
+};
+
+static struct neigh_table neigh_table;
+
+/**
+ * neigh_table_slot() - Hash key to a number within the table range
+ * @c: Execution context
+ * @key: The key to be used for the hash
+ *
+ * Return: The resulting hash value
+ */
+static inline size_t neigh_table_slot(const struct ctx *c,
+ const union inany_addr *key)
+{
+ struct siphash_state st = SIPHASH_INIT(c->hash_secret);
+ uint32_t i;
+
+ inany_siphash_feed(&st, key);
+ i = siphash_final(&st, sizeof(*key), 0);
+
+ return ((size_t)i) & (NEIGH_TABLE_SIZE - 1);
+}
+
+/**
+ * fwd_neigh_table_find() - Find a MAC table entry
+ * @c: Execution context
+ * @addr: Neighbour address to be used as key for the lookup
+ *
+ * Return: The found entry, if any. Otherwise NULL.
+ */
+static struct neigh_table_entry *fwd_neigh_table_find(const struct ctx *c,
+ const union inany_addr *addr)
+{
+ size_t slot = neigh_table_slot(c, addr);
+ struct neigh_table_entry *e = neigh_table.slots[slot];
+
+ while (e && !inany_equals(&e->addr, addr))
+ e = e->next;
+
+ return e;
+}
+
+/**
+ * fwd_neigh_table_update() - Allocate a neigbour table entry from the free list
+ * @c: Execution context
+ * @addr: Address used to determine insertion slot and store in entry
+ * @mac: The MAC address associated with the neighbour address
+ */
+void fwd_neigh_table_update(const struct ctx *c,
+ const union inany_addr *addr, uint8_t *mac)
+{
+ struct neigh_table *t = &neigh_table;
+ struct neigh_table_entry *e;
+ ssize_t slot;
+
+ /* MAC address might change sometimes */
+ e = fwd_neigh_table_find(c, addr);
+ if (e) {
+ memcpy(e->mac, mac, ETH_ALEN);
+ return;
+ }
+
+ e = t->free;
+ if (!e)
+ return;
+ t->free = e->next;
+
+ slot = neigh_table_slot(c, addr);
+ e->next = t->slots[slot];
+ t->slots[slot] = e;
+
+ memcpy(&e->addr, addr, sizeof(*addr));
+ memcpy(e->mac, mac, ETH_ALEN);
+}
+
+/**
+ * fwd_neigh_table_free() - Remove an entry from a slot and add it to free list
+ * @c: Execution context
+ * @addr: Neighbour address used to find the slot for the entry
+ */
+void fwd_neigh_table_free(const struct ctx *c, const union inany_addr *addr)
+{
+ ssize_t slot = neigh_table_slot(c, addr);
+ struct neigh_table *t = &neigh_table;
+ struct neigh_table_entry *e, **prev;
+
+ prev = &t->slots[slot];
+ e = t->slots[slot];
+ while (e && !inany_equals(&e->addr, addr)) {
+ prev = &e->next;
+ e = e->next;
+ }
+ if (!e)
+ return;
+
+ *prev = e->next;
+ e->next = t->free;
+ t->free = e;
+ memset(&e->addr, 0, sizeof(*addr));
+ memset(e->mac, 0, ETH_ALEN);
+}
+
+/**
+ * fwd_neigh_mac_get() - Lookup MAC address in the ARP/NDP table
+ * @c: Execution context
+ * @addr: Neighbour address used as lookup key
+ * @mac: Buffer for Ethernet MAC to return, found or default value.
+ *
+ * Return: true if real MAC found, false if not found or if failure
+ */
+bool fwd_neigh_mac_get(const struct ctx *c, const union inany_addr *addr,
+ uint8_t *mac)
+{
+ struct neigh_table_entry *e = fwd_neigh_table_find(c, addr);
+
+ if (e)
+ memcpy(mac, e->mac, ETH_ALEN);
+ else
+ memcpy(mac, c->our_tap_mac, ETH_ALEN);
+
+ return !!e;
+}
+
+/**
+ * fwd_neigh_table_init() - Initialize the neighbor table
+ */
+void fwd_neigh_table_init(void)
+{
+ struct neigh_table *t = &neigh_table;
+ struct neigh_table_entry *e;
+
+ memset(t, 0, sizeof(*t));
+ for (int i = 0; i < NEIGH_TABLE_SIZE; i++) {
+ e = &t->entries[i];
+ e->next = t->free;
+ t->free = e;
+ }
+}
+
/** fwd_probe_ephemeral() - Determine what ports this host considers ephemeral
*
* Work out what ports the host thinks are emphemeral and record it for later
diff --git a/fwd.h b/fwd.h
index 65c7c96..5464349 100644
--- a/fwd.h
+++ b/fwd.h
@@ -56,5 +56,12 @@ uint8_t fwd_nat_from_splice(const struct ctx *c, uint8_t proto,
const struct flowside *ini, struct flowside *tgt);
uint8_t fwd_nat_from_host(const struct ctx *c, uint8_t proto,
const struct flowside *ini, struct flowside *tgt);
+void fwd_neigh_table_update(const struct ctx *c,
+ const union inany_addr *addr, uint8_t *mac);
+void fwd_neigh_table_free(const struct ctx *c,
+ const union inany_addr *addr);
+bool fwd_neigh_mac_get(const struct ctx *c, const union inany_addr *addr,
+ uint8_t *mac);
+void fwd_neigh_table_init(void);
#endif /* FWD_H */
diff --git a/netlink.c b/netlink.c
index 0dc4e9d..da9e1d5 100644
--- a/netlink.c
+++ b/netlink.c
@@ -1189,8 +1189,10 @@ static void nl_neigh_msg_read(const struct ctx *c, struct nlmsghdr *nh)
if (nh->nlmsg_type == RTM_NEWNEIGH &&
ndm->ndm_state & NUD_VALID) {
trace("neigh table update: %s / %s", ip_str, mac_str);
+ fwd_neigh_table_update(c, &addr, mac);
} else {
trace("neigh table delete: %s / %s", ip_str, mac_str);
+ fwd_neigh_table_free(c, &addr);
}
}
--
2.50.1
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH v11 2/9] fwd: Add cache table for ARP/NDP contents
2025-09-27 19:25 ` [PATCH v11 2/9] fwd: Add cache table for ARP/NDP contents Jon Maloy
@ 2025-09-29 5:56 ` David Gibson
2025-09-29 23:58 ` Stefano Brivio
1 sibling, 0 replies; 27+ messages in thread
From: David Gibson @ 2025-09-29 5:56 UTC (permalink / raw)
To: Jon Maloy; +Cc: sbrivio, dgibson, passt-dev
[-- Attachment #1: Type: text/plain, Size: 9503 bytes --]
On Sat, Sep 27, 2025 at 03:25:15PM -0400, Jon Maloy wrote:
> We add a cache table to keep track of the contents of the kernel ARP
> and NDP tables. The table is fed from the just introduced netlink based
> neigbour subscription function. The new table eliminates the need for
> explicit netlink calls to find a host's MAC address.
>
> Signed-off-by: Jon Maloy <jmaloy@redhat.com>
A few comments below, but they're all pretty minor, so
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
>
> ---
> v5: - Moved to earlier in series to reduce rebase conflicts
> v6: - Sqashed the hash list commit and the FIFO/LRU queue commit
> - Removed hash lookup. We now only use linear lookup in a
> linked list
> - Eliminated dynamic memory allocation.
> - Ensured there is only one call to clock_gettime()
> - Using MAC_ZERO instead of the previously dedicated definitions
> v7: - NOW using MAC_ZERO where needed
> - I am still using linear back-off for empty cache entries. Even
> an incoming, flow-creating packet from a local host gives no
> guarantee that its MAC address is in the ARP table, so we must
> allow for a few new attempts at first possible occasions. Only
> after several failed lookups can we conclude that we probably
> never will succeed. Hence the back-off.
> - Fixed a bug that David inadvertently made me aware of: I only
> intended to set the initial expiry value to MAC_CACHE_RENEWAL
> when an ARP/NDP table lookup was successful.
> - Improved struct and function description comments.
> v8: - Total re-design of table, adapting to the new, subscription
> based way of updating it.
> v9: - Catering for MAC address change for an existing host.
> v10: - Changes according to feedback from David Gibson
> ---
> conf.c | 1 +
> fwd.c | 164 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
> fwd.h | 7 +++
> netlink.c | 2 +
> 4 files changed, 174 insertions(+)
>
> diff --git a/conf.c b/conf.c
> index 66b9e63..cfd8590 100644
> --- a/conf.c
> +++ b/conf.c
> @@ -2133,6 +2133,7 @@ void conf(struct ctx *c, int argc, char **argv)
> c->udp.fwd_out.mode = fwd_default;
>
> fwd_scan_ports_init(c);
> + fwd_neigh_table_init();
>
> if (!c->quiet)
> conf_print(c);
> diff --git a/fwd.c b/fwd.c
> index 250cf56..2fd6cee 100644
> --- a/fwd.c
> +++ b/fwd.c
> @@ -33,6 +33,170 @@ static in_port_t fwd_ephemeral_max = NUM_PORTS - 1;
>
> #define PORT_RANGE_SYSCTL "/proc/sys/net/ipv4/ip_local_port_range"
>
> +#define NEIGH_TABLE_SLOTS 1024 /* Must be power of two */
A comment is better than nothing, but a static_assert() is better
still. IMO, not having the restriction at all is even better, since
it's pretty easy to do so in this case.
> +#define NEIGH_TABLE_SIZE (NEIGH_TABLE_SLOTS / 2)
> +
> +/**
> + * neigh_table_entry - Entry in the ARP/NDP table
> + * @next: Next entry in slot or free list
> + * @addr: IP address of represented host
> + * @mac: MAC address of represented host
> + */
> +struct neigh_table_entry {
> + struct neigh_table_entry *next;
> + union inany_addr addr;
> + uint8_t mac[ETH_ALEN];
> +};
> +
> +/**
> + * neigh_table - Cache of ARP/NDP table contents
> + * @entries: Entries to be plugged into the hash slots when allocated
> + * @slots: Hash table slots
> + * @free: Linked list of unused entries
> + */
> +struct neigh_table {
> + struct neigh_table_entry entries[NEIGH_TABLE_SIZE];
> + struct neigh_table_entry *slots[NEIGH_TABLE_SLOTS];
Given that we have bucket chains, I don't think it's particularly
required for this to be bigger than SIZE (unlike the flow table, where
linear probing means it's absolutely vital we have more hash buckets
than the maximum number of entries).
> + struct neigh_table_entry *free;
> +};
> +
> +static struct neigh_table neigh_table;
> +
> +/**
> + * neigh_table_slot() - Hash key to a number within the table range
> + * @c: Execution context
> + * @key: The key to be used for the hash
> + *
> + * Return: The resulting hash value
> + */
> +static inline size_t neigh_table_slot(const struct ctx *c,
> + const union inany_addr *key)
> +{
> + struct siphash_state st = SIPHASH_INIT(c->hash_secret);
> + uint32_t i;
> +
> + inany_siphash_feed(&st, key);
> + i = siphash_final(&st, sizeof(*key), 0);
> +
> + return ((size_t)i) & (NEIGH_TABLE_SIZE - 1);
> +}
> +
> +/**
> + * fwd_neigh_table_find() - Find a MAC table entry
> + * @c: Execution context
> + * @addr: Neighbour address to be used as key for the lookup
> + *
> + * Return: The found entry, if any. Otherwise NULL.
> + */
> +static struct neigh_table_entry *fwd_neigh_table_find(const struct ctx *c,
> + const union inany_addr *addr)
> +{
> + size_t slot = neigh_table_slot(c, addr);
> + struct neigh_table_entry *e = neigh_table.slots[slot];
> +
> + while (e && !inany_equals(&e->addr, addr))
> + e = e->next;
> +
> + return e;
> +}
> +
> +/**
> + * fwd_neigh_table_update() - Allocate a neigbour table entry from the free list
I like the change to the name, but now the one line description is
misleading.
> + * @c: Execution context
> + * @addr: Address used to determine insertion slot and store in entry
> + * @mac: The MAC address associated with the neighbour address
> + */
> +void fwd_neigh_table_update(const struct ctx *c,
> + const union inany_addr *addr, uint8_t *mac)
> +{
> + struct neigh_table *t = &neigh_table;
> + struct neigh_table_entry *e;
> + ssize_t slot;
> +
> + /* MAC address might change sometimes */
> + e = fwd_neigh_table_find(c, addr);
> + if (e) {
> + memcpy(e->mac, mac, ETH_ALEN);
> + return;
> + }
> +
> + e = t->free;
> + if (!e)
> + return;
> + t->free = e->next;
> +
> + slot = neigh_table_slot(c, addr);
> + e->next = t->slots[slot];
> + t->slots[slot] = e;
> +
> + memcpy(&e->addr, addr, sizeof(*addr));
> + memcpy(e->mac, mac, ETH_ALEN);
> +}
> +
> +/**
> + * fwd_neigh_table_free() - Remove an entry from a slot and add it to free list
> + * @c: Execution context
> + * @addr: Neighbour address used to find the slot for the entry
> + */
> +void fwd_neigh_table_free(const struct ctx *c, const union inany_addr *addr)
> +{
> + ssize_t slot = neigh_table_slot(c, addr);
> + struct neigh_table *t = &neigh_table;
> + struct neigh_table_entry *e, **prev;
> +
> + prev = &t->slots[slot];
> + e = t->slots[slot];
> + while (e && !inany_equals(&e->addr, addr)) {
> + prev = &e->next;
> + e = e->next;
> + }
> + if (!e)
> + return;
> +
> + *prev = e->next;
> + e->next = t->free;
> + t->free = e;
> + memset(&e->addr, 0, sizeof(*addr));
> + memset(e->mac, 0, ETH_ALEN);
> +}
> +
> +/**
> + * fwd_neigh_mac_get() - Lookup MAC address in the ARP/NDP table
> + * @c: Execution context
> + * @addr: Neighbour address used as lookup key
> + * @mac: Buffer for Ethernet MAC to return, found or default value.
> + *
> + * Return: true if real MAC found, false if not found or if failure
> + */
> +bool fwd_neigh_mac_get(const struct ctx *c, const union inany_addr *addr,
> + uint8_t *mac)
> +{
> + struct neigh_table_entry *e = fwd_neigh_table_find(c, addr);
> +
> + if (e)
> + memcpy(mac, e->mac, ETH_ALEN);
> + else
> + memcpy(mac, c->our_tap_mac, ETH_ALEN);
> +
> + return !!e;
> +}
> +
> +/**
> + * fwd_neigh_table_init() - Initialize the neighbor table
> + */
> +void fwd_neigh_table_init(void)
> +{
> + struct neigh_table *t = &neigh_table;
> + struct neigh_table_entry *e;
> +
> + memset(t, 0, sizeof(*t));
> + for (int i = 0; i < NEIGH_TABLE_SIZE; i++) {
> + e = &t->entries[i];
> + e->next = t->free;
> + t->free = e;
> + }
> +}
> +
> /** fwd_probe_ephemeral() - Determine what ports this host considers ephemeral
> *
> * Work out what ports the host thinks are emphemeral and record it for later
> diff --git a/fwd.h b/fwd.h
> index 65c7c96..5464349 100644
> --- a/fwd.h
> +++ b/fwd.h
> @@ -56,5 +56,12 @@ uint8_t fwd_nat_from_splice(const struct ctx *c, uint8_t proto,
> const struct flowside *ini, struct flowside *tgt);
> uint8_t fwd_nat_from_host(const struct ctx *c, uint8_t proto,
> const struct flowside *ini, struct flowside *tgt);
> +void fwd_neigh_table_update(const struct ctx *c,
> + const union inany_addr *addr, uint8_t *mac);
> +void fwd_neigh_table_free(const struct ctx *c,
> + const union inany_addr *addr);
> +bool fwd_neigh_mac_get(const struct ctx *c, const union inany_addr *addr,
> + uint8_t *mac);
> +void fwd_neigh_table_init(void);
>
> #endif /* FWD_H */
> diff --git a/netlink.c b/netlink.c
> index 0dc4e9d..da9e1d5 100644
> --- a/netlink.c
> +++ b/netlink.c
> @@ -1189,8 +1189,10 @@ static void nl_neigh_msg_read(const struct ctx *c, struct nlmsghdr *nh)
> if (nh->nlmsg_type == RTM_NEWNEIGH &&
> ndm->ndm_state & NUD_VALID) {
> trace("neigh table update: %s / %s", ip_str, mac_str);
> + fwd_neigh_table_update(c, &addr, mac);
> } else {
> trace("neigh table delete: %s / %s", ip_str, mac_str);
> + fwd_neigh_table_free(c, &addr);
> }
> }
>
> --
> 2.50.1
>
--
David Gibson (he or they) | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you, not the other way
| around.
http://www.ozlabs.org/~dgibson
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH v11 2/9] fwd: Add cache table for ARP/NDP contents
2025-09-27 19:25 ` [PATCH v11 2/9] fwd: Add cache table for ARP/NDP contents Jon Maloy
2025-09-29 5:56 ` David Gibson
@ 2025-09-29 23:58 ` Stefano Brivio
2025-09-30 0:52 ` David Gibson
1 sibling, 1 reply; 27+ messages in thread
From: Stefano Brivio @ 2025-09-29 23:58 UTC (permalink / raw)
To: Jon Maloy; +Cc: dgibson, david, passt-dev
Almost entirely nitpicks here:
On Sat, 27 Sep 2025 15:25:15 -0400
Jon Maloy <jmaloy@redhat.com> wrote:
> We add a cache table to keep track of the contents of the kernel ARP
> and NDP tables. The table is fed from the just introduced netlink based
> neigbour subscription function. The new table eliminates the need for
> explicit netlink calls to find a host's MAC address.
>
> Signed-off-by: Jon Maloy <jmaloy@redhat.com>
>
> ---
> v5: - Moved to earlier in series to reduce rebase conflicts
> v6: - Sqashed the hash list commit and the FIFO/LRU queue commit
> - Removed hash lookup. We now only use linear lookup in a
> linked list
> - Eliminated dynamic memory allocation.
> - Ensured there is only one call to clock_gettime()
> - Using MAC_ZERO instead of the previously dedicated definitions
> v7: - NOW using MAC_ZERO where needed
> - I am still using linear back-off for empty cache entries. Even
> an incoming, flow-creating packet from a local host gives no
> guarantee that its MAC address is in the ARP table, so we must
> allow for a few new attempts at first possible occasions. Only
> after several failed lookups can we conclude that we probably
> never will succeed. Hence the back-off.
> - Fixed a bug that David inadvertently made me aware of: I only
> intended to set the initial expiry value to MAC_CACHE_RENEWAL
> when an ARP/NDP table lookup was successful.
> - Improved struct and function description comments.
> v8: - Total re-design of table, adapting to the new, subscription
> based way of updating it.
> v9: - Catering for MAC address change for an existing host.
> v10: - Changes according to feedback from David Gibson
> ---
> conf.c | 1 +
> fwd.c | 164 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
> fwd.h | 7 +++
> netlink.c | 2 +
> 4 files changed, 174 insertions(+)
>
> diff --git a/conf.c b/conf.c
> index 66b9e63..cfd8590 100644
> --- a/conf.c
> +++ b/conf.c
> @@ -2133,6 +2133,7 @@ void conf(struct ctx *c, int argc, char **argv)
> c->udp.fwd_out.mode = fwd_default;
>
> fwd_scan_ports_init(c);
> + fwd_neigh_table_init();
>
> if (!c->quiet)
> conf_print(c);
> diff --git a/fwd.c b/fwd.c
> index 250cf56..2fd6cee 100644
> --- a/fwd.c
> +++ b/fwd.c
> @@ -33,6 +33,170 @@ static in_port_t fwd_ephemeral_max = NUM_PORTS - 1;
>
> #define PORT_RANGE_SYSCTL "/proc/sys/net/ipv4/ip_local_port_range"
>
> +#define NEIGH_TABLE_SLOTS 1024 /* Must be power of two */
> +#define NEIGH_TABLE_SIZE (NEIGH_TABLE_SLOTS / 2)
> +
> +/**
> + * neigh_table_entry - Entry in the ARP/NDP table
It's a struct (it could be a union), so kerneldoc says:
* struct neigh_table_entry - ...
> + * @next: Next entry in slot or free list
> + * @addr: IP address of represented host
> + * @mac: MAC address of represented host
> + */
> +struct neigh_table_entry {
> + struct neigh_table_entry *next;
> + union inany_addr addr;
> + uint8_t mac[ETH_ALEN];
> +};
> +
> +/**
> + * neigh_table - Cache of ARP/NDP table contents
Same as above.
> + * @entries: Entries to be plugged into the hash slots when allocated
> + * @slots: Hash table slots
> + * @free: Linked list of unused entries
> + */
> +struct neigh_table {
> + struct neigh_table_entry entries[NEIGH_TABLE_SIZE];
> + struct neigh_table_entry *slots[NEIGH_TABLE_SLOTS];
> + struct neigh_table_entry *free;
> +};
> +
> +static struct neigh_table neigh_table;
> +
> +/**
> + * neigh_table_slot() - Hash key to a number within the table range
> + * @c: Execution context
> + * @key: The key to be used for the hash
> + *
> + * Return: The resulting hash value
> + */
> +static inline size_t neigh_table_slot(const struct ctx *c,
See:
https://www.kernel.org/doc/html/latest/process/coding-style.html#the-inline-disease
> + const union inany_addr *key)
> +{
> + struct siphash_state st = SIPHASH_INIT(c->hash_secret);
> + uint32_t i;
> +
> + inany_siphash_feed(&st, key);
> + i = siphash_final(&st, sizeof(*key), 0);
> +
> + return ((size_t)i) & (NEIGH_TABLE_SIZE - 1);
> +}
> +
> +/**
> + * fwd_neigh_table_find() - Find a MAC table entry
Strictly speaking, it's a MAC address table -- MAC refers to the access
control itself.
> + * @c: Execution context
> + * @addr: Neighbour address to be used as key for the lookup
> + *
> + * Return: The found entry, if any. Otherwise NULL.
In the spirit of the closing lines of:
https://archives.passt.top/passt-dev/20250915081319.00e72e53@elisabeth/
I'd suggest that in Germanic languages verb participles don't always
behave like adjectives, so you can say "The entry [that was] found",
but "The found entry" is rather awkward. Or go with "matching entry",
gerunds are forgiving.
> + */
> +static struct neigh_table_entry *fwd_neigh_table_find(const struct ctx *c,
> + const union inany_addr *addr)
> +{
> + size_t slot = neigh_table_slot(c, addr);
> + struct neigh_table_entry *e = neigh_table.slots[slot];
> +
> + while (e && !inany_equals(&e->addr, addr))
> + e = e->next;
> +
> + return e;
> +}
> +
> +/**
> + * fwd_neigh_table_update() - Allocate a neigbour table entry from the free list
neighbour
> + * @c: Execution context
> + * @addr: Address used to determine insertion slot and store in entry
> + * @mac: The MAC address associated with the neighbour address
> + */
> +void fwd_neigh_table_update(const struct ctx *c,
> + const union inany_addr *addr, uint8_t *mac)
Cppcheck suggests that:
fwd.c:112:47: style: Parameter 'mac' can be declared as pointer to const [constParameterPointer]
const union inany_addr *addr, uint8_t *mac)
^
> +{
> + struct neigh_table *t = &neigh_table;
> + struct neigh_table_entry *e;
> + ssize_t slot;
> +
> + /* MAC address might change sometimes */
> + e = fwd_neigh_table_find(c, addr);
> + if (e) {
> + memcpy(e->mac, mac, ETH_ALEN);
> + return;
> + }
> +
> + e = t->free;
> + if (!e)
> + return;
Log a debug() or trace() message here? We might be staring at something
like this for a while otherwise.
> + t->free = e->next;
> +
> + slot = neigh_table_slot(c, addr);
> + e->next = t->slots[slot];
> + t->slots[slot] = e;
> +
> + memcpy(&e->addr, addr, sizeof(*addr));
> + memcpy(e->mac, mac, ETH_ALEN);
> +}
> +
> +/**
> + * fwd_neigh_table_free() - Remove an entry from a slot and add it to free list
> + * @c: Execution context
> + * @addr: Neighbour address used to find the slot for the entry
The neighbour address can be a link-layer / MAC address, or an IP
address.
Perhaps go with "IP address" here, so that it's clear that we free by
IP address, and not by MAC address?
> + */
> +void fwd_neigh_table_free(const struct ctx *c, const union inany_addr *addr)
> +{
> + ssize_t slot = neigh_table_slot(c, addr);
> + struct neigh_table *t = &neigh_table;
> + struct neigh_table_entry *e, **prev;
> +
> + prev = &t->slots[slot];
> + e = t->slots[slot];
> + while (e && !inany_equals(&e->addr, addr)) {
> + prev = &e->next;
> + e = e->next;
> + }
> + if (!e)
> + return;
> +
> + *prev = e->next;
> + e->next = t->free;
> + t->free = e;
> + memset(&e->addr, 0, sizeof(*addr));
> + memset(e->mac, 0, ETH_ALEN);
Do we care about zeroing them? If we do because we might find those
entries, note that both all-zero MAC address and IP addresses are
valid.
If we can't find those entries linked anywhere, we don't necessarily
care about zeroing them. If it makes you feel better, though, I'd say
keep that.
> +}
> +
> +/**
> + * fwd_neigh_mac_get() - Lookup MAC address in the ARP/NDP table
Look up (the verb, not the noun)
> + * @c: Execution context
> + * @addr: Neighbour address used as lookup key
> + * @mac: Buffer for Ethernet MAC to return, found or default value.
> + *
> + * Return: true if real MAC found, false if not found or if failure
on failure
...by the way, you're ignoring this return value in the rest of the
series. Is it a left-over?
> + */
> +bool fwd_neigh_mac_get(const struct ctx *c, const union inany_addr *addr,
> + uint8_t *mac)
> +{
> + struct neigh_table_entry *e = fwd_neigh_table_find(c, addr);
My pedantic friend reports:
fwd.c:182:28: style: Variable 'e' can be declared as pointer to const [constVariablePointer]
struct neigh_table_entry *e = fwd_neigh_table_find(c, addr);
^
> +
> + if (e)
> + memcpy(mac, e->mac, ETH_ALEN);
> + else
> + memcpy(mac, c->our_tap_mac, ETH_ALEN);
> +
> + return !!e;
> +}
> +
> +/**
> + * fwd_neigh_table_init() - Initialize the neighbor table
neighbour, for consistency, not correctness.
> + */
> +void fwd_neigh_table_init(void)
> +{
> + struct neigh_table *t = &neigh_table;
> + struct neigh_table_entry *e;
> +
> + memset(t, 0, sizeof(*t));
> + for (int i = 0; i < NEIGH_TABLE_SIZE; i++) {
For consistency: we never declare variables in loop headers.
> + e = &t->entries[i];
> + e->next = t->free;
> + t->free = e;
This, by the way, has the practical effect of making the kernel
allocate memory for the full table, even if it's otherwise unused,
because of those pointers.
It's ~24 kB so I don't really care, and I can't think of a much better
way to organise it, either.
> + }
> +}
> +
> /** fwd_probe_ephemeral() - Determine what ports this host considers ephemeral
> *
> * Work out what ports the host thinks are emphemeral and record it for later
> diff --git a/fwd.h b/fwd.h
> index 65c7c96..5464349 100644
> --- a/fwd.h
> +++ b/fwd.h
> @@ -56,5 +56,12 @@ uint8_t fwd_nat_from_splice(const struct ctx *c, uint8_t proto,
> const struct flowside *ini, struct flowside *tgt);
> uint8_t fwd_nat_from_host(const struct ctx *c, uint8_t proto,
> const struct flowside *ini, struct flowside *tgt);
> +void fwd_neigh_table_update(const struct ctx *c,
> + const union inany_addr *addr, uint8_t *mac);
> +void fwd_neigh_table_free(const struct ctx *c,
> + const union inany_addr *addr);
> +bool fwd_neigh_mac_get(const struct ctx *c, const union inany_addr *addr,
> + uint8_t *mac);
> +void fwd_neigh_table_init(void);
>
> #endif /* FWD_H */
> diff --git a/netlink.c b/netlink.c
> index 0dc4e9d..da9e1d5 100644
> --- a/netlink.c
> +++ b/netlink.c
> @@ -1189,8 +1189,10 @@ static void nl_neigh_msg_read(const struct ctx *c, struct nlmsghdr *nh)
> if (nh->nlmsg_type == RTM_NEWNEIGH &&
> ndm->ndm_state & NUD_VALID) {
> trace("neigh table update: %s / %s", ip_str, mac_str);
> + fwd_neigh_table_update(c, &addr, mac);
> } else {
> trace("neigh table delete: %s / %s", ip_str, mac_str);
> + fwd_neigh_table_free(c, &addr);
> }
> }
>
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH v11 2/9] fwd: Add cache table for ARP/NDP contents
2025-09-29 23:58 ` Stefano Brivio
@ 2025-09-30 0:52 ` David Gibson
0 siblings, 0 replies; 27+ messages in thread
From: David Gibson @ 2025-09-30 0:52 UTC (permalink / raw)
To: Stefano Brivio; +Cc: Jon Maloy, dgibson, passt-dev
[-- Attachment #1: Type: text/plain, Size: 2464 bytes --]
On Tue, Sep 30, 2025 at 01:58:42AM +0200, Stefano Brivio wrote:
> Almost entirely nitpicks here:
>
> On Sat, 27 Sep 2025 15:25:15 -0400
> Jon Maloy <jmaloy@redhat.com> wrote:
>
> > We add a cache table to keep track of the contents of the kernel ARP
> > and NDP tables. The table is fed from the just introduced netlink based
> > neigbour subscription function. The new table eliminates the need for
> > explicit netlink calls to find a host's MAC address.
> >
> > Signed-off-by: Jon Maloy <jmaloy@redhat.com>
[snip]
> > + const union inany_addr *key)
> > +{
> > + struct siphash_state st = SIPHASH_INIT(c->hash_secret);
> > + uint32_t i;
> > +
> > + inany_siphash_feed(&st, key);
> > + i = siphash_final(&st, sizeof(*key), 0);
> > +
> > + return ((size_t)i) & (NEIGH_TABLE_SIZE - 1);
> > +}
> > +
> > +/**
> > + * fwd_neigh_table_find() - Find a MAC table entry
>
> Strictly speaking, it's a MAC address table -- MAC refers to the access
> control itself.
I mean, it's both. The entries are our neighbours and the contents of
each entry is the MAC address.
[snip]
> > +void fwd_neigh_table_free(const struct ctx *c, const union inany_addr *addr)
> > +{
> > + ssize_t slot = neigh_table_slot(c, addr);
> > + struct neigh_table *t = &neigh_table;
> > + struct neigh_table_entry *e, **prev;
> > +
> > + prev = &t->slots[slot];
> > + e = t->slots[slot];
> > + while (e && !inany_equals(&e->addr, addr)) {
> > + prev = &e->next;
> > + e = e->next;
> > + }
> > + if (!e)
> > + return;
> > +
> > + *prev = e->next;
> > + e->next = t->free;
> > + t->free = e;
> > + memset(&e->addr, 0, sizeof(*addr));
> > + memset(e->mac, 0, ETH_ALEN);
>
> Do we care about zeroing them? If we do because we might find those
> entries, note that both all-zero MAC address and IP addresses are
> valid.
In the case of IP addresses, while all-zero is valid in certain
contexts, I'd argue it's never a valid address for a neighbour ("this
host on this network" is frustratingly vague, but pretty clearly not a
neighbour). 255.255.255.255 would also make a reasonable placeholder
if we must have one.
For MAC addresses, ff:ff:ff:ff:ff:ff would probably make a better
placeholder, if we must have one.
--
David Gibson (he or they) | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you, not the other way
| around.
http://www.ozlabs.org/~dgibson
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]
^ permalink raw reply [flat|nested] 27+ messages in thread
* [PATCH v11 3/9] arp/ndp: send gratuitous ARP / unsolicitated NA when MAC cache entry added
2025-09-27 19:25 [PATCH v11 0/9] Use true MAC address of LAN local remote hosts Jon Maloy
2025-09-27 19:25 ` [PATCH v11 1/9] netlink: add subsciption on changes in NDP/ARP table Jon Maloy
2025-09-27 19:25 ` [PATCH v11 2/9] fwd: Add cache table for ARP/NDP contents Jon Maloy
@ 2025-09-27 19:25 ` Jon Maloy
2025-09-29 6:03 ` David Gibson
2025-09-29 23:58 ` Stefano Brivio
2025-09-27 19:25 ` [PATCH v11 4/9] arp/ndp: respond with true MAC address of LAN local remote hosts Jon Maloy
` (5 subsequent siblings)
8 siblings, 2 replies; 27+ messages in thread
From: Jon Maloy @ 2025-09-27 19:25 UTC (permalink / raw)
To: sbrivio, dgibson, david, jmaloy, passt-dev
Gratuitious ARP and unsolicitated NA should be handled with caution
because of the risk of malignant users emitting them to disturb
network communication.
There is however one case we where we know it is legitimate
and safe for us to send out such messages: The one time we switch
from using ctx->own_tap_mac to a MAC address received via the
recently added neigbour subscription function. Later changes to
the MAC address of a host in an existing entry cannot be fully
trusted, so we abstain from doing it in such cases.
When sending this type of messages, we notice that the guest accepts
the update, but also asks for a confirmation in the form of a regular
ARP/NS request. This is responded to with the new value, and we have
exactly the effect we wanted.
This commit adds this functionality.
Signed-off-by: Jon Maloy <jmaloy@redhat.com>
---
v10: -Made small changes based of feedback from David G.
v11: -Moved from 'Gratuitous ARP reply' model to 'ARP Announcement'
model.
---
arp.c | 41 +++++++++++++++++++++++++++++++++++++++++
arp.h | 2 ++
fwd.c | 8 ++++++++
ndp.c | 10 ++++++++++
ndp.h | 1 +
5 files changed, 62 insertions(+)
diff --git a/arp.c b/arp.c
index ad088b1..57e7b59 100644
--- a/arp.c
+++ b/arp.c
@@ -146,3 +146,44 @@ void arp_send_init_req(const struct ctx *c)
debug("Sending initial ARP request for guest MAC address");
tap_send_single(c, &req, sizeof(req));
}
+
+/**
+ * arp_send_gratuitous() - Send a gratuitous ARP announcement for an IPv4 host
+ * @c: Execution context
+ * @ip: IPv4 address we announce as owned by @mac
+ * @mac: MAC address to advertise for @ip
+ */
+void arp_send_gratuitous(const struct ctx *c, struct in_addr *ip,
+ const unsigned char *mac)
+{
+ char ip_str[INET_ADDRSTRLEN];
+ struct {
+ struct ethhdr eh;
+ struct arphdr ah;
+ struct arpmsg am;
+ } __attribute__((__packed__)) annc;
+
+ /* Ethernet header */
+ annc.eh.h_proto = htons(ETH_P_ARP);
+ memcpy(annc.eh.h_dest, MAC_BROADCAST, sizeof(annc.eh.h_dest));
+ memcpy(annc.eh.h_source, mac, sizeof(annc.eh.h_source));
+
+ /* ARP header */
+ annc.ah.ar_op = htons(ARPOP_REQUEST);
+ annc.ah.ar_hrd = htons(ARPHRD_ETHER);
+ annc.ah.ar_pro = htons(ETH_P_IP);
+ annc.ah.ar_hln = ETH_ALEN;
+ annc.ah.ar_pln = 4;
+
+ /* ARP message */
+ memcpy(annc.am.sha, mac, sizeof(annc.am.sha));
+ memcpy(annc.am.sip, ip, sizeof(annc.am.sip));
+ memcpy(annc.am.tha, MAC_BROADCAST, sizeof(annc.am.tha));
+ memcpy(annc.am.tip, ip, sizeof(annc.am.tip));
+
+ inet_ntop(AF_INET, ip, ip_str, sizeof(ip_str));
+ debug("Sending ARP announcement for %s", ip_str);
+
+ tap_send_single(c, &annc, sizeof(annc));
+}
+
diff --git a/arp.h b/arp.h
index d5ad0e1..2cf1326 100644
--- a/arp.h
+++ b/arp.h
@@ -22,5 +22,7 @@ struct arpmsg {
int arp(const struct ctx *c, struct iov_tail *data);
void arp_send_init_req(const struct ctx *c);
+void arp_send_gratuitous(const struct ctx *c, struct in_addr *ip,
+ const unsigned char *mac);
#endif /* ARP_H */
diff --git a/fwd.c b/fwd.c
index 2fd6cee..7f38b40 100644
--- a/fwd.c
+++ b/fwd.c
@@ -26,6 +26,8 @@
#include "passt.h"
#include "lineread.h"
#include "flow_table.h"
+#include "arp.h"
+#include "ndp.h"
/* Empheral port range: values from RFC 6335 */
static in_port_t fwd_ephemeral_min = (1 << 15) + (1 << 14);
@@ -131,6 +133,12 @@ void fwd_neigh_table_update(const struct ctx *c,
memcpy(&e->addr, addr, sizeof(*addr));
memcpy(e->mac, mac, ETH_ALEN);
+
+ /* Send gratuitous ARP / unsolicited NA for the new mapping */
+ if (inany_v4(addr))
+ arp_send_gratuitous(c, inany_v4(addr), e->mac);
+ else
+ ndp_send_unsolicited_na(c, &addr->a6);
}
/**
diff --git a/ndp.c b/ndp.c
index 588b48f..d7f64a3 100644
--- a/ndp.c
+++ b/ndp.c
@@ -218,6 +218,16 @@ static void ndp_na(const struct ctx *c, const struct in6_addr *dst,
ndp_send(c, dst, &na, sizeof(na));
}
+/**
+ * ndp_send_unsolicited_na() - Send unsolicited NA
+ * @c: Execution context
+ * @addr: IPv6 address to advertise
+ */
+void ndp_send_unsolicited_na(const struct ctx *c, const struct in6_addr *addr)
+{
+ ndp_na(c, &in6addr_ll_all_nodes, addr);
+}
+
/**
* ndp_ra() - Send an NDP Router Advertisement (RA) message
* @c: Execution context
diff --git a/ndp.h b/ndp.h
index 781ea86..320009c 100644
--- a/ndp.h
+++ b/ndp.h
@@ -12,5 +12,6 @@ int ndp(const struct ctx *c, const struct in6_addr *saddr,
struct iov_tail *data);
void ndp_timer(const struct ctx *c, const struct timespec *now);
void ndp_send_init_req(const struct ctx *c);
+void ndp_send_unsolicited_na(const struct ctx *c, const struct in6_addr *addr);
#endif /* NDP_H */
--
2.50.1
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH v11 3/9] arp/ndp: send gratuitous ARP / unsolicitated NA when MAC cache entry added
2025-09-27 19:25 ` [PATCH v11 3/9] arp/ndp: send gratuitous ARP / unsolicitated NA when MAC cache entry added Jon Maloy
@ 2025-09-29 6:03 ` David Gibson
2025-09-29 23:58 ` Stefano Brivio
1 sibling, 0 replies; 27+ messages in thread
From: David Gibson @ 2025-09-29 6:03 UTC (permalink / raw)
To: Jon Maloy; +Cc: sbrivio, dgibson, passt-dev
[-- Attachment #1: Type: text/plain, Size: 5686 bytes --]
On Sat, Sep 27, 2025 at 03:25:16PM -0400, Jon Maloy wrote:
> Gratuitious ARP and unsolicitated NA should be handled with caution
s/unsolicitated/unsolicited/
Given the confusing history that RFC 5227 mentions, I feel like we
should avoid the term "gratuitous ARP" as confusing.
> because of the risk of malignant users emitting them to disturb
> network communication.
>
> There is however one case we where we know it is legitimate
> and safe for us to send out such messages: The one time we switch
> from using ctx->own_tap_mac to a MAC address received via the
> recently added neigbour subscription function. Later changes to
> the MAC address of a host in an existing entry cannot be fully
> trusted, so we abstain from doing it in such cases.
>
> When sending this type of messages, we notice that the guest accepts
> the update, but also asks for a confirmation in the form of a regular
> ARP/NS request.
Is this still true with the announcement-by-request method?
> This is responded to with the new value, and we have
> exactly the effect we wanted.
>
> This commit adds this functionality.
>
> Signed-off-by: Jon Maloy <jmaloy@redhat.com>
>
> ---
> v10: -Made small changes based of feedback from David G.
> v11: -Moved from 'Gratuitous ARP reply' model to 'ARP Announcement'
> model.
> ---
> arp.c | 41 +++++++++++++++++++++++++++++++++++++++++
> arp.h | 2 ++
> fwd.c | 8 ++++++++
> ndp.c | 10 ++++++++++
> ndp.h | 1 +
> 5 files changed, 62 insertions(+)
>
> diff --git a/arp.c b/arp.c
> index ad088b1..57e7b59 100644
> --- a/arp.c
> +++ b/arp.c
> @@ -146,3 +146,44 @@ void arp_send_init_req(const struct ctx *c)
> debug("Sending initial ARP request for guest MAC address");
> tap_send_single(c, &req, sizeof(req));
> }
> +
> +/**
> + * arp_send_gratuitous() - Send a gratuitous ARP announcement for an IPv4 host
arp_announce() maybe?
> + * @c: Execution context
> + * @ip: IPv4 address we announce as owned by @mac
> + * @mac: MAC address to advertise for @ip
> + */
> +void arp_send_gratuitous(const struct ctx *c, struct in_addr *ip,
> + const unsigned char *mac)
> +{
> + char ip_str[INET_ADDRSTRLEN];
> + struct {
> + struct ethhdr eh;
> + struct arphdr ah;
> + struct arpmsg am;
> + } __attribute__((__packed__)) annc;
> +
> + /* Ethernet header */
> + annc.eh.h_proto = htons(ETH_P_ARP);
> + memcpy(annc.eh.h_dest, MAC_BROADCAST, sizeof(annc.eh.h_dest));
> + memcpy(annc.eh.h_source, mac, sizeof(annc.eh.h_source));
> +
> + /* ARP header */
> + annc.ah.ar_op = htons(ARPOP_REQUEST);
> + annc.ah.ar_hrd = htons(ARPHRD_ETHER);
> + annc.ah.ar_pro = htons(ETH_P_IP);
> + annc.ah.ar_hln = ETH_ALEN;
> + annc.ah.ar_pln = 4;
> +
> + /* ARP message */
> + memcpy(annc.am.sha, mac, sizeof(annc.am.sha));
> + memcpy(annc.am.sip, ip, sizeof(annc.am.sip));
> + memcpy(annc.am.tha, MAC_BROADCAST, sizeof(annc.am.tha));
> + memcpy(annc.am.tip, ip, sizeof(annc.am.tip));
Does using tip == sip make sense? Or should tip be 255.255.255.255?
> + inet_ntop(AF_INET, ip, ip_str, sizeof(ip_str));
> + debug("Sending ARP announcement for %s", ip_str);
Maybe include the MAC address too?
> +
> + tap_send_single(c, &annc, sizeof(annc));
> +}
> +
> diff --git a/arp.h b/arp.h
> index d5ad0e1..2cf1326 100644
> --- a/arp.h
> +++ b/arp.h
> @@ -22,5 +22,7 @@ struct arpmsg {
>
> int arp(const struct ctx *c, struct iov_tail *data);
> void arp_send_init_req(const struct ctx *c);
> +void arp_send_gratuitous(const struct ctx *c, struct in_addr *ip,
> + const unsigned char *mac);
>
> #endif /* ARP_H */
> diff --git a/fwd.c b/fwd.c
> index 2fd6cee..7f38b40 100644
> --- a/fwd.c
> +++ b/fwd.c
> @@ -26,6 +26,8 @@
> #include "passt.h"
> #include "lineread.h"
> #include "flow_table.h"
> +#include "arp.h"
> +#include "ndp.h"
>
> /* Empheral port range: values from RFC 6335 */
> static in_port_t fwd_ephemeral_min = (1 << 15) + (1 << 14);
> @@ -131,6 +133,12 @@ void fwd_neigh_table_update(const struct ctx *c,
>
> memcpy(&e->addr, addr, sizeof(*addr));
> memcpy(e->mac, mac, ETH_ALEN);
> +
> + /* Send gratuitous ARP / unsolicited NA for the new mapping */
> + if (inany_v4(addr))
> + arp_send_gratuitous(c, inany_v4(addr), e->mac);
> + else
> + ndp_send_unsolicited_na(c, &addr->a6);
> }
>
> /**
> diff --git a/ndp.c b/ndp.c
> index 588b48f..d7f64a3 100644
> --- a/ndp.c
> +++ b/ndp.c
> @@ -218,6 +218,16 @@ static void ndp_na(const struct ctx *c, const struct in6_addr *dst,
> ndp_send(c, dst, &na, sizeof(na));
> }
>
> +/**
> + * ndp_send_unsolicited_na() - Send unsolicited NA
> + * @c: Execution context
> + * @addr: IPv6 address to advertise
> + */
> +void ndp_send_unsolicited_na(const struct ctx *c, const struct in6_addr *addr)
> +{
> + ndp_na(c, &in6addr_ll_all_nodes, addr);
> +}
> +
> /**
> * ndp_ra() - Send an NDP Router Advertisement (RA) message
> * @c: Execution context
> diff --git a/ndp.h b/ndp.h
> index 781ea86..320009c 100644
> --- a/ndp.h
> +++ b/ndp.h
> @@ -12,5 +12,6 @@ int ndp(const struct ctx *c, const struct in6_addr *saddr,
> struct iov_tail *data);
> void ndp_timer(const struct ctx *c, const struct timespec *now);
> void ndp_send_init_req(const struct ctx *c);
> +void ndp_send_unsolicited_na(const struct ctx *c, const struct in6_addr *addr);
>
> #endif /* NDP_H */
> --
> 2.50.1
>
--
David Gibson (he or they) | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you, not the other way
| around.
http://www.ozlabs.org/~dgibson
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH v11 3/9] arp/ndp: send gratuitous ARP / unsolicitated NA when MAC cache entry added
2025-09-27 19:25 ` [PATCH v11 3/9] arp/ndp: send gratuitous ARP / unsolicitated NA when MAC cache entry added Jon Maloy
2025-09-29 6:03 ` David Gibson
@ 2025-09-29 23:58 ` Stefano Brivio
2025-09-30 0:56 ` David Gibson
1 sibling, 1 reply; 27+ messages in thread
From: Stefano Brivio @ 2025-09-29 23:58 UTC (permalink / raw)
To: Jon Maloy; +Cc: dgibson, david, passt-dev
On Sat, 27 Sep 2025 15:25:16 -0400
Jon Maloy <jmaloy@redhat.com> wrote:
> Gratuitious ARP and unsolicitated NA should be handled with caution
> because of the risk of malignant users emitting them to disturb
> network communication.
>
> There is however one case we where we know it is legitimate
> and safe for us to send out such messages: The one time we switch
> from using ctx->own_tap_mac to a MAC address received via the
> recently added neigbour subscription function. Later changes to
> the MAC address of a host in an existing entry cannot be fully
> trusted, so we abstain from doing it in such cases.
>
> When sending this type of messages, we notice that the guest accepts
> the update, but also asks for a confirmation in the form of a regular
> ARP/NS request. This is responded to with the new value, and we have
> exactly the effect we wanted.
>
> This commit adds this functionality.
>
> Signed-off-by: Jon Maloy <jmaloy@redhat.com>
>
> ---
> v10: -Made small changes based of feedback from David G.
> v11: -Moved from 'Gratuitous ARP reply' model to 'ARP Announcement'
> model.
> ---
> arp.c | 41 +++++++++++++++++++++++++++++++++++++++++
> arp.h | 2 ++
> fwd.c | 8 ++++++++
> ndp.c | 10 ++++++++++
> ndp.h | 1 +
> 5 files changed, 62 insertions(+)
>
> diff --git a/arp.c b/arp.c
> index ad088b1..57e7b59 100644
> --- a/arp.c
> +++ b/arp.c
> @@ -146,3 +146,44 @@ void arp_send_init_req(const struct ctx *c)
> debug("Sending initial ARP request for guest MAC address");
> tap_send_single(c, &req, sizeof(req));
> }
> +
> +/**
> + * arp_send_gratuitous() - Send a gratuitous ARP announcement for an IPv4 host
> + * @c: Execution context
> + * @ip: IPv4 address we announce as owned by @mac
> + * @mac: MAC address to advertise for @ip
> + */
> +void arp_send_gratuitous(const struct ctx *c, struct in_addr *ip,
> + const unsigned char *mac)
> +{
> + char ip_str[INET_ADDRSTRLEN];
> + struct {
> + struct ethhdr eh;
> + struct arphdr ah;
> + struct arpmsg am;
> + } __attribute__((__packed__)) annc;
> +
> + /* Ethernet header */
> + annc.eh.h_proto = htons(ETH_P_ARP);
> + memcpy(annc.eh.h_dest, MAC_BROADCAST, sizeof(annc.eh.h_dest));
> + memcpy(annc.eh.h_source, mac, sizeof(annc.eh.h_source));
> +
> + /* ARP header */
> + annc.ah.ar_op = htons(ARPOP_REQUEST);
> + annc.ah.ar_hrd = htons(ARPHRD_ETHER);
> + annc.ah.ar_pro = htons(ETH_P_IP);
> + annc.ah.ar_hln = ETH_ALEN;
> + annc.ah.ar_pln = 4;
> +
> + /* ARP message */
> + memcpy(annc.am.sha, mac, sizeof(annc.am.sha));
> + memcpy(annc.am.sip, ip, sizeof(annc.am.sip));
> + memcpy(annc.am.tha, MAC_BROADCAST, sizeof(annc.am.tha));
> + memcpy(annc.am.tip, ip, sizeof(annc.am.tip));
> +
> + inet_ntop(AF_INET, ip, ip_str, sizeof(ip_str));
> + debug("Sending ARP announcement for %s", ip_str);
> +
> + tap_send_single(c, &annc, sizeof(annc));
> +}
> +
Nit: git show / log highlights this excess newline in red, and I merely
relay the pedantry.
> diff --git a/arp.h b/arp.h
> index d5ad0e1..2cf1326 100644
> --- a/arp.h
> +++ b/arp.h
> @@ -22,5 +22,7 @@ struct arpmsg {
>
> int arp(const struct ctx *c, struct iov_tail *data);
> void arp_send_init_req(const struct ctx *c);
> +void arp_send_gratuitous(const struct ctx *c, struct in_addr *ip,
> + const unsigned char *mac);
>
> #endif /* ARP_H */
> diff --git a/fwd.c b/fwd.c
> index 2fd6cee..7f38b40 100644
> --- a/fwd.c
> +++ b/fwd.c
> @@ -26,6 +26,8 @@
> #include "passt.h"
> #include "lineread.h"
> #include "flow_table.h"
> +#include "arp.h"
> +#include "ndp.h"
>
> /* Empheral port range: values from RFC 6335 */
> static in_port_t fwd_ephemeral_min = (1 << 15) + (1 << 14);
> @@ -131,6 +133,12 @@ void fwd_neigh_table_update(const struct ctx *c,
>
> memcpy(&e->addr, addr, sizeof(*addr));
> memcpy(e->mac, mac, ETH_ALEN);
> +
> + /* Send gratuitous ARP / unsolicited NA for the new mapping */
> + if (inany_v4(addr))
> + arp_send_gratuitous(c, inany_v4(addr), e->mac);
> + else
> + ndp_send_unsolicited_na(c, &addr->a6);
> }
>
> /**
> diff --git a/ndp.c b/ndp.c
> index 588b48f..d7f64a3 100644
> --- a/ndp.c
> +++ b/ndp.c
> @@ -218,6 +218,16 @@ static void ndp_na(const struct ctx *c, const struct in6_addr *dst,
> ndp_send(c, dst, &na, sizeof(na));
> }
>
> +/**
> + * ndp_send_unsolicited_na() - Send unsolicited NA
This comment, strictly speaking, should be related to the notification
mechanism, but the usage here makes me wonder: do we want to announce
link-layer addresses for link-local (IPv6) addresses as well?
Those are not reachable from the guest. I'm not sure if we want to
filter IPv6 addresses a bit, in general. I didn't really think it
through.
> + * @c: Execution context
> + * @addr: IPv6 address to advertise
> + */
> +void ndp_send_unsolicited_na(const struct ctx *c, const struct in6_addr *addr)
> +{
> + ndp_na(c, &in6addr_ll_all_nodes, addr);
> +}
> +
> /**
> * ndp_ra() - Send an NDP Router Advertisement (RA) message
> * @c: Execution context
> diff --git a/ndp.h b/ndp.h
> index 781ea86..320009c 100644
> --- a/ndp.h
> +++ b/ndp.h
> @@ -12,5 +12,6 @@ int ndp(const struct ctx *c, const struct in6_addr *saddr,
> struct iov_tail *data);
> void ndp_timer(const struct ctx *c, const struct timespec *now);
> void ndp_send_init_req(const struct ctx *c);
> +void ndp_send_unsolicited_na(const struct ctx *c, const struct in6_addr *addr);
>
> #endif /* NDP_H */
--
Stefano
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH v11 3/9] arp/ndp: send gratuitous ARP / unsolicitated NA when MAC cache entry added
2025-09-29 23:58 ` Stefano Brivio
@ 2025-09-30 0:56 ` David Gibson
0 siblings, 0 replies; 27+ messages in thread
From: David Gibson @ 2025-09-30 0:56 UTC (permalink / raw)
To: Stefano Brivio; +Cc: Jon Maloy, dgibson, passt-dev
[-- Attachment #1: Type: text/plain, Size: 5674 bytes --]
On Tue, Sep 30, 2025 at 01:58:56AM +0200, Stefano Brivio wrote:
> On Sat, 27 Sep 2025 15:25:16 -0400
> Jon Maloy <jmaloy@redhat.com> wrote:
>
> > Gratuitious ARP and unsolicitated NA should be handled with caution
> > because of the risk of malignant users emitting them to disturb
> > network communication.
> >
> > There is however one case we where we know it is legitimate
> > and safe for us to send out such messages: The one time we switch
> > from using ctx->own_tap_mac to a MAC address received via the
> > recently added neigbour subscription function. Later changes to
> > the MAC address of a host in an existing entry cannot be fully
> > trusted, so we abstain from doing it in such cases.
> >
> > When sending this type of messages, we notice that the guest accepts
> > the update, but also asks for a confirmation in the form of a regular
> > ARP/NS request. This is responded to with the new value, and we have
> > exactly the effect we wanted.
> >
> > This commit adds this functionality.
> >
> > Signed-off-by: Jon Maloy <jmaloy@redhat.com>
> >
> > ---
> > v10: -Made small changes based of feedback from David G.
> > v11: -Moved from 'Gratuitous ARP reply' model to 'ARP Announcement'
> > model.
> > ---
> > arp.c | 41 +++++++++++++++++++++++++++++++++++++++++
> > arp.h | 2 ++
> > fwd.c | 8 ++++++++
> > ndp.c | 10 ++++++++++
> > ndp.h | 1 +
> > 5 files changed, 62 insertions(+)
> >
> > diff --git a/arp.c b/arp.c
> > index ad088b1..57e7b59 100644
> > --- a/arp.c
> > +++ b/arp.c
> > @@ -146,3 +146,44 @@ void arp_send_init_req(const struct ctx *c)
> > debug("Sending initial ARP request for guest MAC address");
> > tap_send_single(c, &req, sizeof(req));
> > }
> > +
> > +/**
> > + * arp_send_gratuitous() - Send a gratuitous ARP announcement for an IPv4 host
> > + * @c: Execution context
> > + * @ip: IPv4 address we announce as owned by @mac
> > + * @mac: MAC address to advertise for @ip
> > + */
> > +void arp_send_gratuitous(const struct ctx *c, struct in_addr *ip,
> > + const unsigned char *mac)
> > +{
> > + char ip_str[INET_ADDRSTRLEN];
> > + struct {
> > + struct ethhdr eh;
> > + struct arphdr ah;
> > + struct arpmsg am;
> > + } __attribute__((__packed__)) annc;
> > +
> > + /* Ethernet header */
> > + annc.eh.h_proto = htons(ETH_P_ARP);
> > + memcpy(annc.eh.h_dest, MAC_BROADCAST, sizeof(annc.eh.h_dest));
> > + memcpy(annc.eh.h_source, mac, sizeof(annc.eh.h_source));
> > +
> > + /* ARP header */
> > + annc.ah.ar_op = htons(ARPOP_REQUEST);
> > + annc.ah.ar_hrd = htons(ARPHRD_ETHER);
> > + annc.ah.ar_pro = htons(ETH_P_IP);
> > + annc.ah.ar_hln = ETH_ALEN;
> > + annc.ah.ar_pln = 4;
> > +
> > + /* ARP message */
> > + memcpy(annc.am.sha, mac, sizeof(annc.am.sha));
> > + memcpy(annc.am.sip, ip, sizeof(annc.am.sip));
> > + memcpy(annc.am.tha, MAC_BROADCAST, sizeof(annc.am.tha));
> > + memcpy(annc.am.tip, ip, sizeof(annc.am.tip));
> > +
> > + inet_ntop(AF_INET, ip, ip_str, sizeof(ip_str));
> > + debug("Sending ARP announcement for %s", ip_str);
> > +
> > + tap_send_single(c, &annc, sizeof(annc));
> > +}
> > +
>
> Nit: git show / log highlights this excess newline in red, and I merely
> relay the pedantry.
>
> > diff --git a/arp.h b/arp.h
> > index d5ad0e1..2cf1326 100644
> > --- a/arp.h
> > +++ b/arp.h
> > @@ -22,5 +22,7 @@ struct arpmsg {
> >
> > int arp(const struct ctx *c, struct iov_tail *data);
> > void arp_send_init_req(const struct ctx *c);
> > +void arp_send_gratuitous(const struct ctx *c, struct in_addr *ip,
> > + const unsigned char *mac);
> >
> > #endif /* ARP_H */
> > diff --git a/fwd.c b/fwd.c
> > index 2fd6cee..7f38b40 100644
> > --- a/fwd.c
> > +++ b/fwd.c
> > @@ -26,6 +26,8 @@
> > #include "passt.h"
> > #include "lineread.h"
> > #include "flow_table.h"
> > +#include "arp.h"
> > +#include "ndp.h"
> >
> > /* Empheral port range: values from RFC 6335 */
> > static in_port_t fwd_ephemeral_min = (1 << 15) + (1 << 14);
> > @@ -131,6 +133,12 @@ void fwd_neigh_table_update(const struct ctx *c,
> >
> > memcpy(&e->addr, addr, sizeof(*addr));
> > memcpy(e->mac, mac, ETH_ALEN);
> > +
> > + /* Send gratuitous ARP / unsolicited NA for the new mapping */
> > + if (inany_v4(addr))
> > + arp_send_gratuitous(c, inany_v4(addr), e->mac);
> > + else
> > + ndp_send_unsolicited_na(c, &addr->a6);
> > }
> >
> > /**
> > diff --git a/ndp.c b/ndp.c
> > index 588b48f..d7f64a3 100644
> > --- a/ndp.c
> > +++ b/ndp.c
> > @@ -218,6 +218,16 @@ static void ndp_na(const struct ctx *c, const struct in6_addr *dst,
> > ndp_send(c, dst, &na, sizeof(na));
> > }
> >
> > +/**
> > + * ndp_send_unsolicited_na() - Send unsolicited NA
>
> This comment, strictly speaking, should be related to the notification
> mechanism, but the usage here makes me wonder: do we want to announce
> link-layer addresses for link-local (IPv6) addresses as well?
>
> Those are not reachable from the guest. I'm not sure if we want to
> filter IPv6 addresses a bit, in general. I didn't really think it
> through.
I think (host) link-local addresses are actually reachable from the
guest. I think the semantics of that are kind of confusing (at least
with multiple host links), and I've talked about disallowing that when
we don't have an explicitly selected host interface we're dealing
with. But I haven't done that yet.
--
David Gibson (he or they) | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you, not the other way
| around.
http://www.ozlabs.org/~dgibson
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]
^ permalink raw reply [flat|nested] 27+ messages in thread
* [PATCH v11 4/9] arp/ndp: respond with true MAC address of LAN local remote hosts
2025-09-27 19:25 [PATCH v11 0/9] Use true MAC address of LAN local remote hosts Jon Maloy
` (2 preceding siblings ...)
2025-09-27 19:25 ` [PATCH v11 3/9] arp/ndp: send gratuitous ARP / unsolicitated NA when MAC cache entry added Jon Maloy
@ 2025-09-27 19:25 ` Jon Maloy
2025-09-30 21:29 ` Stefano Brivio
2025-09-27 19:25 ` [PATCH v11 5/9] flow: add MAC address of LAN local remote hosts to flow Jon Maloy
` (4 subsequent siblings)
8 siblings, 1 reply; 27+ messages in thread
From: Jon Maloy @ 2025-09-27 19:25 UTC (permalink / raw)
To: sbrivio, dgibson, david, jmaloy, passt-dev
When we receive an ARP request or NDP neigbor solicitation over
the tap interface for a host on the local network segment attached
to the template interface, we respond with that host's real MAC
address.
Signed-off-by: Jon Maloy <jmaloy@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
v3: - Added helper function to find out if a remote ip address is subject
to NAT. This filters out local host addresses which should be
presented with the passt/pasta local MAC address 9a:55:9a:55:9a:55 even
though it is on the local segment.
- Adapted to the change in nl_mac_get() function, so that we now consider
only the template interface when checking the ARP/NDP table.
v4: - Moved NAT check into the function nat_outbound() to obtain more
precise criteria for when NAT is used. We may in theory
have NAT even if original and translated addresses are equal, and
we want to catch this case.
- I chose to keep the wrapper funtion inany_nat(), but moved it to
fwd.h/fwd.c and renamed it to fwd_inany_nat().
v5: - Simplified criteria for when we do ARP/NDP lookup. Now, we
just try with the potentially translated address after an
attempted NAT check.
- Using the new ARP/NDP cache table instead of using netlink
directly.
v6: - Fixes after feedback from David:
- Renamed nat_outbound() to fwd_nat_outbound()
- Eliminated unnecessary temporary variable in arp.c::arp()
---
arp.c | 9 +++++++--
fwd.c | 8 ++++----
fwd.h | 2 ++
inany.c | 1 +
ndp.c | 8 ++++++++
5 files changed, 22 insertions(+), 6 deletions(-)
diff --git a/arp.c b/arp.c
index 57e7b59..4f6b7df 100644
--- a/arp.c
+++ b/arp.c
@@ -69,6 +69,7 @@ static bool ignore_arp(const struct ctx *c,
*/
int arp(const struct ctx *c, struct iov_tail *data)
{
+ union inany_addr tgt, tgt_nat;
struct {
struct ethhdr eh;
struct arphdr ah;
@@ -102,8 +103,12 @@ int arp(const struct ctx *c, struct iov_tail *data)
resp.ah.ar_hln = ah->ar_hln;
resp.ah.ar_pln = ah->ar_pln;
- /* ARP message */
- memcpy(resp.am.sha, c->our_tap_mac, sizeof(resp.am.sha));
+ /* MAC address to return in ARP message */
+ inany_from_af(&tgt, AF_INET, am->tip);
+ fwd_nat_outbound(c, &tgt, &tgt_nat);
+ fwd_neigh_mac_get(c, &tgt_nat, resp.am.sha);
+
+ /* Rest of ARP message */
memcpy(resp.am.sip, am->tip, sizeof(resp.am.sip));
memcpy(resp.am.tha, am->sha, sizeof(resp.am.tha));
memcpy(resp.am.tip, am->sip, sizeof(resp.am.tip));
diff --git a/fwd.c b/fwd.c
index 7f38b40..fef9d3d 100644
--- a/fwd.c
+++ b/fwd.c
@@ -496,7 +496,7 @@ static bool fwd_guest_accessible(const struct ctx *c,
}
/**
- * nat_outbound() - Apply address translation for outbound (TAP to HOST)
+ * fwd_nat_outbound() - Apply address translation for outbound (TAP to HOST)
* @c: Execution context
* @addr: Input address (as seen on TAP interface)
* @translated: Output address (as seen on HOST interface)
@@ -504,8 +504,8 @@ static bool fwd_guest_accessible(const struct ctx *c,
* Only handles translations that depend *only* on the address. Anything
* related to specific ports or flows is handled elsewhere.
*/
-static void nat_outbound(const struct ctx *c, const union inany_addr *addr,
- union inany_addr *translated)
+void fwd_nat_outbound(const struct ctx *c, const union inany_addr *addr,
+ union inany_addr *translated)
{
if (inany_equals4(addr, &c->ip4.map_host_loopback))
*translated = inany_loopback4;
@@ -539,7 +539,7 @@ uint8_t fwd_nat_from_tap(const struct ctx *c, uint8_t proto,
inany_equals6(&ini->oaddr, &c->ip6.dns_match))
tgt->eaddr.a6 = c->ip6.dns_host;
else
- nat_outbound(c, &ini->oaddr, &tgt->eaddr);
+ fwd_nat_outbound(c, &ini->oaddr, &tgt->eaddr);
tgt->eport = ini->oport;
diff --git a/fwd.h b/fwd.h
index 5464349..241f527 100644
--- a/fwd.h
+++ b/fwd.h
@@ -50,6 +50,8 @@ void fwd_scan_ports_init(struct ctx *c);
bool nat_inbound(const struct ctx *c, const union inany_addr *addr,
union inany_addr *translated);
+void fwd_nat_outbound(const struct ctx *c, const union inany_addr *addr,
+ union inany_addr *translated);
uint8_t fwd_nat_from_tap(const struct ctx *c, uint8_t proto,
const struct flowside *ini, struct flowside *tgt);
uint8_t fwd_nat_from_splice(const struct ctx *c, uint8_t proto,
diff --git a/inany.c b/inany.c
index 65a39f9..7680439 100644
--- a/inany.c
+++ b/inany.c
@@ -16,6 +16,7 @@
#include "ip.h"
#include "siphash.h"
#include "inany.h"
+#include "fwd.h"
const union inany_addr inany_loopback4 = INANY_INIT4(IN4ADDR_LOOPBACK_INIT);
const union inany_addr inany_any4 = INANY_INIT4(IN4ADDR_ANY_INIT);
diff --git a/ndp.c b/ndp.c
index d7f64a3..610a06c 100644
--- a/ndp.c
+++ b/ndp.c
@@ -196,6 +196,7 @@ static void ndp_send(const struct ctx *c, const struct in6_addr *dst,
static void ndp_na(const struct ctx *c, const struct in6_addr *dst,
const struct in6_addr *addr)
{
+ union inany_addr tgt, tgt_nat;
struct ndp_na na = {
.ih = {
.icmp6_type = NA,
@@ -215,6 +216,13 @@ static void ndp_na(const struct ctx *c, const struct in6_addr *dst,
memcpy(na.target_l2_addr.mac, c->our_tap_mac, ETH_ALEN);
+ /* Respond with true MAC address if remote host's address or
+ * NAT translated address can be found in NDP table.
+ */
+ inany_from_af(&tgt, AF_INET6, addr);
+ fwd_nat_outbound(c, &tgt, &tgt_nat);
+ fwd_neigh_mac_get(c, &tgt_nat, na.target_l2_addr.mac);
+
ndp_send(c, dst, &na, sizeof(na));
}
--
2.50.1
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH v11 4/9] arp/ndp: respond with true MAC address of LAN local remote hosts
2025-09-27 19:25 ` [PATCH v11 4/9] arp/ndp: respond with true MAC address of LAN local remote hosts Jon Maloy
@ 2025-09-30 21:29 ` Stefano Brivio
0 siblings, 0 replies; 27+ messages in thread
From: Stefano Brivio @ 2025-09-30 21:29 UTC (permalink / raw)
To: Jon Maloy; +Cc: dgibson, david, passt-dev
Two nits only:
On Sat, 27 Sep 2025 15:25:17 -0400
Jon Maloy <jmaloy@redhat.com> wrote:
> When we receive an ARP request or NDP neigbor solicitation over
neighbour (it doesn't matter so much but I actually grep through commit
messages from time to time...)
> the tap interface for a host on the local network segment attached
> to the template interface, we respond with that host's real MAC
> address.
>
> Signed-off-by: Jon Maloy <jmaloy@redhat.com>
> Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
>
> ---
> v3: - Added helper function to find out if a remote ip address is subject
> to NAT. This filters out local host addresses which should be
> presented with the passt/pasta local MAC address 9a:55:9a:55:9a:55 even
> though it is on the local segment.
> - Adapted to the change in nl_mac_get() function, so that we now consider
> only the template interface when checking the ARP/NDP table.
> v4: - Moved NAT check into the function nat_outbound() to obtain more
> precise criteria for when NAT is used. We may in theory
> have NAT even if original and translated addresses are equal, and
> we want to catch this case.
> - I chose to keep the wrapper funtion inany_nat(), but moved it to
> fwd.h/fwd.c and renamed it to fwd_inany_nat().
> v5: - Simplified criteria for when we do ARP/NDP lookup. Now, we
> just try with the potentially translated address after an
> attempted NAT check.
> - Using the new ARP/NDP cache table instead of using netlink
> directly.
> v6: - Fixes after feedback from David:
> - Renamed nat_outbound() to fwd_nat_outbound()
> - Eliminated unnecessary temporary variable in arp.c::arp()
> ---
> arp.c | 9 +++++++--
> fwd.c | 8 ++++----
> fwd.h | 2 ++
> inany.c | 1 +
> ndp.c | 8 ++++++++
> 5 files changed, 22 insertions(+), 6 deletions(-)
>
> diff --git a/arp.c b/arp.c
> index 57e7b59..4f6b7df 100644
> --- a/arp.c
> +++ b/arp.c
> @@ -69,6 +69,7 @@ static bool ignore_arp(const struct ctx *c,
> */
> int arp(const struct ctx *c, struct iov_tail *data)
> {
> + union inany_addr tgt, tgt_nat;
> struct {
> struct ethhdr eh;
> struct arphdr ah;
> @@ -102,8 +103,12 @@ int arp(const struct ctx *c, struct iov_tail *data)
> resp.ah.ar_hln = ah->ar_hln;
> resp.ah.ar_pln = ah->ar_pln;
>
> - /* ARP message */
> - memcpy(resp.am.sha, c->our_tap_mac, sizeof(resp.am.sha));
> + /* MAC address to return in ARP message */
> + inany_from_af(&tgt, AF_INET, am->tip);
> + fwd_nat_outbound(c, &tgt, &tgt_nat);
> + fwd_neigh_mac_get(c, &tgt_nat, resp.am.sha);
> +
> + /* Rest of ARP message */
> memcpy(resp.am.sip, am->tip, sizeof(resp.am.sip));
> memcpy(resp.am.tha, am->sha, sizeof(resp.am.tha));
> memcpy(resp.am.tip, am->sip, sizeof(resp.am.tip));
> diff --git a/fwd.c b/fwd.c
> index 7f38b40..fef9d3d 100644
> --- a/fwd.c
> +++ b/fwd.c
> @@ -496,7 +496,7 @@ static bool fwd_guest_accessible(const struct ctx *c,
> }
>
> /**
> - * nat_outbound() - Apply address translation for outbound (TAP to HOST)
> + * fwd_nat_outbound() - Apply address translation for outbound (TAP to HOST)
> * @c: Execution context
> * @addr: Input address (as seen on TAP interface)
> * @translated: Output address (as seen on HOST interface)
> @@ -504,8 +504,8 @@ static bool fwd_guest_accessible(const struct ctx *c,
> * Only handles translations that depend *only* on the address. Anything
> * related to specific ports or flows is handled elsewhere.
> */
> -static void nat_outbound(const struct ctx *c, const union inany_addr *addr,
> - union inany_addr *translated)
> +void fwd_nat_outbound(const struct ctx *c, const union inany_addr *addr,
> + union inany_addr *translated)
> {
> if (inany_equals4(addr, &c->ip4.map_host_loopback))
> *translated = inany_loopback4;
> @@ -539,7 +539,7 @@ uint8_t fwd_nat_from_tap(const struct ctx *c, uint8_t proto,
> inany_equals6(&ini->oaddr, &c->ip6.dns_match))
> tgt->eaddr.a6 = c->ip6.dns_host;
> else
> - nat_outbound(c, &ini->oaddr, &tgt->eaddr);
> + fwd_nat_outbound(c, &ini->oaddr, &tgt->eaddr);
>
> tgt->eport = ini->oport;
>
> diff --git a/fwd.h b/fwd.h
> index 5464349..241f527 100644
> --- a/fwd.h
> +++ b/fwd.h
> @@ -50,6 +50,8 @@ void fwd_scan_ports_init(struct ctx *c);
>
> bool nat_inbound(const struct ctx *c, const union inany_addr *addr,
> union inany_addr *translated);
> +void fwd_nat_outbound(const struct ctx *c, const union inany_addr *addr,
> + union inany_addr *translated);
> uint8_t fwd_nat_from_tap(const struct ctx *c, uint8_t proto,
> const struct flowside *ini, struct flowside *tgt);
> uint8_t fwd_nat_from_splice(const struct ctx *c, uint8_t proto,
> diff --git a/inany.c b/inany.c
> index 65a39f9..7680439 100644
> --- a/inany.c
> +++ b/inany.c
> @@ -16,6 +16,7 @@
> #include "ip.h"
> #include "siphash.h"
> #include "inany.h"
> +#include "fwd.h"
>
> const union inany_addr inany_loopback4 = INANY_INIT4(IN4ADDR_LOOPBACK_INIT);
> const union inany_addr inany_any4 = INANY_INIT4(IN4ADDR_ANY_INIT);
> diff --git a/ndp.c b/ndp.c
> index d7f64a3..610a06c 100644
> --- a/ndp.c
> +++ b/ndp.c
> @@ -196,6 +196,7 @@ static void ndp_send(const struct ctx *c, const struct in6_addr *dst,
> static void ndp_na(const struct ctx *c, const struct in6_addr *dst,
> const struct in6_addr *addr)
> {
> + union inany_addr tgt, tgt_nat;
> struct ndp_na na = {
> .ih = {
> .icmp6_type = NA,
> @@ -215,6 +216,13 @@ static void ndp_na(const struct ctx *c, const struct in6_addr *dst,
>
> memcpy(na.target_l2_addr.mac, c->our_tap_mac, ETH_ALEN);
>
> + /* Respond with true MAC address if remote host's address or
Maybe s/true/recorded/? Outside of the context of this series,
otherwise, it's pretty hard to figure out what you mean with "true"
here.
> + * NAT translated address can be found in NDP table.
> + */
> + inany_from_af(&tgt, AF_INET6, addr);
> + fwd_nat_outbound(c, &tgt, &tgt_nat);
> + fwd_neigh_mac_get(c, &tgt_nat, na.target_l2_addr.mac);
> +
> ndp_send(c, dst, &na, sizeof(na));
> }
--
Stefano
^ permalink raw reply [flat|nested] 27+ messages in thread
* [PATCH v11 5/9] flow: add MAC address of LAN local remote hosts to flow
2025-09-27 19:25 [PATCH v11 0/9] Use true MAC address of LAN local remote hosts Jon Maloy
` (3 preceding siblings ...)
2025-09-27 19:25 ` [PATCH v11 4/9] arp/ndp: respond with true MAC address of LAN local remote hosts Jon Maloy
@ 2025-09-27 19:25 ` Jon Maloy
2025-09-30 21:29 ` Stefano Brivio
2025-09-27 19:25 ` [PATCH v11 6/9] udp: forward external source MAC address through tap interface Jon Maloy
` (3 subsequent siblings)
8 siblings, 1 reply; 27+ messages in thread
From: Jon Maloy @ 2025-09-27 19:25 UTC (permalink / raw)
To: sbrivio, dgibson, david, jmaloy, passt-dev
When communicating with remote hosts on the local network, some guest
applications want to see the real MAC address of that host instead
of PASST/PASTA's own tap address. The flow_common structure is a
convenient location for storing that address, so we do that in this
commit.
Note that we don´t add actual usage of this address here, that will
be done in later commits.
Signed-off-by: Jon Maloy <jmaloy@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
v3: - Moved the remote host macaddress from struct flowside to
struct flow_common. I chose to call it 'omac' as suggested
by David, although in my understanding the correct name would be
'emac'. (In general I find the address naming scheme confusing.)
- Adapted to new signature of function nl_mac_get(), now passing
it the index of the template interface.
v4: - Renamed flow_commeon->omac to flow_common->tap_omac to make is
role in the code clearer
v5: - Modified the criteria for ARP/NDP table lookup like in the
previous commits.
- Removed the PIF_TAP lookup case, as David suggested, and did
instead give the flow->tap_omac field a value marking it as
non-initialized.
- Calling the cache table instead of netlink for ARP/NDP lookup.
- Unconditionally using the potentially translated IP address
in the lookup, instead of only if NAT really was applied.
v6: - Using MAC_ZERO instead of own definitions
---
flow.c | 2 ++
flow.h | 2 ++
2 files changed, 4 insertions(+)
diff --git a/flow.c b/flow.c
index feefda3..510f3c5 100644
--- a/flow.c
+++ b/flow.c
@@ -449,6 +449,7 @@ struct flowside *flow_target(const struct ctx *c, union flow *flow,
switch (f->pif[INISIDE]) {
case PIF_TAP:
+ memcpy(f->tap_omac, MAC_ZERO, ETH_ALEN);
tgtpif = fwd_nat_from_tap(c, proto, ini, tgt);
break;
@@ -458,6 +459,7 @@ struct flowside *flow_target(const struct ctx *c, union flow *flow,
case PIF_HOST:
tgtpif = fwd_nat_from_host(c, proto, ini, tgt);
+ fwd_neigh_mac_get(c, &ini->eaddr, f->tap_omac);
break;
default:
diff --git a/flow.h b/flow.h
index cac618a..f342895 100644
--- a/flow.h
+++ b/flow.h
@@ -177,6 +177,7 @@ int flowside_connect(const struct ctx *c, int s,
* @type: Type of packet flow
* @pif[]: Interface for each side of the flow
* @side[]: Information for each side of the flow
+ * @tap_omac: MAC address of remote endpoint as seen from the guest
*/
struct flow_common {
#ifdef __GNUC__
@@ -192,6 +193,7 @@ struct flow_common {
#endif
uint8_t pif[SIDES];
struct flowside side[SIDES];
+ uint8_t tap_omac[6];
};
#define FLOW_INDEX_BITS 17 /* 128k - 1 */
--
2.50.1
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH v11 5/9] flow: add MAC address of LAN local remote hosts to flow
2025-09-27 19:25 ` [PATCH v11 5/9] flow: add MAC address of LAN local remote hosts to flow Jon Maloy
@ 2025-09-30 21:29 ` Stefano Brivio
2025-10-01 0:17 ` David Gibson
0 siblings, 1 reply; 27+ messages in thread
From: Stefano Brivio @ 2025-09-30 21:29 UTC (permalink / raw)
To: Jon Maloy; +Cc: dgibson, david, passt-dev
On Sat, 27 Sep 2025 15:25:18 -0400
Jon Maloy <jmaloy@redhat.com> wrote:
> When communicating with remote hosts on the local network, some guest
> applications want to see the real MAC address of that host instead
> of PASST/PASTA's own tap address. The flow_common structure is a
> convenient location for storing that address, so we do that in this
> commit.
>
> Note that we don´t add actual usage of this address here, that will
> be done in later commits.
>
> Signed-off-by: Jon Maloy <jmaloy@redhat.com>
> Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
>
> ---
> v3: - Moved the remote host macaddress from struct flowside to
> struct flow_common. I chose to call it 'omac' as suggested
> by David, although in my understanding the correct name would be
> 'emac'. (In general I find the address naming scheme confusing.)
> - Adapted to new signature of function nl_mac_get(), now passing
> it the index of the template interface.
> v4: - Renamed flow_commeon->omac to flow_common->tap_omac to make is
> role in the code clearer
> v5: - Modified the criteria for ARP/NDP table lookup like in the
> previous commits.
> - Removed the PIF_TAP lookup case, as David suggested, and did
> instead give the flow->tap_omac field a value marking it as
> non-initialized.
> - Calling the cache table instead of netlink for ARP/NDP lookup.
> - Unconditionally using the potentially translated IP address
> in the lookup, instead of only if NAT really was applied.
> v6: - Using MAC_ZERO instead of own definitions
> ---
> flow.c | 2 ++
> flow.h | 2 ++
> 2 files changed, 4 insertions(+)
>
> diff --git a/flow.c b/flow.c
> index feefda3..510f3c5 100644
> --- a/flow.c
> +++ b/flow.c
> @@ -449,6 +449,7 @@ struct flowside *flow_target(const struct ctx *c, union flow *flow,
>
> switch (f->pif[INISIDE]) {
> case PIF_TAP:
> + memcpy(f->tap_omac, MAC_ZERO, ETH_ALEN);
I see in the next patch that this is needed as an invalid value for
f->tap_omac, but MAC_ZERO is actually a valid, usable MAC address.
I guess we should use ff:ff:ff:ff:ff:ff, instead, as MAC_ONES, or
MAC_UNSPEC.
> tgtpif = fwd_nat_from_tap(c, proto, ini, tgt);
> break;
>
> @@ -458,6 +459,7 @@ struct flowside *flow_target(const struct ctx *c, union flow *flow,
>
> case PIF_HOST:
> tgtpif = fwd_nat_from_host(c, proto, ini, tgt);
> + fwd_neigh_mac_get(c, &ini->eaddr, f->tap_omac);
> break;
>
> default:
> diff --git a/flow.h b/flow.h
> index cac618a..f342895 100644
> --- a/flow.h
> +++ b/flow.h
> @@ -177,6 +177,7 @@ int flowside_connect(const struct ctx *c, int s,
> * @type: Type of packet flow
> * @pif[]: Interface for each side of the flow
> * @side[]: Information for each side of the flow
> + * @tap_omac: MAC address of remote endpoint as seen from the guest
The descriptions of the other fields are aligned with tabs, this has a
single space, so it's not aligned.
> */
> struct flow_common {
> #ifdef __GNUC__
> @@ -192,6 +193,7 @@ struct flow_common {
> #endif
> uint8_t pif[SIDES];
> struct flowside side[SIDES];
> + uint8_t tap_omac[6];
> };
>
> #define FLOW_INDEX_BITS 17 /* 128k - 1 */
--
Stefano
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH v11 5/9] flow: add MAC address of LAN local remote hosts to flow
2025-09-30 21:29 ` Stefano Brivio
@ 2025-10-01 0:17 ` David Gibson
0 siblings, 0 replies; 27+ messages in thread
From: David Gibson @ 2025-10-01 0:17 UTC (permalink / raw)
To: Stefano Brivio; +Cc: Jon Maloy, dgibson, passt-dev
[-- Attachment #1: Type: text/plain, Size: 2661 bytes --]
On Tue, Sep 30, 2025 at 11:29:32PM +0200, Stefano Brivio wrote:
> On Sat, 27 Sep 2025 15:25:18 -0400
> Jon Maloy <jmaloy@redhat.com> wrote:
>
> > When communicating with remote hosts on the local network, some guest
> > applications want to see the real MAC address of that host instead
> > of PASST/PASTA's own tap address. The flow_common structure is a
> > convenient location for storing that address, so we do that in this
> > commit.
> >
> > Note that we don´t add actual usage of this address here, that will
> > be done in later commits.
> >
> > Signed-off-by: Jon Maloy <jmaloy@redhat.com>
> > Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
> >
> > ---
> > v3: - Moved the remote host macaddress from struct flowside to
> > struct flow_common. I chose to call it 'omac' as suggested
> > by David, although in my understanding the correct name would be
> > 'emac'. (In general I find the address naming scheme confusing.)
> > - Adapted to new signature of function nl_mac_get(), now passing
> > it the index of the template interface.
> > v4: - Renamed flow_commeon->omac to flow_common->tap_omac to make is
> > role in the code clearer
> > v5: - Modified the criteria for ARP/NDP table lookup like in the
> > previous commits.
> > - Removed the PIF_TAP lookup case, as David suggested, and did
> > instead give the flow->tap_omac field a value marking it as
> > non-initialized.
> > - Calling the cache table instead of netlink for ARP/NDP lookup.
> > - Unconditionally using the potentially translated IP address
> > in the lookup, instead of only if NAT really was applied.
> > v6: - Using MAC_ZERO instead of own definitions
> > ---
> > flow.c | 2 ++
> > flow.h | 2 ++
> > 2 files changed, 4 insertions(+)
> >
> > diff --git a/flow.c b/flow.c
> > index feefda3..510f3c5 100644
> > --- a/flow.c
> > +++ b/flow.c
> > @@ -449,6 +449,7 @@ struct flowside *flow_target(const struct ctx *c, union flow *flow,
> >
> > switch (f->pif[INISIDE]) {
> > case PIF_TAP:
> > + memcpy(f->tap_omac, MAC_ZERO, ETH_ALEN);
>
> I see in the next patch that this is needed as an invalid value for
> f->tap_omac, but MAC_ZERO is actually a valid, usable MAC address.
Good point, I'd forgotten about that.
> I guess we should use ff:ff:ff:ff:ff:ff, instead, as MAC_ONES, or
> MAC_UNSPEC.
We already have MAC_BROADCAST.
--
David Gibson (he or they) | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you, not the other way
| around.
http://www.ozlabs.org/~dgibson
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]
^ permalink raw reply [flat|nested] 27+ messages in thread
* [PATCH v11 6/9] udp: forward external source MAC address through tap interface
2025-09-27 19:25 [PATCH v11 0/9] Use true MAC address of LAN local remote hosts Jon Maloy
` (4 preceding siblings ...)
2025-09-27 19:25 ` [PATCH v11 5/9] flow: add MAC address of LAN local remote hosts to flow Jon Maloy
@ 2025-09-27 19:25 ` Jon Maloy
2025-09-30 21:29 ` Stefano Brivio
2025-09-30 21:29 ` Stefano Brivio
2025-09-27 19:25 ` [PATCH v11 7/9] tcp: " Jon Maloy
` (2 subsequent siblings)
8 siblings, 2 replies; 27+ messages in thread
From: Jon Maloy @ 2025-09-27 19:25 UTC (permalink / raw)
To: sbrivio, dgibson, david, jmaloy, passt-dev
We forward the incoming MAC address through the tap interface when
receiving incoming packets from network local hosts.
This is a part of the solution to bug
https://bugs.passt.top/show_bug.cgi?id=120
Signed-off-by: Jon Maloy <jmaloy@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
v3: - Adapted to the move of external MAC address from struct flowside
to struct flow_common
v4: - Changed signature of udp_tap_prepare() to take a MAC address
instead of a flow.
- Eliminated initialization of MAC source address in all frames,
since those now are set per send occasion anyway.
v5: - Added lookup in ARP/NDP table on incoming messages in
case flow->tap_omac wasn't initialized at flow creation,
i.e., the flow was initiated from the guest.
v6: - Using MAC_ZERO
---
passt.c | 2 +-
udp.c | 45 +++++++++++++++++++++++++--------------------
udp.h | 2 +-
3 files changed, 27 insertions(+), 22 deletions(-)
diff --git a/passt.c b/passt.c
index e20bbad..fdd275a 100644
--- a/passt.c
+++ b/passt.c
@@ -164,7 +164,7 @@ static void timer_init(struct ctx *c, const struct timespec *now)
void proto_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s)
{
tcp_update_l2_buf(eth_d, eth_s);
- udp_update_l2_buf(eth_d, eth_s);
+ udp_update_l2_buf(eth_d);
}
/**
diff --git a/udp.c b/udp.c
index 86585b7..eb57f05 100644
--- a/udp.c
+++ b/udp.c
@@ -133,11 +133,8 @@ static int udp_splice_init[IP_VERSIONS][NUM_PORTS];
/* UDP header and data for inbound messages */
static struct udp_payload_t udp_payload[UDP_MAX_FRAMES];
-/* Ethernet header for IPv4 frames */
-static struct ethhdr udp4_eth_hdr;
-
-/* Ethernet header for IPv6 frames */
-static struct ethhdr udp6_eth_hdr;
+/* Ethernet headers for IPv4 and IPv6 frames */
+static struct ethhdr udp_eth_hdr[UDP_MAX_FRAMES];
/**
* struct udp_meta_t - Pre-cooked headers for UDP packets
@@ -210,12 +207,13 @@ void udp_portmap_clear(void)
/**
* udp_update_l2_buf() - Update L2 buffers with Ethernet and IPv4 addresses
* @eth_d: Ethernet destination address, NULL if unchanged
- * @eth_s: Ethernet source address, NULL if unchanged
*/
-void udp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s)
+void udp_update_l2_buf(const unsigned char *eth_d)
{
- eth_update_mac(&udp4_eth_hdr, eth_d, eth_s);
- eth_update_mac(&udp6_eth_hdr, eth_d, eth_s);
+ int i;
+
+ for (i = 0; i < UDP_MAX_FRAMES; i++)
+ eth_update_mac(&udp_eth_hdr[i], eth_d, NULL);
}
/**
@@ -238,6 +236,7 @@ static void udp_iov_init_one(const struct ctx *c, size_t i)
*siov = IOV_OF_LVALUE(payload->data);
+ tiov[UDP_IOV_ETH] = IOV_OF_LVALUE(udp_eth_hdr[i]);
tiov[UDP_IOV_TAP] = tap_hdr_iov(c, &meta->taph);
tiov[UDP_IOV_PAYLOAD].iov_base = payload;
@@ -253,9 +252,6 @@ static void udp_iov_init(const struct ctx *c)
{
size_t i;
- udp4_eth_hdr.h_proto = htons_constant(ETH_P_IP);
- udp6_eth_hdr.h_proto = htons_constant(ETH_P_IPV6);
-
for (i = 0; i < UDP_MAX_FRAMES; i++)
udp_iov_init_one(c, i);
}
@@ -352,31 +348,34 @@ size_t udp_update_hdr6(struct ipv6hdr *ip6h, struct udp_payload_t *bp,
* udp_tap_prepare() - Convert one datagram into a tap frame
* @mmh: Receiving mmsghdr array
* @idx: Index of the datagram to prepare
+ * @tap_omac: MAC address of remote endpoint as seen from the guest
* @toside: Flowside for destination side
* @no_udp_csum: Do not set UDP checksum
*/
static void udp_tap_prepare(const struct mmsghdr *mmh,
- unsigned idx, const struct flowside *toside,
+ unsigned int idx,
+ const uint8_t *tap_omac,
+ const struct flowside *toside,
bool no_udp_csum)
{
struct iovec (*tap_iov)[UDP_NUM_IOVS] = &udp_l2_iov[idx];
struct udp_payload_t *bp = &udp_payload[idx];
struct udp_meta_t *bm = &udp_meta[idx];
+ struct ethhdr *eh = (*tap_iov)[UDP_IOV_ETH].iov_base;
size_t l4len;
+ eth_update_mac(eh, NULL, tap_omac);
if (!inany_v4(&toside->eaddr) || !inany_v4(&toside->oaddr)) {
l4len = udp_update_hdr6(&bm->ip6h, bp, toside,
mmh[idx].msg_len, no_udp_csum);
- tap_hdr_update(&bm->taph, l4len + sizeof(bm->ip6h) +
- sizeof(udp6_eth_hdr));
- (*tap_iov)[UDP_IOV_ETH] = IOV_OF_LVALUE(udp6_eth_hdr);
+ tap_hdr_update(&bm->taph, l4len + sizeof(bm->ip6h) + ETH_HLEN);
+ eh->h_proto = htons_constant(ETH_P_IPV6);
(*tap_iov)[UDP_IOV_IP] = IOV_OF_LVALUE(bm->ip6h);
} else {
l4len = udp_update_hdr4(&bm->ip4h, bp, toside,
mmh[idx].msg_len, no_udp_csum);
- tap_hdr_update(&bm->taph, l4len + sizeof(bm->ip4h) +
- sizeof(udp4_eth_hdr));
- (*tap_iov)[UDP_IOV_ETH] = IOV_OF_LVALUE(udp4_eth_hdr);
+ tap_hdr_update(&bm->taph, l4len + sizeof(bm->ip4h) + ETH_HLEN);
+ eh->h_proto = htons_constant(ETH_P_IP);
(*tap_iov)[UDP_IOV_IP] = IOV_OF_LVALUE(bm->ip4h);
}
(*tap_iov)[UDP_IOV_PAYLOAD].iov_len = l4len;
@@ -801,13 +800,19 @@ static void udp_buf_sock_to_tap(const struct ctx *c, int s, int n,
flow_sidx_t tosidx)
{
const struct flowside *toside = flowside_at_sidx(tosidx);
+ struct udp_flow *uflow = udp_at_sidx(tosidx);
+ uint8_t *omac = uflow->f.tap_omac;
int i;
if ((n = udp_sock_recv(c, s, udp_mh_recv, n)) <= 0)
return;
+ /* Make one attempt to find true MAC address in ARP/NDP table */
+ if (MAC_IS_ZERO(omac))
+ fwd_neigh_mac_get(c, &toside->oaddr, omac);
+
for (i = 0; i < n; i++)
- udp_tap_prepare(udp_mh_recv, i, toside, false);
+ udp_tap_prepare(udp_mh_recv, i, omac, toside, false);
tap_send_frames(c, &udp_l2_iov[0][0], UDP_NUM_IOVS, n);
}
diff --git a/udp.h b/udp.h
index 8f8531a..dd6e5ad 100644
--- a/udp.h
+++ b/udp.h
@@ -21,7 +21,7 @@ int udp_sock_init(const struct ctx *c, int ns, const union inany_addr *addr,
const char *ifname, in_port_t port);
int udp_init(struct ctx *c);
void udp_timer(struct ctx *c, const struct timespec *now);
-void udp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s);
+void udp_update_l2_buf(const unsigned char *eth_d);
/**
* union udp_listen_epoll_ref - epoll reference for "listening" UDP sockets
--
2.50.1
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH v11 6/9] udp: forward external source MAC address through tap interface
2025-09-27 19:25 ` [PATCH v11 6/9] udp: forward external source MAC address through tap interface Jon Maloy
@ 2025-09-30 21:29 ` Stefano Brivio
2025-09-30 21:29 ` Stefano Brivio
1 sibling, 0 replies; 27+ messages in thread
From: Stefano Brivio @ 2025-09-30 21:29 UTC (permalink / raw)
To: Jon Maloy; +Cc: dgibson, david, passt-dev
On Sat, 27 Sep 2025 15:25:19 -0400
Jon Maloy <jmaloy@redhat.com> wrote:
> We forward the incoming MAC address through the tap interface when
> receiving incoming packets from network local hosts.
>
> This is a part of the solution to bug
> https://bugs.passt.top/show_bug.cgi?id=120
>
> Signed-off-by: Jon Maloy <jmaloy@redhat.com>
> Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
>
> ---
> v3: - Adapted to the move of external MAC address from struct flowside
> to struct flow_common
> v4: - Changed signature of udp_tap_prepare() to take a MAC address
> instead of a flow.
> - Eliminated initialization of MAC source address in all frames,
> since those now are set per send occasion anyway.
> v5: - Added lookup in ARP/NDP table on incoming messages in
> case flow->tap_omac wasn't initialized at flow creation,
> i.e., the flow was initiated from the guest.
> v6: - Using MAC_ZERO
> ---
> passt.c | 2 +-
> udp.c | 45 +++++++++++++++++++++++++--------------------
> udp.h | 2 +-
> 3 files changed, 27 insertions(+), 22 deletions(-)
>
> diff --git a/passt.c b/passt.c
> index e20bbad..fdd275a 100644
> --- a/passt.c
> +++ b/passt.c
> @@ -164,7 +164,7 @@ static void timer_init(struct ctx *c, const struct timespec *now)
> void proto_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s)
> {
> tcp_update_l2_buf(eth_d, eth_s);
> - udp_update_l2_buf(eth_d, eth_s);
> + udp_update_l2_buf(eth_d);
> }
>
> /**
> diff --git a/udp.c b/udp.c
> index 86585b7..eb57f05 100644
> --- a/udp.c
> +++ b/udp.c
> @@ -133,11 +133,8 @@ static int udp_splice_init[IP_VERSIONS][NUM_PORTS];
> /* UDP header and data for inbound messages */
> static struct udp_payload_t udp_payload[UDP_MAX_FRAMES];
>
> -/* Ethernet header for IPv4 frames */
> -static struct ethhdr udp4_eth_hdr;
> -
> -/* Ethernet header for IPv6 frames */
> -static struct ethhdr udp6_eth_hdr;
> +/* Ethernet headers for IPv4 and IPv6 frames */
> +static struct ethhdr udp_eth_hdr[UDP_MAX_FRAMES];
>
> /**
> * struct udp_meta_t - Pre-cooked headers for UDP packets
> @@ -210,12 +207,13 @@ void udp_portmap_clear(void)
> /**
> * udp_update_l2_buf() - Update L2 buffers with Ethernet and IPv4 addresses
> * @eth_d: Ethernet destination address, NULL if unchanged
> - * @eth_s: Ethernet source address, NULL if unchanged
> */
> -void udp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s)
> +void udp_update_l2_buf(const unsigned char *eth_d)
> {
> - eth_update_mac(&udp4_eth_hdr, eth_d, eth_s);
> - eth_update_mac(&udp6_eth_hdr, eth_d, eth_s);
> + int i;
> +
> + for (i = 0; i < UDP_MAX_FRAMES; i++)
> + eth_update_mac(&udp_eth_hdr[i], eth_d, NULL);
> }
>
> /**
> @@ -238,6 +236,7 @@ static void udp_iov_init_one(const struct ctx *c, size_t i)
>
> *siov = IOV_OF_LVALUE(payload->data);
>
> + tiov[UDP_IOV_ETH] = IOV_OF_LVALUE(udp_eth_hdr[i]);
> tiov[UDP_IOV_TAP] = tap_hdr_iov(c, &meta->taph);
> tiov[UDP_IOV_PAYLOAD].iov_base = payload;
>
> @@ -253,9 +252,6 @@ static void udp_iov_init(const struct ctx *c)
> {
> size_t i;
>
> - udp4_eth_hdr.h_proto = htons_constant(ETH_P_IP);
> - udp6_eth_hdr.h_proto = htons_constant(ETH_P_IPV6);
> -
> for (i = 0; i < UDP_MAX_FRAMES; i++)
> udp_iov_init_one(c, i);
> }
> @@ -352,31 +348,34 @@ size_t udp_update_hdr6(struct ipv6hdr *ip6h, struct udp_payload_t *bp,
> * udp_tap_prepare() - Convert one datagram into a tap frame
> * @mmh: Receiving mmsghdr array
> * @idx: Index of the datagram to prepare
> + * @tap_omac: MAC address of remote endpoint as seen from the guest
> * @toside: Flowside for destination side
> * @no_udp_csum: Do not set UDP checksum
> */
> static void udp_tap_prepare(const struct mmsghdr *mmh,
> - unsigned idx, const struct flowside *toside,
> + unsigned int idx,
> + const uint8_t *tap_omac,
> + const struct flowside *toside,
> bool no_udp_csum)
> {
> struct iovec (*tap_iov)[UDP_NUM_IOVS] = &udp_l2_iov[idx];
> struct udp_payload_t *bp = &udp_payload[idx];
> struct udp_meta_t *bm = &udp_meta[idx];
> + struct ethhdr *eh = (*tap_iov)[UDP_IOV_ETH].iov_base;
Nit: this should be moved as second declaration.
> size_t l4len;
>
> + eth_update_mac(eh, NULL, tap_omac);
> if (!inany_v4(&toside->eaddr) || !inany_v4(&toside->oaddr)) {
> l4len = udp_update_hdr6(&bm->ip6h, bp, toside,
> mmh[idx].msg_len, no_udp_csum);
> - tap_hdr_update(&bm->taph, l4len + sizeof(bm->ip6h) +
> - sizeof(udp6_eth_hdr));
> - (*tap_iov)[UDP_IOV_ETH] = IOV_OF_LVALUE(udp6_eth_hdr);
> + tap_hdr_update(&bm->taph, l4len + sizeof(bm->ip6h) + ETH_HLEN);
> + eh->h_proto = htons_constant(ETH_P_IPV6);
> (*tap_iov)[UDP_IOV_IP] = IOV_OF_LVALUE(bm->ip6h);
> } else {
> l4len = udp_update_hdr4(&bm->ip4h, bp, toside,
> mmh[idx].msg_len, no_udp_csum);
> - tap_hdr_update(&bm->taph, l4len + sizeof(bm->ip4h) +
> - sizeof(udp4_eth_hdr));
> - (*tap_iov)[UDP_IOV_ETH] = IOV_OF_LVALUE(udp4_eth_hdr);
> + tap_hdr_update(&bm->taph, l4len + sizeof(bm->ip4h) + ETH_HLEN);
> + eh->h_proto = htons_constant(ETH_P_IP);
> (*tap_iov)[UDP_IOV_IP] = IOV_OF_LVALUE(bm->ip4h);
> }
> (*tap_iov)[UDP_IOV_PAYLOAD].iov_len = l4len;
> @@ -801,13 +800,19 @@ static void udp_buf_sock_to_tap(const struct ctx *c, int s, int n,
> flow_sidx_t tosidx)
> {
> const struct flowside *toside = flowside_at_sidx(tosidx);
> + struct udp_flow *uflow = udp_at_sidx(tosidx);
> + uint8_t *omac = uflow->f.tap_omac;
> int i;
>
> if ((n = udp_sock_recv(c, s, udp_mh_recv, n)) <= 0)
> return;
>
> + /* Make one attempt to find true MAC address in ARP/NDP table */
> + if (MAC_IS_ZERO(omac))
> + fwd_neigh_mac_get(c, &toside->oaddr, omac);
> +
> for (i = 0; i < n; i++)
> - udp_tap_prepare(udp_mh_recv, i, toside, false);
> + udp_tap_prepare(udp_mh_recv, i, omac, toside, false);
>
> tap_send_frames(c, &udp_l2_iov[0][0], UDP_NUM_IOVS, n);
> }
> diff --git a/udp.h b/udp.h
> index 8f8531a..dd6e5ad 100644
> --- a/udp.h
> +++ b/udp.h
> @@ -21,7 +21,7 @@ int udp_sock_init(const struct ctx *c, int ns, const union inany_addr *addr,
> const char *ifname, in_port_t port);
> int udp_init(struct ctx *c);
> void udp_timer(struct ctx *c, const struct timespec *now);
> -void udp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s);
> +void udp_update_l2_buf(const unsigned char *eth_d);
>
> /**
> * union udp_listen_epoll_ref - epoll reference for "listening" UDP sockets
--
Stefano
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH v11 6/9] udp: forward external source MAC address through tap interface
2025-09-27 19:25 ` [PATCH v11 6/9] udp: forward external source MAC address through tap interface Jon Maloy
2025-09-30 21:29 ` Stefano Brivio
@ 2025-09-30 21:29 ` Stefano Brivio
1 sibling, 0 replies; 27+ messages in thread
From: Stefano Brivio @ 2025-09-30 21:29 UTC (permalink / raw)
To: Jon Maloy; +Cc: dgibson, david, passt-dev
On Sat, 27 Sep 2025 15:25:19 -0400
Jon Maloy <jmaloy@redhat.com> wrote:
> We forward the incoming MAC address through the tap interface when
> receiving incoming packets from network local hosts.
>
> This is a part of the solution to bug
> https://bugs.passt.top/show_bug.cgi?id=120
>
> Signed-off-by: Jon Maloy <jmaloy@redhat.com>
> Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
>
> ---
> v3: - Adapted to the move of external MAC address from struct flowside
> to struct flow_common
> v4: - Changed signature of udp_tap_prepare() to take a MAC address
> instead of a flow.
> - Eliminated initialization of MAC source address in all frames,
> since those now are set per send occasion anyway.
> v5: - Added lookup in ARP/NDP table on incoming messages in
> case flow->tap_omac wasn't initialized at flow creation,
> i.e., the flow was initiated from the guest.
> v6: - Using MAC_ZERO
> ---
> passt.c | 2 +-
> udp.c | 45 +++++++++++++++++++++++++--------------------
> udp.h | 2 +-
> 3 files changed, 27 insertions(+), 22 deletions(-)
>
> diff --git a/passt.c b/passt.c
> index e20bbad..fdd275a 100644
> --- a/passt.c
> +++ b/passt.c
> @@ -164,7 +164,7 @@ static void timer_init(struct ctx *c, const struct timespec *now)
> void proto_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s)
> {
> tcp_update_l2_buf(eth_d, eth_s);
> - udp_update_l2_buf(eth_d, eth_s);
> + udp_update_l2_buf(eth_d);
> }
>
> /**
> diff --git a/udp.c b/udp.c
> index 86585b7..eb57f05 100644
> --- a/udp.c
> +++ b/udp.c
> @@ -133,11 +133,8 @@ static int udp_splice_init[IP_VERSIONS][NUM_PORTS];
> /* UDP header and data for inbound messages */
> static struct udp_payload_t udp_payload[UDP_MAX_FRAMES];
>
> -/* Ethernet header for IPv4 frames */
> -static struct ethhdr udp4_eth_hdr;
> -
> -/* Ethernet header for IPv6 frames */
> -static struct ethhdr udp6_eth_hdr;
> +/* Ethernet headers for IPv4 and IPv6 frames */
> +static struct ethhdr udp_eth_hdr[UDP_MAX_FRAMES];
>
> /**
> * struct udp_meta_t - Pre-cooked headers for UDP packets
> @@ -210,12 +207,13 @@ void udp_portmap_clear(void)
> /**
> * udp_update_l2_buf() - Update L2 buffers with Ethernet and IPv4 addresses
> * @eth_d: Ethernet destination address, NULL if unchanged
> - * @eth_s: Ethernet source address, NULL if unchanged
> */
> -void udp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s)
> +void udp_update_l2_buf(const unsigned char *eth_d)
> {
> - eth_update_mac(&udp4_eth_hdr, eth_d, eth_s);
> - eth_update_mac(&udp6_eth_hdr, eth_d, eth_s);
> + int i;
> +
> + for (i = 0; i < UDP_MAX_FRAMES; i++)
> + eth_update_mac(&udp_eth_hdr[i], eth_d, NULL);
> }
>
> /**
> @@ -238,6 +236,7 @@ static void udp_iov_init_one(const struct ctx *c, size_t i)
>
> *siov = IOV_OF_LVALUE(payload->data);
>
> + tiov[UDP_IOV_ETH] = IOV_OF_LVALUE(udp_eth_hdr[i]);
> tiov[UDP_IOV_TAP] = tap_hdr_iov(c, &meta->taph);
> tiov[UDP_IOV_PAYLOAD].iov_base = payload;
>
> @@ -253,9 +252,6 @@ static void udp_iov_init(const struct ctx *c)
> {
> size_t i;
>
> - udp4_eth_hdr.h_proto = htons_constant(ETH_P_IP);
> - udp6_eth_hdr.h_proto = htons_constant(ETH_P_IPV6);
> -
> for (i = 0; i < UDP_MAX_FRAMES; i++)
> udp_iov_init_one(c, i);
> }
> @@ -352,31 +348,34 @@ size_t udp_update_hdr6(struct ipv6hdr *ip6h, struct udp_payload_t *bp,
> * udp_tap_prepare() - Convert one datagram into a tap frame
> * @mmh: Receiving mmsghdr array
> * @idx: Index of the datagram to prepare
> + * @tap_omac: MAC address of remote endpoint as seen from the guest
> * @toside: Flowside for destination side
> * @no_udp_csum: Do not set UDP checksum
> */
> static void udp_tap_prepare(const struct mmsghdr *mmh,
> - unsigned idx, const struct flowside *toside,
> + unsigned int idx,
> + const uint8_t *tap_omac,
> + const struct flowside *toside,
> bool no_udp_csum)
> {
> struct iovec (*tap_iov)[UDP_NUM_IOVS] = &udp_l2_iov[idx];
> struct udp_payload_t *bp = &udp_payload[idx];
> struct udp_meta_t *bm = &udp_meta[idx];
> + struct ethhdr *eh = (*tap_iov)[UDP_IOV_ETH].iov_base;
> size_t l4len;
>
> + eth_update_mac(eh, NULL, tap_omac);
> if (!inany_v4(&toside->eaddr) || !inany_v4(&toside->oaddr)) {
> l4len = udp_update_hdr6(&bm->ip6h, bp, toside,
> mmh[idx].msg_len, no_udp_csum);
> - tap_hdr_update(&bm->taph, l4len + sizeof(bm->ip6h) +
> - sizeof(udp6_eth_hdr));
> - (*tap_iov)[UDP_IOV_ETH] = IOV_OF_LVALUE(udp6_eth_hdr);
> + tap_hdr_update(&bm->taph, l4len + sizeof(bm->ip6h) + ETH_HLEN);
> + eh->h_proto = htons_constant(ETH_P_IPV6);
> (*tap_iov)[UDP_IOV_IP] = IOV_OF_LVALUE(bm->ip6h);
> } else {
> l4len = udp_update_hdr4(&bm->ip4h, bp, toside,
> mmh[idx].msg_len, no_udp_csum);
> - tap_hdr_update(&bm->taph, l4len + sizeof(bm->ip4h) +
> - sizeof(udp4_eth_hdr));
> - (*tap_iov)[UDP_IOV_ETH] = IOV_OF_LVALUE(udp4_eth_hdr);
> + tap_hdr_update(&bm->taph, l4len + sizeof(bm->ip4h) + ETH_HLEN);
> + eh->h_proto = htons_constant(ETH_P_IP);
> (*tap_iov)[UDP_IOV_IP] = IOV_OF_LVALUE(bm->ip4h);
> }
> (*tap_iov)[UDP_IOV_PAYLOAD].iov_len = l4len;
> @@ -801,13 +800,19 @@ static void udp_buf_sock_to_tap(const struct ctx *c, int s, int n,
> flow_sidx_t tosidx)
> {
> const struct flowside *toside = flowside_at_sidx(tosidx);
> + struct udp_flow *uflow = udp_at_sidx(tosidx);
> + uint8_t *omac = uflow->f.tap_omac;
> int i;
>
> if ((n = udp_sock_recv(c, s, udp_mh_recv, n)) <= 0)
> return;
>
> + /* Make one attempt to find true MAC address in ARP/NDP table */
Same as for 4/9: consider using "recorded"?
> + if (MAC_IS_ZERO(omac))
> + fwd_neigh_mac_get(c, &toside->oaddr, omac);
> +
> for (i = 0; i < n; i++)
> - udp_tap_prepare(udp_mh_recv, i, toside, false);
> + udp_tap_prepare(udp_mh_recv, i, omac, toside, false);
>
> tap_send_frames(c, &udp_l2_iov[0][0], UDP_NUM_IOVS, n);
> }
> diff --git a/udp.h b/udp.h
> index 8f8531a..dd6e5ad 100644
> --- a/udp.h
> +++ b/udp.h
> @@ -21,7 +21,7 @@ int udp_sock_init(const struct ctx *c, int ns, const union inany_addr *addr,
> const char *ifname, in_port_t port);
> int udp_init(struct ctx *c);
> void udp_timer(struct ctx *c, const struct timespec *now);
> -void udp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s);
> +void udp_update_l2_buf(const unsigned char *eth_d);
>
> /**
> * union udp_listen_epoll_ref - epoll reference for "listening" UDP sockets
--
Stefano
^ permalink raw reply [flat|nested] 27+ messages in thread
* [PATCH v11 7/9] tcp: forward external source MAC address through tap interface
2025-09-27 19:25 [PATCH v11 0/9] Use true MAC address of LAN local remote hosts Jon Maloy
` (5 preceding siblings ...)
2025-09-27 19:25 ` [PATCH v11 6/9] udp: forward external source MAC address through tap interface Jon Maloy
@ 2025-09-27 19:25 ` Jon Maloy
2025-09-30 21:30 ` Stefano Brivio
2025-09-27 19:25 ` [PATCH v11 8/9] tap: change signature of function tap_push_l2h() Jon Maloy
2025-09-27 19:25 ` [PATCH v11 9/9] icmp: let icmp use mac address from flowside structure Jon Maloy
8 siblings, 1 reply; 27+ messages in thread
From: Jon Maloy @ 2025-09-27 19:25 UTC (permalink / raw)
To: sbrivio, dgibson, david, jmaloy, passt-dev
We forward the incoming mac address through the tap interface when
receiving incoming packets from network local hosts.
This is a part of the solution to bug
https://bugs.passt.top/show_bug.cgi?id=120
Signed-off-by: Jon Maloy <jmaloy@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
v3: - Adapted to the signature change in nl_mac_get() function, so that
we now consider only the template interface when checking the
ARP/NDP table.
v4: - Adapted to previous name changes in this series
v5: - Added lookup in ARP/NDP cache and/or table on incoming messages
in case flow->tap_omac wasn't initialized at flow creation,
i.e., the flow was initiated from the guest.
v6: - Using MAC_IS_ZERO() instead of mac_undefined()
- I ignored David's suggestion to move the test for
undefined flow->tap_omac to tcp_{buf,vu}_data_from_sock(), at
least for now. It looks like a sub-optimization to first have to
look up the flow instance in these calls, and then do the
test for MAC_IS_ZERO(), even though we might in theory end up
with fewer such calls (how often?).
---
passt.c | 7 +++----
passt.h | 3 +--
pasta.c | 2 +-
tap.c | 2 +-
tcp.c | 13 +++++++++++--
tcp.h | 2 +-
tcp_buf.c | 37 +++++++++++++++++--------------------
tcp_internal.h | 4 ++--
tcp_vu.c | 5 ++---
9 files changed, 39 insertions(+), 36 deletions(-)
diff --git a/passt.c b/passt.c
index fdd275a..a2ce2d9 100644
--- a/passt.c
+++ b/passt.c
@@ -159,11 +159,10 @@ static void timer_init(struct ctx *c, const struct timespec *now)
/**
* proto_update_l2_buf() - Update scatter-gather L2 buffers in protocol handlers
* @eth_d: Ethernet destination address, NULL if unchanged
- * @eth_s: Ethernet source address, NULL if unchanged
*/
-void proto_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s)
+void proto_update_l2_buf(const unsigned char *eth_d)
{
- tcp_update_l2_buf(eth_d, eth_s);
+ tcp_update_l2_buf(eth_d);
udp_update_l2_buf(eth_d);
}
@@ -314,7 +313,7 @@ int main(int argc, char **argv)
if ((!c.no_udp && udp_init(&c)) || (!c.no_tcp && tcp_init(&c)))
_exit(EXIT_FAILURE);
- proto_update_l2_buf(c.guest_mac, c.our_tap_mac);
+ proto_update_l2_buf(c.guest_mac);
if (c.ifi4 && !c.no_dhcp)
dhcp_init();
diff --git a/passt.h b/passt.h
index 0075eb4..ff0236c 100644
--- a/passt.h
+++ b/passt.h
@@ -326,7 +326,6 @@ struct ctx {
bool migrate_exit;
};
-void proto_update_l2_buf(const unsigned char *eth_d,
- const unsigned char *eth_s);
+void proto_update_l2_buf(const unsigned char *eth_d);
#endif /* PASST_H */
diff --git a/pasta.c b/pasta.c
index 687406b..a42cfd8 100644
--- a/pasta.c
+++ b/pasta.c
@@ -411,7 +411,7 @@ void pasta_ns_conf(struct ctx *c)
}
}
- proto_update_l2_buf(c->guest_mac, NULL);
+ proto_update_l2_buf(c->guest_mac);
}
/**
diff --git a/tap.c b/tap.c
index 399eeaa..250a0f6 100644
--- a/tap.c
+++ b/tap.c
@@ -1101,7 +1101,7 @@ void tap_add_packet(struct ctx *c, struct iov_tail *data,
memcpy(c->guest_mac, eh->h_source, ETH_ALEN);
debug("New guest MAC address observed: %s",
eth_ntop(c->guest_mac, bufmac, sizeof(bufmac)));
- proto_update_l2_buf(c->guest_mac, NULL);
+ proto_update_l2_buf(c->guest_mac);
}
switch (ntohs(eh->h_proto)) {
diff --git a/tcp.c b/tcp.c
index 48b1ef2..af05d35 100644
--- a/tcp.c
+++ b/tcp.c
@@ -919,6 +919,7 @@ static void tcp_fill_header(struct tcphdr *th,
/**
* tcp_fill_headers() - Fill 802.3, IP, TCP headers
+ * @c: Execution context
* @conn: Connection pointer
* @taph: tap backend specific header
* @ip4h: Pointer to IPv4 header, or NULL
@@ -929,14 +930,15 @@ static void tcp_fill_header(struct tcphdr *th,
* @seq: Sequence number for this segment
* @no_tcp_csum: Do not set TCP checksum
*/
-void tcp_fill_headers(const struct tcp_tap_conn *conn,
- struct tap_hdr *taph,
+void tcp_fill_headers(const struct ctx *c, struct tcp_tap_conn *conn,
+ struct tap_hdr *taph, struct ethhdr *eh,
struct iphdr *ip4h, struct ipv6hdr *ip6h,
struct tcphdr *th, struct iov_tail *payload,
const uint16_t *ip4_check, uint32_t seq, bool no_tcp_csum)
{
const struct flowside *tapside = TAPFLOW(conn);
size_t l4len = iov_tail_size(payload) + sizeof(*th);
+ uint8_t *omac = conn->f.tap_omac;
size_t l3len = l4len;
uint32_t psum = 0;
@@ -962,6 +964,7 @@ void tcp_fill_headers(const struct tcp_tap_conn *conn,
psum = proto_ipv4_header_psum(l4len, IPPROTO_TCP,
*src4, *dst4);
}
+ eh->h_proto = htons_constant(ETH_P_IP);
}
if (ip6h) {
@@ -982,8 +985,14 @@ void tcp_fill_headers(const struct tcp_tap_conn *conn,
&ip6h->saddr,
&ip6h->daddr);
}
+ eh->h_proto = htons_constant(ETH_P_IPV6);
}
+ /* Make one attempt to find true MAC address in ARP/NDP table */
+ if (MAC_IS_ZERO(omac))
+ fwd_neigh_mac_get(c, &tapside->oaddr, omac);
+ eth_update_mac(eh, NULL, omac);
+
tcp_fill_header(th, conn, seq);
if (no_tcp_csum)
diff --git a/tcp.h b/tcp.h
index 234a803..c1b8385 100644
--- a/tcp.h
+++ b/tcp.h
@@ -24,7 +24,7 @@ int tcp_init(struct ctx *c);
void tcp_timer(struct ctx *c, const struct timespec *now);
void tcp_defer_handler(struct ctx *c);
-void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s);
+void tcp_update_l2_buf(const unsigned char *eth_d);
extern bool peek_offset_cap;
diff --git a/tcp_buf.c b/tcp_buf.c
index a493b5a..f256dcc 100644
--- a/tcp_buf.c
+++ b/tcp_buf.c
@@ -40,8 +40,7 @@
/* Static buffers */
/* Ethernet header for IPv4 and IPv6 frames */
-static struct ethhdr tcp4_eth_src;
-static struct ethhdr tcp6_eth_src;
+static struct ethhdr tcp_eth_hdr[TCP_FRAMES_MEM];
static struct tap_hdr tcp_payload_tap_hdr[TCP_FRAMES_MEM];
@@ -67,12 +66,13 @@ static struct iovec tcp_l2_iov[TCP_FRAMES_MEM][TCP_NUM_IOVS];
/**
* tcp_update_l2_buf() - Update Ethernet header buffers with addresses
* @eth_d: Ethernet destination address, NULL if unchanged
- * @eth_s: Ethernet source address, NULL if unchanged
*/
-void tcp_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s)
+void tcp_update_l2_buf(const unsigned char *eth_d)
{
- eth_update_mac(&tcp4_eth_src, eth_d, eth_s);
- eth_update_mac(&tcp6_eth_src, eth_d, eth_s);
+ int i;
+
+ for (i = 0; i < TCP_FRAMES_MEM; i++)
+ eth_update_mac(&tcp_eth_hdr[i], eth_d, NULL);
}
/**
@@ -85,9 +85,6 @@ void tcp_sock_iov_init(const struct ctx *c)
struct iphdr iph = L2_BUF_IP4_INIT(IPPROTO_TCP);
int i;
- tcp6_eth_src.h_proto = htons_constant(ETH_P_IPV6);
- tcp4_eth_src.h_proto = htons_constant(ETH_P_IP);
-
for (i = 0; i < ARRAY_SIZE(tcp_payload); i++) {
tcp6_payload_ip[i] = ip6;
tcp4_payload_ip[i] = iph;
@@ -149,13 +146,15 @@ void tcp_payload_flush(const struct ctx *c)
/**
* tcp_l2_buf_fill_headers() - Fill 802.3, IP, TCP headers in pre-cooked buffers
+ * @c: Execution context
* @conn: Connection pointer
* @iov: Pointer to an array of iovec of TCP pre-cooked buffers
* @check: Checksum, if already known
* @seq: Sequence number for this segment
* @no_tcp_csum: Do not set TCP checksum
*/
-static void tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn,
+static void tcp_l2_buf_fill_headers(const struct ctx *c,
+ struct tcp_tap_conn *conn,
struct iovec *iov, const uint16_t *check,
uint32_t seq, bool no_tcp_csum)
{
@@ -164,6 +163,7 @@ static void tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn,
struct tap_hdr *taph = iov[TCP_IOV_TAP].iov_base;
const struct flowside *tapside = TAPFLOW(conn);
const struct in_addr *a4 = inany_v4(&tapside->oaddr);
+ struct ethhdr *eh = iov[TCP_IOV_ETH].iov_base;
struct ipv6hdr *ip6h = NULL;
struct iphdr *ip4h = NULL;
@@ -172,7 +172,7 @@ static void tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn,
else
ip6h = iov[TCP_IOV_IP].iov_base;
- tcp_fill_headers(conn, taph, ip4h, ip6h, th, &tail,
+ tcp_fill_headers(c, conn, taph, eh, ip4h, ip6h, th, &tail,
check, seq, no_tcp_csum);
}
@@ -194,14 +194,12 @@ int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
int ret;
iov = tcp_l2_iov[tcp_payload_used];
- if (CONN_V4(conn)) {
+ if (CONN_V4(conn))
iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_payload_ip[tcp_payload_used]);
- iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src;
- } else {
+ else
iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_payload_ip[tcp_payload_used]);
- iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src;
- }
+ iov[TCP_IOV_ETH] = IOV_OF_LVALUE(tcp_eth_hdr[tcp_payload_used]);
payload = iov[TCP_IOV_PAYLOAD].iov_base;
seq = conn->seq_to_tap;
ret = tcp_prepare_flags(c, conn, flags, &payload->th,
@@ -212,7 +210,7 @@ int tcp_buf_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
tcp_frame_conns[tcp_payload_used++] = conn;
l4len = optlen + sizeof(struct tcphdr);
iov[TCP_IOV_PAYLOAD].iov_len = l4len;
- tcp_l2_buf_fill_headers(conn, iov, NULL, seq, false);
+ tcp_l2_buf_fill_headers(c, conn, iov, NULL, seq, false);
if (flags & DUP_ACK) {
struct iovec *dup_iov = tcp_l2_iov[tcp_payload_used];
@@ -260,11 +258,10 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
check = &iph->check;
}
iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp4_payload_ip[tcp_payload_used]);
- iov[TCP_IOV_ETH].iov_base = &tcp4_eth_src;
} else if (CONN_V6(conn)) {
iov[TCP_IOV_IP] = IOV_OF_LVALUE(tcp6_payload_ip[tcp_payload_used]);
- iov[TCP_IOV_ETH].iov_base = &tcp6_eth_src;
}
+ iov[TCP_IOV_ETH].iov_base = &tcp_eth_hdr[tcp_payload_used];
payload = iov[TCP_IOV_PAYLOAD].iov_base;
payload->th.th_off = sizeof(struct tcphdr) / 4;
payload->th.th_x2 = 0;
@@ -272,7 +269,7 @@ static void tcp_data_to_tap(const struct ctx *c, struct tcp_tap_conn *conn,
payload->th.ack = 1;
payload->th.psh = push;
iov[TCP_IOV_PAYLOAD].iov_len = dlen + sizeof(struct tcphdr);
- tcp_l2_buf_fill_headers(conn, iov, check, seq, false);
+ tcp_l2_buf_fill_headers(c, conn, iov, check, seq, false);
if (++tcp_payload_used > TCP_FRAMES_MEM - 1)
tcp_payload_flush(c);
}
diff --git a/tcp_internal.h b/tcp_internal.h
index 5cb6cba..f55025c 100644
--- a/tcp_internal.h
+++ b/tcp_internal.h
@@ -174,8 +174,8 @@ void tcp_rst_do(const struct ctx *c, struct tcp_tap_conn *conn);
struct tcp_info_linux;
-void tcp_fill_headers(const struct tcp_tap_conn *conn,
- struct tap_hdr *taph,
+void tcp_fill_headers(const struct ctx *c, struct tcp_tap_conn *conn,
+ struct tap_hdr *taph, struct ethhdr *eh,
struct iphdr *ip4h, struct ipv6hdr *ip6h,
struct tcphdr *th, struct iov_tail *payload,
const uint16_t *ip4_check, uint32_t seq, bool no_tcp_csum);
diff --git a/tcp_vu.c b/tcp_vu.c
index ebd3a1e..0c17884 100644
--- a/tcp_vu.c
+++ b/tcp_vu.c
@@ -135,7 +135,7 @@ int tcp_vu_send_flag(const struct ctx *c, struct tcp_tap_conn *conn, int flags)
flags_elem[0].in_sg[0].iov_len = hdrlen + optlen;
payload = IOV_TAIL(flags_elem[0].in_sg, 1, hdrlen);
- tcp_fill_headers(conn, NULL, ip4h, ip6h, th, &payload,
+ tcp_fill_headers(c, conn, NULL, eh, ip4h, ip6h, th, &payload,
NULL, seq, !*c->pcap);
if (*c->pcap) {
@@ -308,7 +308,6 @@ static void tcp_vu_prepare(const struct ctx *c, struct tcp_tap_conn *conn,
eh = vu_eth(base);
memcpy(eh->h_dest, c->guest_mac, sizeof(eh->h_dest));
- memcpy(eh->h_source, c->our_tap_mac, sizeof(eh->h_source));
/* initialize header */
@@ -332,7 +331,7 @@ static void tcp_vu_prepare(const struct ctx *c, struct tcp_tap_conn *conn,
th->ack = 1;
th->psh = push;
- tcp_fill_headers(conn, NULL, ip4h, ip6h, th, &payload,
+ tcp_fill_headers(c, conn, NULL, eh, ip4h, ip6h, th, &payload,
*check, conn->seq_to_tap, no_tcp_csum);
if (ip4h)
*check = &ip4h->check;
--
2.50.1
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH v11 7/9] tcp: forward external source MAC address through tap interface
2025-09-27 19:25 ` [PATCH v11 7/9] tcp: " Jon Maloy
@ 2025-09-30 21:30 ` Stefano Brivio
0 siblings, 0 replies; 27+ messages in thread
From: Stefano Brivio @ 2025-09-30 21:30 UTC (permalink / raw)
To: Jon Maloy; +Cc: dgibson, david, passt-dev
On Sat, 27 Sep 2025 15:25:20 -0400
Jon Maloy <jmaloy@redhat.com> wrote:
> We forward the incoming mac address through the tap interface when
> receiving incoming packets from network local hosts.
>
> This is a part of the solution to bug
> https://bugs.passt.top/show_bug.cgi?id=120
>
> Signed-off-by: Jon Maloy <jmaloy@redhat.com>
> Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
>
> ---
> v3: - Adapted to the signature change in nl_mac_get() function, so that
> we now consider only the template interface when checking the
> ARP/NDP table.
> v4: - Adapted to previous name changes in this series
> v5: - Added lookup in ARP/NDP cache and/or table on incoming messages
> in case flow->tap_omac wasn't initialized at flow creation,
> i.e., the flow was initiated from the guest.
> v6: - Using MAC_IS_ZERO() instead of mac_undefined()
> - I ignored David's suggestion to move the test for
> undefined flow->tap_omac to tcp_{buf,vu}_data_from_sock(), at
> least for now. It looks like a sub-optimization to first have to
> look up the flow instance in these calls, and then do the
> test for MAC_IS_ZERO(), even though we might in theory end up
> with fewer such calls (how often?).
> ---
> passt.c | 7 +++----
> passt.h | 3 +--
> pasta.c | 2 +-
> tap.c | 2 +-
> tcp.c | 13 +++++++++++--
> tcp.h | 2 +-
> tcp_buf.c | 37 +++++++++++++++++--------------------
> tcp_internal.h | 4 ++--
> tcp_vu.c | 5 ++---
> 9 files changed, 39 insertions(+), 36 deletions(-)
>
> diff --git a/passt.c b/passt.c
> index fdd275a..a2ce2d9 100644
> --- a/passt.c
> +++ b/passt.c
> @@ -159,11 +159,10 @@ static void timer_init(struct ctx *c, const struct timespec *now)
> /**
> * proto_update_l2_buf() - Update scatter-gather L2 buffers in protocol handlers
> * @eth_d: Ethernet destination address, NULL if unchanged
> - * @eth_s: Ethernet source address, NULL if unchanged
> */
> -void proto_update_l2_buf(const unsigned char *eth_d, const unsigned char *eth_s)
> +void proto_update_l2_buf(const unsigned char *eth_d)
> {
> - tcp_update_l2_buf(eth_d, eth_s);
> + tcp_update_l2_buf(eth_d);
> udp_update_l2_buf(eth_d);
> }
>
> @@ -314,7 +313,7 @@ int main(int argc, char **argv)
> if ((!c.no_udp && udp_init(&c)) || (!c.no_tcp && tcp_init(&c)))
> _exit(EXIT_FAILURE);
>
> - proto_update_l2_buf(c.guest_mac, c.our_tap_mac);
> + proto_update_l2_buf(c.guest_mac);
>
> if (c.ifi4 && !c.no_dhcp)
> dhcp_init();
> diff --git a/passt.h b/passt.h
> index 0075eb4..ff0236c 100644
> --- a/passt.h
> +++ b/passt.h
> @@ -326,7 +326,6 @@ struct ctx {
> bool migrate_exit;
> };
>
> -void proto_update_l2_buf(const unsigned char *eth_d,
> - const unsigned char *eth_s);
> +void proto_update_l2_buf(const unsigned char *eth_d);
>
> #endif /* PASST_H */
> diff --git a/pasta.c b/pasta.c
> index 687406b..a42cfd8 100644
> --- a/pasta.c
> +++ b/pasta.c
> @@ -411,7 +411,7 @@ void pasta_ns_conf(struct ctx *c)
> }
> }
>
> - proto_update_l2_buf(c->guest_mac, NULL);
> + proto_update_l2_buf(c->guest_mac);
> }
>
> /**
> diff --git a/tap.c b/tap.c
> index 399eeaa..250a0f6 100644
> --- a/tap.c
> +++ b/tap.c
> @@ -1101,7 +1101,7 @@ void tap_add_packet(struct ctx *c, struct iov_tail *data,
> memcpy(c->guest_mac, eh->h_source, ETH_ALEN);
> debug("New guest MAC address observed: %s",
> eth_ntop(c->guest_mac, bufmac, sizeof(bufmac)));
> - proto_update_l2_buf(c->guest_mac, NULL);
> + proto_update_l2_buf(c->guest_mac);
> }
>
> switch (ntohs(eh->h_proto)) {
> diff --git a/tcp.c b/tcp.c
> index 48b1ef2..af05d35 100644
> --- a/tcp.c
> +++ b/tcp.c
> @@ -919,6 +919,7 @@ static void tcp_fill_header(struct tcphdr *th,
>
> /**
> * tcp_fill_headers() - Fill 802.3, IP, TCP headers
> + * @c: Execution context
> * @conn: Connection pointer
> * @taph: tap backend specific header
> * @ip4h: Pointer to IPv4 header, or NULL
> @@ -929,14 +930,15 @@ static void tcp_fill_header(struct tcphdr *th,
> * @seq: Sequence number for this segment
> * @no_tcp_csum: Do not set TCP checksum
> */
> -void tcp_fill_headers(const struct tcp_tap_conn *conn,
> - struct tap_hdr *taph,
> +void tcp_fill_headers(const struct ctx *c, struct tcp_tap_conn *conn,
> + struct tap_hdr *taph, struct ethhdr *eh,
You now pass a 802.3 header here, but the function documentation isn't
updated accordingly.
> struct iphdr *ip4h, struct ipv6hdr *ip6h,
> struct tcphdr *th, struct iov_tail *payload,
> const uint16_t *ip4_check, uint32_t seq, bool no_tcp_csum)
--
Stefano
^ permalink raw reply [flat|nested] 27+ messages in thread
* [PATCH v11 8/9] tap: change signature of function tap_push_l2h()
2025-09-27 19:25 [PATCH v11 0/9] Use true MAC address of LAN local remote hosts Jon Maloy
` (6 preceding siblings ...)
2025-09-27 19:25 ` [PATCH v11 7/9] tcp: " Jon Maloy
@ 2025-09-27 19:25 ` Jon Maloy
2025-09-30 21:30 ` Stefano Brivio
2025-09-27 19:25 ` [PATCH v11 9/9] icmp: let icmp use mac address from flowside structure Jon Maloy
8 siblings, 1 reply; 27+ messages in thread
From: Jon Maloy @ 2025-09-27 19:25 UTC (permalink / raw)
To: sbrivio, dgibson, david, jmaloy, passt-dev
In the next commit it must be possible for the callers of function
tap_push_l2h() to specify which source MAC address should be
added to the ethernet header sent over the tap interface. As a
preparation, we now add a new argument to that function, still
without any logical changes.
Signed-off-by: Jon Maloy <jmaloy@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
v3: -Improved comment for src_mac argument, as suggested by Stefano.
v4: -Re-added comment in tap.c and remove redundant argument check
in tap_push_l2()
v5: -Ensured that MAC argument to tap_push_l2h() never is NULL.
v6: -Clarification of comment about ARP lookup
---
tap.c | 16 +++++++++-------
tap.h | 3 ++-
tcp.c | 5 +++--
3 files changed, 14 insertions(+), 10 deletions(-)
diff --git a/tap.c b/tap.c
index 250a0f6..ec7e525 100644
--- a/tap.c
+++ b/tap.c
@@ -171,17 +171,19 @@ const struct in6_addr *tap_ip6_daddr(const struct ctx *c,
* tap_push_l2h() - Build an L2 header for an inbound packet
* @c: Execution context
* @buf: Buffer address at which to generate header
+ * @src_mac: MAC address to be used as source for message.
* @proto: Ethernet protocol number for L3
*
* Return: pointer at which to write the packet's payload
*/
-void *tap_push_l2h(const struct ctx *c, void *buf, uint16_t proto)
+void *tap_push_l2h(const struct ctx *c, void *buf,
+ const void *src_mac, uint16_t proto)
{
struct ethhdr *eh = (struct ethhdr *)buf;
- /* TODO: ARP table lookup */
+ /* TODO: ARP lookup on tap side */
memcpy(eh->h_dest, c->guest_mac, ETH_ALEN);
- memcpy(eh->h_source, c->our_tap_mac, ETH_ALEN);
+ memcpy(eh->h_source, src_mac, ETH_ALEN);
eh->h_proto = ntohs(proto);
return eh + 1;
}
@@ -261,7 +263,7 @@ void tap_udp4_send(const struct ctx *c, struct in_addr src, in_port_t sport,
{
size_t l4len = dlen + sizeof(struct udphdr);
char buf[USHRT_MAX];
- struct iphdr *ip4h = tap_push_l2h(c, buf, ETH_P_IP);
+ struct iphdr *ip4h = tap_push_l2h(c, buf, c->our_tap_mac, ETH_P_IP);
struct udphdr *uh = tap_push_ip4h(ip4h, src, dst, l4len, IPPROTO_UDP);
char *data = tap_push_uh4(uh, src, sport, dst, dport, in, dlen);
@@ -281,7 +283,7 @@ void tap_icmp4_send(const struct ctx *c, struct in_addr src, struct in_addr dst,
const void *in, size_t l4len)
{
char buf[USHRT_MAX];
- struct iphdr *ip4h = tap_push_l2h(c, buf, ETH_P_IP);
+ struct iphdr *ip4h = tap_push_l2h(c, buf, c->our_tap_mac, ETH_P_IP);
struct icmphdr *icmp4h = tap_push_ip4h(ip4h, src, dst,
l4len, IPPROTO_ICMP);
@@ -367,7 +369,7 @@ void tap_udp6_send(const struct ctx *c,
{
size_t l4len = dlen + sizeof(struct udphdr);
char buf[USHRT_MAX];
- struct ipv6hdr *ip6h = tap_push_l2h(c, buf, ETH_P_IPV6);
+ struct ipv6hdr *ip6h = tap_push_l2h(c, buf, c->our_tap_mac, ETH_P_IPV6);
struct udphdr *uh = tap_push_ip6h(ip6h, src, dst,
l4len, IPPROTO_UDP, flow);
char *data = tap_push_uh6(uh, src, sport, dst, dport, in, dlen);
@@ -389,7 +391,7 @@ void tap_icmp6_send(const struct ctx *c,
const void *in, size_t l4len)
{
char buf[USHRT_MAX];
- struct ipv6hdr *ip6h = tap_push_l2h(c, buf, ETH_P_IPV6);
+ struct ipv6hdr *ip6h = tap_push_l2h(c, buf, c->our_tap_mac, ETH_P_IPV6);
struct icmp6hdr *icmp6h = tap_push_ip6h(ip6h, src, dst, l4len,
IPPROTO_ICMPV6, 0);
diff --git a/tap.h b/tap.h
index 21db4d2..02f7761 100644
--- a/tap.h
+++ b/tap.h
@@ -70,7 +70,8 @@ static inline void tap_hdr_update(struct tap_hdr *thdr, size_t l2len)
}
unsigned long tap_l2_max_len(const struct ctx *c);
-void *tap_push_l2h(const struct ctx *c, void *buf, uint16_t proto);
+void *tap_push_l2h(const struct ctx *c, void *buf,
+ const void *src_mac, uint16_t proto);
void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src,
struct in_addr dst, size_t l4len, uint8_t proto);
void *tap_push_uh4(struct udphdr *uh, struct in_addr src, in_port_t sport,
diff --git a/tcp.c b/tcp.c
index af05d35..b874317 100644
--- a/tcp.c
+++ b/tcp.c
@@ -1978,7 +1978,8 @@ static void tcp_rst_no_conn(const struct ctx *c, int af,
return;
if (af == AF_INET) {
- struct iphdr *ip4h = tap_push_l2h(c, buf, ETH_P_IP);
+ struct iphdr *ip4h = tap_push_l2h(c, buf, c->our_tap_mac,
+ ETH_P_IP);
const struct in_addr *rst_src = daddr;
const struct in_addr *rst_dst = saddr;
@@ -1988,7 +1989,7 @@ static void tcp_rst_no_conn(const struct ctx *c, int af,
*rst_src, *rst_dst);
} else {
- struct ipv6hdr *ip6h = tap_push_l2h(c, buf, ETH_P_IPV6);
+ struct ipv6hdr *ip6h = tap_push_l2h(c, buf, c->our_tap_mac, ETH_P_IPV6);
const struct in6_addr *rst_src = daddr;
const struct in6_addr *rst_dst = saddr;
--
2.50.1
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH v11 8/9] tap: change signature of function tap_push_l2h()
2025-09-27 19:25 ` [PATCH v11 8/9] tap: change signature of function tap_push_l2h() Jon Maloy
@ 2025-09-30 21:30 ` Stefano Brivio
2025-10-01 0:23 ` David Gibson
0 siblings, 1 reply; 27+ messages in thread
From: Stefano Brivio @ 2025-09-30 21:30 UTC (permalink / raw)
To: Jon Maloy; +Cc: dgibson, david, passt-dev
On Sat, 27 Sep 2025 15:25:21 -0400
Jon Maloy <jmaloy@redhat.com> wrote:
> In the next commit it must be possible for the callers of function
> tap_push_l2h() to specify which source MAC address should be
> added to the ethernet header sent over the tap interface. As a
> preparation, we now add a new argument to that function, still
> without any logical changes.
>
> Signed-off-by: Jon Maloy <jmaloy@redhat.com>
> Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
>
> ---
> v3: -Improved comment for src_mac argument, as suggested by Stefano.
> v4: -Re-added comment in tap.c and remove redundant argument check
> in tap_push_l2()
> v5: -Ensured that MAC argument to tap_push_l2h() never is NULL.
> v6: -Clarification of comment about ARP lookup
> ---
> tap.c | 16 +++++++++-------
> tap.h | 3 ++-
> tcp.c | 5 +++--
> 3 files changed, 14 insertions(+), 10 deletions(-)
>
> diff --git a/tap.c b/tap.c
> index 250a0f6..ec7e525 100644
> --- a/tap.c
> +++ b/tap.c
> @@ -171,17 +171,19 @@ const struct in6_addr *tap_ip6_daddr(const struct ctx *c,
> * tap_push_l2h() - Build an L2 header for an inbound packet
> * @c: Execution context
> * @buf: Buffer address at which to generate header
> + * @src_mac: MAC address to be used as source for message.
Trailing . which we never use.
> * @proto: Ethernet protocol number for L3
> *
> * Return: pointer at which to write the packet's payload
> */
> -void *tap_push_l2h(const struct ctx *c, void *buf, uint16_t proto)
> +void *tap_push_l2h(const struct ctx *c, void *buf,
> + const void *src_mac, uint16_t proto)
> {
> struct ethhdr *eh = (struct ethhdr *)buf;
>
> - /* TODO: ARP table lookup */
> + /* TODO: ARP lookup on tap side */
Why? I mean, what would we do with guest-side MAC addresses, as we
don't control Layer-2 host-side?
> memcpy(eh->h_dest, c->guest_mac, ETH_ALEN);
> - memcpy(eh->h_source, c->our_tap_mac, ETH_ALEN);
> + memcpy(eh->h_source, src_mac, ETH_ALEN);
> eh->h_proto = ntohs(proto);
> return eh + 1;
> }
> @@ -261,7 +263,7 @@ void tap_udp4_send(const struct ctx *c, struct in_addr src, in_port_t sport,
> {
> size_t l4len = dlen + sizeof(struct udphdr);
> char buf[USHRT_MAX];
> - struct iphdr *ip4h = tap_push_l2h(c, buf, ETH_P_IP);
> + struct iphdr *ip4h = tap_push_l2h(c, buf, c->our_tap_mac, ETH_P_IP);
> struct udphdr *uh = tap_push_ip4h(ip4h, src, dst, l4len, IPPROTO_UDP);
> char *data = tap_push_uh4(uh, src, sport, dst, dport, in, dlen);
>
> @@ -281,7 +283,7 @@ void tap_icmp4_send(const struct ctx *c, struct in_addr src, struct in_addr dst,
> const void *in, size_t l4len)
> {
> char buf[USHRT_MAX];
> - struct iphdr *ip4h = tap_push_l2h(c, buf, ETH_P_IP);
> + struct iphdr *ip4h = tap_push_l2h(c, buf, c->our_tap_mac, ETH_P_IP);
> struct icmphdr *icmp4h = tap_push_ip4h(ip4h, src, dst,
> l4len, IPPROTO_ICMP);
>
> @@ -367,7 +369,7 @@ void tap_udp6_send(const struct ctx *c,
> {
> size_t l4len = dlen + sizeof(struct udphdr);
> char buf[USHRT_MAX];
> - struct ipv6hdr *ip6h = tap_push_l2h(c, buf, ETH_P_IPV6);
> + struct ipv6hdr *ip6h = tap_push_l2h(c, buf, c->our_tap_mac, ETH_P_IPV6);
> struct udphdr *uh = tap_push_ip6h(ip6h, src, dst,
> l4len, IPPROTO_UDP, flow);
> char *data = tap_push_uh6(uh, src, sport, dst, dport, in, dlen);
> @@ -389,7 +391,7 @@ void tap_icmp6_send(const struct ctx *c,
> const void *in, size_t l4len)
> {
> char buf[USHRT_MAX];
> - struct ipv6hdr *ip6h = tap_push_l2h(c, buf, ETH_P_IPV6);
> + struct ipv6hdr *ip6h = tap_push_l2h(c, buf, c->our_tap_mac, ETH_P_IPV6);
> struct icmp6hdr *icmp6h = tap_push_ip6h(ip6h, src, dst, l4len,
> IPPROTO_ICMPV6, 0);
>
> diff --git a/tap.h b/tap.h
> index 21db4d2..02f7761 100644
> --- a/tap.h
> +++ b/tap.h
> @@ -70,7 +70,8 @@ static inline void tap_hdr_update(struct tap_hdr *thdr, size_t l2len)
> }
>
> unsigned long tap_l2_max_len(const struct ctx *c);
> -void *tap_push_l2h(const struct ctx *c, void *buf, uint16_t proto);
> +void *tap_push_l2h(const struct ctx *c, void *buf,
> + const void *src_mac, uint16_t proto);
> void *tap_push_ip4h(struct iphdr *ip4h, struct in_addr src,
> struct in_addr dst, size_t l4len, uint8_t proto);
> void *tap_push_uh4(struct udphdr *uh, struct in_addr src, in_port_t sport,
> diff --git a/tcp.c b/tcp.c
> index af05d35..b874317 100644
> --- a/tcp.c
> +++ b/tcp.c
> @@ -1978,7 +1978,8 @@ static void tcp_rst_no_conn(const struct ctx *c, int af,
> return;
>
> if (af == AF_INET) {
> - struct iphdr *ip4h = tap_push_l2h(c, buf, ETH_P_IP);
> + struct iphdr *ip4h = tap_push_l2h(c, buf, c->our_tap_mac,
> + ETH_P_IP);
> const struct in_addr *rst_src = daddr;
> const struct in_addr *rst_dst = saddr;
>
> @@ -1988,7 +1989,7 @@ static void tcp_rst_no_conn(const struct ctx *c, int af,
> *rst_src, *rst_dst);
>
> } else {
> - struct ipv6hdr *ip6h = tap_push_l2h(c, buf, ETH_P_IPV6);
> + struct ipv6hdr *ip6h = tap_push_l2h(c, buf, c->our_tap_mac, ETH_P_IPV6);
This can and should be wrapped to 80 columns.
> const struct in6_addr *rst_src = daddr;
> const struct in6_addr *rst_dst = saddr;
>
--
Stefano
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH v11 8/9] tap: change signature of function tap_push_l2h()
2025-09-30 21:30 ` Stefano Brivio
@ 2025-10-01 0:23 ` David Gibson
0 siblings, 0 replies; 27+ messages in thread
From: David Gibson @ 2025-10-01 0:23 UTC (permalink / raw)
To: Stefano Brivio; +Cc: Jon Maloy, dgibson, passt-dev
[-- Attachment #1: Type: text/plain, Size: 1095 bytes --]
On Tue, Sep 30, 2025 at 11:30:15PM +0200, Stefano Brivio wrote:
> On Sat, 27 Sep 2025 15:25:21 -0400
> Jon Maloy <jmaloy@redhat.com> wrote:
[snip]
> > -void *tap_push_l2h(const struct ctx *c, void *buf, uint16_t proto)
> > +void *tap_push_l2h(const struct ctx *c, void *buf,
> > + const void *src_mac, uint16_t proto)
> > {
> > struct ethhdr *eh = (struct ethhdr *)buf;
> >
> > - /* TODO: ARP table lookup */
> > + /* TODO: ARP lookup on tap side */
>
> Why? I mean, what would we do with guest-side MAC addresses, as we
> don't control Layer-2 host-side?
I actually have some loose plans in this area. We could repurpose the
ARP cache data structure to track MAC addresses on the guest side.
That would be one piece we'd need in order to handle multiple bridged
guests simultaneously. Obviously it would be populated by different
means than the host side cache.
--
David Gibson (he or they) | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au | minimalist, thank you, not the other way
| around.
http://www.ozlabs.org/~dgibson
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]
^ permalink raw reply [flat|nested] 27+ messages in thread
* [PATCH v11 9/9] icmp: let icmp use mac address from flowside structure
2025-09-27 19:25 [PATCH v11 0/9] Use true MAC address of LAN local remote hosts Jon Maloy
` (7 preceding siblings ...)
2025-09-27 19:25 ` [PATCH v11 8/9] tap: change signature of function tap_push_l2h() Jon Maloy
@ 2025-09-27 19:25 ` Jon Maloy
2025-09-30 21:30 ` Stefano Brivio
8 siblings, 1 reply; 27+ messages in thread
From: Jon Maloy @ 2025-09-27 19:25 UTC (permalink / raw)
To: sbrivio, dgibson, david, jmaloy, passt-dev
Even ICMP needs to be updated to use the external MAC address instead
of just the own tap address when applicable. We do that here.
Signed-off-by: Jon Maloy <jmaloy@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
v3: - Adapted to the move of external MAC address from struct flowside
to struct flow_common
v4: - Adapted to name changes in previous commits in this series
v5: - Added conditional lookup in ARP/NDP if the flow's tap_omac is
undefined
v6: - Looking up MAC of ICMP generating node in udp_send_tap_icmp4/6()
when available, instead trusting the contents of flow->tap_omac.
---
icmp.c | 8 ++++++--
ndp.c | 2 +-
tap.c | 10 ++++++----
tap.h | 4 ++--
udp.c | 12 ++++++++++--
5 files changed, 25 insertions(+), 11 deletions(-)
diff --git a/icmp.c b/icmp.c
index 6dffafb..1d99632 100644
--- a/icmp.c
+++ b/icmp.c
@@ -125,17 +125,21 @@ void icmp_sock_handler(const struct ctx *c, union epoll_ref ref)
flow_dbg(pingf, "echo reply to tap, ID: %"PRIu16", seq: %"PRIu16,
ini->eport, seq);
+ /* Try to find true MAC address in ARP/NDP table if needed */
+ if (MAC_IS_ZERO(pingf->f.tap_omac))
+ fwd_neigh_mac_get(c, &ini->oaddr, pingf->f.tap_omac);
+
if (pingf->f.type == FLOW_PING4) {
const struct in_addr *saddr = inany_v4(&ini->oaddr);
const struct in_addr *daddr = inany_v4(&ini->eaddr);
ASSERT(saddr && daddr); /* Must have IPv4 addresses */
- tap_icmp4_send(c, *saddr, *daddr, buf, n);
+ tap_icmp4_send(c, *saddr, *daddr, buf, pingf->f.tap_omac, n);
} else if (pingf->f.type == FLOW_PING6) {
const struct in6_addr *saddr = &ini->oaddr.a6;
const struct in6_addr *daddr = &ini->eaddr.a6;
- tap_icmp6_send(c, saddr, daddr, buf, n);
+ tap_icmp6_send(c, saddr, daddr, buf, pingf->f.tap_omac, n);
}
return;
diff --git a/ndp.c b/ndp.c
index 610a06c..5568787 100644
--- a/ndp.c
+++ b/ndp.c
@@ -184,7 +184,7 @@ static void ndp_send(const struct ctx *c, const struct in6_addr *dst,
{
const struct in6_addr *src = &c->ip6.our_tap_ll;
- tap_icmp6_send(c, src, dst, buf, l4len);
+ tap_icmp6_send(c, src, dst, buf, c->our_tap_mac, l4len);
}
/**
diff --git a/tap.c b/tap.c
index ec7e525..2a32da5 100644
--- a/tap.c
+++ b/tap.c
@@ -277,13 +277,14 @@ void tap_udp4_send(const struct ctx *c, struct in_addr src, in_port_t sport,
* @src: IPv4 source address
* @dst: IPv4 destination address
* @in: ICMP packet, including ICMP header
+ * @src_mac: MAC address to be used as source for message
* @l4len: ICMP packet length, including ICMP header
*/
void tap_icmp4_send(const struct ctx *c, struct in_addr src, struct in_addr dst,
- const void *in, size_t l4len)
+ const void *in, const void *src_mac, size_t l4len)
{
char buf[USHRT_MAX];
- struct iphdr *ip4h = tap_push_l2h(c, buf, c->our_tap_mac, ETH_P_IP);
+ struct iphdr *ip4h = tap_push_l2h(c, buf, src_mac, ETH_P_IP);
struct icmphdr *icmp4h = tap_push_ip4h(ip4h, src, dst,
l4len, IPPROTO_ICMP);
@@ -384,14 +385,15 @@ void tap_udp6_send(const struct ctx *c,
* @src: IPv6 source address
* @dst: IPv6 destination address
* @in: ICMP packet, including ICMP header
+ * @src_mac: MAC address to be used as source for message
* @l4len: ICMP packet length, including ICMP header
*/
void tap_icmp6_send(const struct ctx *c,
const struct in6_addr *src, const struct in6_addr *dst,
- const void *in, size_t l4len)
+ const void *in, const void *src_mac, size_t l4len)
{
char buf[USHRT_MAX];
- struct ipv6hdr *ip6h = tap_push_l2h(c, buf, c->our_tap_mac, ETH_P_IPV6);
+ struct ipv6hdr *ip6h = tap_push_l2h(c, buf, src_mac, ETH_P_IPV6);
struct icmp6hdr *icmp6h = tap_push_ip6h(ip6h, src, dst, l4len,
IPPROTO_ICMPV6, 0);
diff --git a/tap.h b/tap.h
index 02f7761..1864173 100644
--- a/tap.h
+++ b/tap.h
@@ -91,7 +91,7 @@ void tap_udp4_send(const struct ctx *c, struct in_addr src, in_port_t sport,
struct in_addr dst, in_port_t dport,
const void *in, size_t dlen);
void tap_icmp4_send(const struct ctx *c, struct in_addr src, struct in_addr dst,
- const void *in, size_t l4len);
+ const void *in, const void *src_mac, size_t l4len);
const struct in6_addr *tap_ip6_daddr(const struct ctx *c,
const struct in6_addr *src);
void *tap_push_ip6h(struct ipv6hdr *ip6h,
@@ -103,7 +103,7 @@ void tap_udp6_send(const struct ctx *c,
uint32_t flow, void *in, size_t dlen);
void tap_icmp6_send(const struct ctx *c,
const struct in6_addr *src, const struct in6_addr *dst,
- const void *in, size_t l4len);
+ const void *in, const void *src_mac, size_t l4len);
void tap_send_single(const struct ctx *c, const void *data, size_t l2len);
size_t tap_send_frames(const struct ctx *c, const struct iovec *iov,
size_t bufs_per_frame, size_t nframes);
diff --git a/udp.c b/udp.c
index eb57f05..ff15e37 100644
--- a/udp.c
+++ b/udp.c
@@ -400,6 +400,8 @@ static void udp_send_tap_icmp4(const struct ctx *c,
struct in_addr eaddr = toside->eaddr.v4mapped.a4;
in_port_t eport = toside->eport;
in_port_t oport = toside->oport;
+ union inany_addr saddr_any;
+ uint8_t tap_omac[ETH_ALEN];
struct {
struct icmphdr icmp4h;
struct iphdr ip4h;
@@ -421,7 +423,10 @@ static void udp_send_tap_icmp4(const struct ctx *c,
tap_push_uh4(&msg.uh, eaddr, eport, oaddr, oport, in, dlen);
memcpy(&msg.data, in, dlen);
- tap_icmp4_send(c, saddr, eaddr, &msg, msglen);
+ /* Try to obtain the MAC address of the generating node */
+ saddr_any = inany_from_v4(saddr);
+ fwd_neigh_mac_get(c, &saddr_any, tap_omac);
+ tap_icmp4_send(c, saddr, eaddr, &msg, tap_omac, msglen);
}
@@ -445,6 +450,7 @@ static void udp_send_tap_icmp6(const struct ctx *c,
const struct in6_addr *eaddr = &toside->eaddr.a6;
in_port_t eport = toside->eport;
in_port_t oport = toside->oport;
+ uint8_t tap_omac[ETH_ALEN];
struct {
struct icmp6_hdr icmp6h;
struct ipv6hdr ip6h;
@@ -466,7 +472,9 @@ static void udp_send_tap_icmp6(const struct ctx *c,
tap_push_uh6(&msg.uh, eaddr, eport, oaddr, oport, in, dlen);
memcpy(&msg.data, in, dlen);
- tap_icmp6_send(c, saddr, eaddr, &msg, msglen);
+ /* Try to obtain the MAC address of the generating node */
+ fwd_neigh_mac_get(c, (union inany_addr *) saddr, tap_omac);
+ tap_icmp6_send(c, saddr, eaddr, &msg, tap_omac, msglen);
}
/**
--
2.50.1
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH v11 9/9] icmp: let icmp use mac address from flowside structure
2025-09-27 19:25 ` [PATCH v11 9/9] icmp: let icmp use mac address from flowside structure Jon Maloy
@ 2025-09-30 21:30 ` Stefano Brivio
0 siblings, 0 replies; 27+ messages in thread
From: Stefano Brivio @ 2025-09-30 21:30 UTC (permalink / raw)
To: Jon Maloy; +Cc: dgibson, david, passt-dev
On Sat, 27 Sep 2025 15:25:22 -0400
Jon Maloy <jmaloy@redhat.com> wrote:
> Even ICMP needs to be updated to use the external MAC address instead
> of just the own tap address when applicable. We do that here.
>
> Signed-off-by: Jon Maloy <jmaloy@redhat.com>
> Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
>
> ---
> v3: - Adapted to the move of external MAC address from struct flowside
> to struct flow_common
> v4: - Adapted to name changes in previous commits in this series
> v5: - Added conditional lookup in ARP/NDP if the flow's tap_omac is
> undefined
> v6: - Looking up MAC of ICMP generating node in udp_send_tap_icmp4/6()
> when available, instead trusting the contents of flow->tap_omac.
> ---
> icmp.c | 8 ++++++--
> ndp.c | 2 +-
> tap.c | 10 ++++++----
> tap.h | 4 ++--
> udp.c | 12 ++++++++++--
> 5 files changed, 25 insertions(+), 11 deletions(-)
>
> diff --git a/icmp.c b/icmp.c
> index 6dffafb..1d99632 100644
> --- a/icmp.c
> +++ b/icmp.c
> @@ -125,17 +125,21 @@ void icmp_sock_handler(const struct ctx *c, union epoll_ref ref)
> flow_dbg(pingf, "echo reply to tap, ID: %"PRIu16", seq: %"PRIu16,
> ini->eport, seq);
>
> + /* Try to find true MAC address in ARP/NDP table if needed */
Same as for 4/9: consider using "recorded".
Everything else looks good to me! But I haven't tried using my little
hammer on it, yet.
--
Stefano
^ permalink raw reply [flat|nested] 27+ messages in thread