public inbox for passt-dev@passt.top
 help / color / mirror / code / Atom feed
From: Stefano Brivio <sbrivio@redhat.com>
To: David Gibson <david@gibson.dropbear.id.au>
Cc: passt-dev@passt.top
Subject: Re: [PATCH v3 14/14] flow, fwd: Optimise forwarding rule lookup using epoll ref when possible
Date: Tue, 13 Jan 2026 23:13:03 +0100	[thread overview]
Message-ID: <20260113231303.78fb3471@elisabeth> (raw)
In-Reply-To: <20260108022948.2657573-15-david@gibson.dropbear.id.au>

On Thu,  8 Jan 2026 13:29:48 +1100
David Gibson <david@gibson.dropbear.id.au> wrote:

> Now that listening sockets include a reference to the forwarding rule which
> created them we can, in many cases, avoid a linear search of the forwarding
> table when we want to find the relevant rule.  Instead we can take the
> rule index from the socket's epoll reference, and use that to immediately
> find the correct rule.
> 
> This is conceptually simple, but requires a moderate amount of plumbing to
> get the index from the reference through to the rule lookup.  We still
> allow fall back to linear search if we don't have the index, and this may
> (rarely) be used in the udp_flush_flow() case, where we could get packets
> for one flow on a different flow's socket, rather than through a listening
> socket as usual.
> 
> Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
> ---
>  flow.c         | 13 +++++++++++--
>  flow_table.h   |  2 +-
>  fwd.c          |  3 +--
>  fwd.h          |  1 +
>  icmp.c         |  2 +-
>  tcp.c          |  4 ++--
>  udp.c          | 14 +++++++++-----
>  udp_flow.c     | 14 ++++++++------
>  udp_flow.h     |  2 +-
>  udp_internal.h |  4 ++--
>  10 files changed, 37 insertions(+), 22 deletions(-)
> 
> diff --git a/flow.c b/flow.c
> index 38f88732..85759970 100644
> --- a/flow.c
> +++ b/flow.c
> @@ -482,12 +482,13 @@ struct flowside *flow_initiate_sa(union flow *flow, uint8_t pif,
>   * flow_target() - Determine where flow should forward to, and move to TGT
>   * @c:		Execution context
>   * @flow:	Flow to forward
> + * @rule_hint:	Index of relevant forwarding rule, or -1 if unknown
>   * @proto:	Protocol
>   *
>   * Return: pointer to the target flowside information
>   */
>  struct flowside *flow_target(const struct ctx *c, union flow *flow,
> -			     uint8_t proto)
> +			     int rule_hint, uint8_t proto)
>  {
>  	char estr[INANY_ADDRSTRLEN], ostr[INANY_ADDRSTRLEN];
>  	struct flow_common *f = &flow->f;
> @@ -515,7 +516,15 @@ struct flowside *flow_target(const struct ctx *c, union flow *flow,
>  		else
>  			goto nofwd;
>  
> -		if (!(rule = fwd_rule_search(fwd, ini))) {
> +		if (rule_hint >= 0) {
> +			ASSERT((unsigned)rule_hint < fwd->count);
> +			rule = &fwd->rules[rule_hint];
> +			if (!fwd_rule_match(rule, ini)) {
> +				flow_dbg(flow,
> +					 "Unexpected mismatching forward rule");
> +				goto nofwd;
> +			}
> +		} else if (!(rule = fwd_rule_search(fwd, ini))) {

*If* we need this fwd_rule_search() / fwd_rule_match() / else if
fwd_rule_search() pattern in more places, it might be convenient to
hide the complexity by taking 'rule_hint' as argument for
fwd_rule_search() perhaps, and then do something like this in
fwd_rule_search():

	for (i = hint == -1 ? 0 : hint; i < fwd->count; i++) {
		if (fwd_rule_match(&fwd->rules[i], ini))
			return &fwd->rules[i];
		if (hint)
			break;
	}

	return NULL;

but I haven't really thought this through. It makes fwd_rule_search() a
bit ugly.

>  			/* This shouldn't happen, because if there's no rule for
>  			 * it we should have no listening socket that would let
>  			 * us get here
> diff --git a/flow_table.h b/flow_table.h
> index 5ee13acc..73de13ba 100644
> --- a/flow_table.h
> +++ b/flow_table.h
> @@ -207,7 +207,7 @@ const struct flowside *flow_target_af(union flow *flow, uint8_t pif,
>  				      const void *saddr, in_port_t sport,
>  				      const void *daddr, in_port_t dport);
>  struct flowside *flow_target(const struct ctx *c, union flow *flow,
> -			     uint8_t proto);
> +			     int rule_hint, uint8_t proto);
>  
>  union flow *flow_set_type(union flow *flow, enum flow_type type);
>  #define FLOW_SET_TYPE(flow_, t_, var_)	(&flow_set_type((flow_), (t_))->var_)
> diff --git a/fwd.c b/fwd.c
> index 6727d26f..250b470c 100644
> --- a/fwd.c
> +++ b/fwd.c
> @@ -415,8 +415,7 @@ void fwd_rule_add(struct fwd_ports *fwd, uint8_t flags,
>   *
>   * Returns: true if the rule applies to the flow, false otherwise
>   */
> -static bool fwd_rule_match(const struct fwd_rule *rule,
> -			   const struct flowside *ini)
> +bool fwd_rule_match(const struct fwd_rule *rule, const struct flowside *ini)
>  {
>  	return inany_matches(&ini->oaddr, fwd_rule_addr(rule)) &&
>  		ini->oport >= rule->first && ini->oport <= rule->last;
> diff --git a/fwd.h b/fwd.h
> index 435f422a..2af7791d 100644
> --- a/fwd.h
> +++ b/fwd.h
> @@ -106,6 +106,7 @@ struct fwd_ports {
>  void fwd_rule_add(struct fwd_ports *fwd, uint8_t flags,
>  		  const union inany_addr *addr, const char *ifname,
>  		  in_port_t first, in_port_t last, in_port_t to);
> +bool fwd_rule_match(const struct fwd_rule *rule, const struct flowside *ini);
>  const struct fwd_rule *fwd_rule_search(const struct fwd_ports *fwd,
>  				       const struct flowside *ini);
>  void fwd_rules_print(const struct fwd_ports *fwd);
> diff --git a/icmp.c b/icmp.c
> index 9564c496..0f4d48bb 100644
> --- a/icmp.c
> +++ b/icmp.c
> @@ -183,7 +183,7 @@ static struct icmp_ping_flow *icmp_ping_new(const struct ctx *c,
>  		return NULL;
>  
>  	flow_initiate_af(flow, PIF_TAP, af, saddr, id, daddr, id);
> -	if (!(tgt = flow_target(c, flow, proto)))
> +	if (!(tgt = flow_target(c, flow, -1, proto)))
>  		goto cancel;
>  
>  	if (flow->f.pif[TGTSIDE] != PIF_HOST) {
> diff --git a/tcp.c b/tcp.c
> index fc03e38f..89b22a51 100644
> --- a/tcp.c
> +++ b/tcp.c
> @@ -1657,7 +1657,7 @@ static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af,
>  	ini = flow_initiate_af(flow, PIF_TAP,
>  			       af, saddr, srcport, daddr, dstport);
>  
> -	if (!(tgt = flow_target(c, flow, IPPROTO_TCP)))
> +	if (!(tgt = flow_target(c, flow, -1, IPPROTO_TCP)))
>  		goto cancel;
>  
>  	if (flow->f.pif[TGTSIDE] != PIF_HOST) {
> @@ -2496,7 +2496,7 @@ void tcp_listen_handler(const struct ctx *c, union epoll_ref ref,
>  		goto cancel;
>  	}
>  
> -	if (!flow_target(c, flow, IPPROTO_TCP))
> +	if (!flow_target(c, flow, ref.listen.rule, IPPROTO_TCP))
>  		goto cancel;
>  
>  	switch (flow->f.pif[TGTSIDE]) {
> diff --git a/udp.c b/udp.c
> index 761221f6..d7e47e52 100644
> --- a/udp.c
> +++ b/udp.c
> @@ -838,12 +838,13 @@ static void udp_buf_sock_to_tap(const struct ctx *c, int s, int n,
>   * udp_sock_fwd() - Forward datagrams from a possibly unconnected socket
>   * @c:		Execution context
>   * @s:		Socket to forward from
> + * @rule_hint:	Forwarding rule to use, or -1 if unknown
>   * @frompif:	Interface to which @s belongs
>   * @port:	Our (local) port number of @s
>   * @now:	Current timestamp
>   */
> -void udp_sock_fwd(const struct ctx *c, int s, uint8_t frompif,
> -		  in_port_t port, const struct timespec *now)
> +void udp_sock_fwd(const struct ctx *c, int s, int rule_hint,
> +		  uint8_t frompif, in_port_t port, const struct timespec *now)
>  {
>  	union sockaddr_inany src;
>  	union inany_addr dst;
> @@ -868,7 +869,8 @@ void udp_sock_fwd(const struct ctx *c, int s, uint8_t frompif,
>  			continue;
>  		}
>  
> -		tosidx = udp_flow_from_sock(c, frompif, &dst, port, &src, now);
> +		tosidx = udp_flow_from_sock(c, frompif, &dst, port, &src,
> +					    rule_hint, now);
>  		topif = pif_at_sidx(tosidx);
>  
>  		if (pif_is_socket(topif)) {
> @@ -910,8 +912,10 @@ void udp_listen_sock_handler(const struct ctx *c,
>  			     union epoll_ref ref, uint32_t events,
>  			     const struct timespec *now)
>  {
> -	if (events & (EPOLLERR | EPOLLIN))
> -		udp_sock_fwd(c, ref.fd, ref.listen.pif, ref.listen.port, now);
> +	if (events & (EPOLLERR | EPOLLIN)) {
> +		udp_sock_fwd(c, ref.fd, ref.listen.rule,
> +			     ref.listen.pif, ref.listen.port, now);
> +	}
>  }
>  
>  /**
> diff --git a/udp_flow.c b/udp_flow.c
> index 8907f2f7..b82c6c06 100644
> --- a/udp_flow.c
> +++ b/udp_flow.c
> @@ -139,6 +139,7 @@ static int udp_flow_sock(const struct ctx *c,
>   * udp_flow_new() - Common setup for a new UDP flow
>   * @c:		Execution context
>   * @flow:	Initiated flow
> + * @rule_hint:	Index of forwarding rule, or -1 if unknown
>   * @now:	Timestamp
>   *
>   * Return: sidx for the target side of the new UDP flow, or FLOW_SIDX_NONE
> @@ -147,13 +148,13 @@ static int udp_flow_sock(const struct ctx *c,
>   * #syscalls getsockname
>   */
>  static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow,
> -				const struct timespec *now)
> +				int rule_hint, const struct timespec *now)
>  {
>  	struct udp_flow *uflow = NULL;
>  	const struct flowside *tgt;
>  	unsigned sidei;
>  
> -	if (!(tgt = flow_target(c, flow, IPPROTO_UDP)))
> +	if (!(tgt = flow_target(c, flow, rule_hint, IPPROTO_UDP)))
>  		goto cancel;
>  
>  	uflow = FLOW_SET_TYPE(flow, FLOW_UDP, udp);
> @@ -216,6 +217,7 @@ cancel:
>   * @dst:	Our (local) address to which the datagram is arriving
>   * @port:	Our (local) port number to which the datagram is arriving
>   * @s_in:	Source socket address, filled in by recvmmsg()
> + * @rule_hint:	Index of forwarding rule, or -1 if unknown
>   * @now:	Timestamp
>   *
>   * #syscalls fcntl arm:fcntl64 ppc64:fcntl64|fcntl i686:fcntl64
> @@ -226,7 +228,7 @@ cancel:
>  flow_sidx_t udp_flow_from_sock(const struct ctx *c, uint8_t pif,
>  			       const union inany_addr *dst, in_port_t port,
>  			       const union sockaddr_inany *s_in,
> -			       const struct timespec *now)
> +			       int rule_hint, const struct timespec *now)
>  {
>  	const struct flowside *ini;
>  	struct udp_flow *uflow;
> @@ -260,7 +262,7 @@ flow_sidx_t udp_flow_from_sock(const struct ctx *c, uint8_t pif,
>  		return FLOW_SIDX_NONE;
>  	}
>  
> -	return udp_flow_new(c, flow, now);
> +	return udp_flow_new(c, flow, rule_hint, now);
>  }
>  
>  /**
> @@ -316,7 +318,7 @@ flow_sidx_t udp_flow_from_tap(const struct ctx *c,
>  		return FLOW_SIDX_NONE;
>  	}
>  
> -	return udp_flow_new(c, flow, now);
> +	return udp_flow_new(c, flow, -1, now);
>  }
>  
>  /**
> @@ -332,7 +334,7 @@ static void udp_flush_flow(const struct ctx *c,
>  {
>  	/* We don't know exactly where the datagrams will come from, but we know
>  	 * they'll have an interface and oport matching this flow */
> -	udp_sock_fwd(c, uflow->s[sidei], uflow->f.pif[sidei],
> +	udp_sock_fwd(c, -1, uflow->s[sidei], uflow->f.pif[sidei],

I guess this is fixed on your local branch because this patch (and only
this one) applied with some fuzz, but in case it's not... this passes
-1 as socket ('s' argument), not as 'rule_hint'.

If I swap it with uflow->s[sidei] it all works as expected.

>  		     uflow->f.side[sidei].oport, now);
>  }
>  
> diff --git a/udp_flow.h b/udp_flow.h
> index 4c528e95..14e0f920 100644
> --- a/udp_flow.h
> +++ b/udp_flow.h
> @@ -35,7 +35,7 @@ struct udp_flow *udp_at_sidx(flow_sidx_t sidx);
>  flow_sidx_t udp_flow_from_sock(const struct ctx *c, uint8_t pif,
>  			       const union inany_addr *dst, in_port_t port,
>  			       const union sockaddr_inany *s_in,
> -			       const struct timespec *now);
> +			       int rule_hint, const struct timespec *now);
>  flow_sidx_t udp_flow_from_tap(const struct ctx *c,
>  			      uint8_t pif, sa_family_t af,
>  			      const void *saddr, const void *daddr,
> diff --git a/udp_internal.h b/udp_internal.h
> index 96d11cff..f0ce8f14 100644
> --- a/udp_internal.h
> +++ b/udp_internal.h
> @@ -28,7 +28,7 @@ size_t udp_update_hdr4(struct iphdr *ip4h, struct udp_payload_t *bp,
>  size_t udp_update_hdr6(struct ipv6hdr *ip6h, struct udp_payload_t *bp,
>                         const struct flowside *toside, size_t dlen,
>  		       bool no_udp_csum);
> -void udp_sock_fwd(const struct ctx *c, int s, uint8_t frompif,
> -		  in_port_t port, const struct timespec *now);
> +void udp_sock_fwd(const struct ctx *c, int s, int rule_hint,
> +		  uint8_t frompif, in_port_t port, const struct timespec *now);
>  
>  #endif /* UDP_INTERNAL_H */

So, uh, having reached this point, I was asking myself "is this really
everything"? :)

Well, it's not exactly everything as you mentioned in the cover letter,
perhaps the biggest missing part (other than client / interface) is
outbound forwarding, but still, reaching this point with just 14
relatively small patches is quite impressive.

I guess it's also merit of how generic / well structured the flow table
and especially the flow "stages" are.

-- 
Stefano


      reply	other threads:[~2026-01-13 22:13 UTC|newest]

Thread overview: 41+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-01-08  2:29 [PATCH v3 00/14] Introduce forwarding table David Gibson
2026-01-08  2:29 ` [PATCH v3 01/14] inany: Extend inany_ntop() to treat NULL as a fully unspecified address David Gibson
2026-01-08 13:16   ` Laurent Vivier
2026-01-08  2:29 ` [PATCH v3 02/14] conf, fwd: Keep a table of our port forwarding configuration David Gibson
2026-01-12 23:26   ` Stefano Brivio
2026-01-13  5:12     ` David Gibson
2026-01-13 22:13       ` Stefano Brivio
2026-01-13 23:53         ` David Gibson
2026-01-08  2:29 ` [PATCH v3 03/14] conf: Accurately record ifname and address for outbound forwards David Gibson
2026-01-08  2:29 ` [PATCH v3 04/14] conf, fwd: Record "auto" port forwards in forwarding table David Gibson
2026-01-12 23:26   ` Stefano Brivio
2026-01-13  5:14     ` David Gibson
2026-01-08  2:29 ` [PATCH v3 05/14] fwd: Make space to store listening sockets in forward table David Gibson
2026-01-12 23:26   ` Stefano Brivio
2026-01-13  5:28     ` David Gibson
2026-01-13 22:13       ` Stefano Brivio
2026-01-13 23:57         ` David Gibson
2026-01-08  2:29 ` [PATCH v3 06/14] ip: Add ipproto_name() function David Gibson
2026-01-08 13:22   ` Laurent Vivier
2026-01-08 23:12     ` David Gibson
2026-01-08  2:29 ` [PATCH v3 07/14] fwd, tcp, udp: Set up listening sockets based on forward table David Gibson
2026-01-12 23:26   ` Stefano Brivio
2026-01-13  5:38     ` David Gibson
2026-01-13 22:13       ` Stefano Brivio
2026-01-13 23:59         ` David Gibson
2026-01-08  2:29 ` [PATCH v3 08/14] tcp, udp: Remove old auto-forwarding socket arrays David Gibson
2026-01-08  2:29 ` [PATCH v3 09/14] conf, fwd: Check forwarding table for conflicting rules David Gibson
2026-01-12 23:26   ` Stefano Brivio
2026-01-13  5:41     ` David Gibson
2026-01-08  2:29 ` [PATCH v3 10/14] fwd: Generate auto-forward exclusions from socket fd tables David Gibson
2026-01-08  2:29 ` [PATCH v3 11/14] flow, fwd: Consult rules table when forwarding a new flow from socket David Gibson
2026-01-13 22:12   ` Stefano Brivio
2026-01-14  0:09     ` David Gibson
2026-01-08  2:29 ` [PATCH v3 12/14] fwd: Remap ports based directly on forwarding rule David Gibson
2026-01-13 22:12   ` Stefano Brivio
2026-01-14  0:24     ` David Gibson
2026-01-08  2:29 ` [PATCH v3 13/14] fwd, tcp, udp: Add forwarding rule to listening socket epoll references David Gibson
2026-01-13 22:12   ` Stefano Brivio
2026-01-14  0:37     ` David Gibson
2026-01-08  2:29 ` [PATCH v3 14/14] flow, fwd: Optimise forwarding rule lookup using epoll ref when possible David Gibson
2026-01-13 22:13   ` Stefano Brivio [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260113231303.78fb3471@elisabeth \
    --to=sbrivio@redhat.com \
    --cc=david@gibson.dropbear.id.au \
    --cc=passt-dev@passt.top \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this public inbox

	https://passt.top/passt

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for IMAP folder(s).