On Thu, Mar 27, 2025 at 12:51:54PM -0400, Jon Maloy wrote: > As preparation for future enhancements we add ttl/hop limit as creation > and lookup criteria for outgoing flows. This comes in addition to the > regular 4-tuple which is currently used. > > Signed-off-by: Jon Maloy Ah... sorry. I think I was misleading when I suggested adding the TTL to the flow table. This change does not make sense. The TTL is a per-packet property, not a property of the flow. Packets which match the addresses and ports should be considered part of the same flow, even if the TTL is different. Deliberately changing the TTL during a flow would be weird, but it's not illegal. I think it could also happen by accident with certain multipath setups. When I suggested adding the TTL to the flow, I didn't mean adding it to flow_common or what we use to look up flows. Instead, I essentially meant a cache of the value we last set for IP_TTL on the socket. We can store that in struct udp_flow. It's probably neatest to use an array of two values, one for each side. We'll rarely have a socket on both sides (spliced), and when we do we can't use this approach anyway, but it'll make the code a bit cleaner to do it that way, because it will parallel the array of (up to) two sockets. 
> --- > flow.c | 17 ++++++++++++----- > flow.h | 8 ++++++-- > flow_table.h | 3 ++- > icmp.c | 15 ++++++++++----- > icmp.h | 2 +- > packet.h | 2 ++ > tap.c | 25 ++++++++++++++++++------- > tcp.c | 6 +++--- > udp.c | 8 ++++++-- > udp.h | 3 ++- > udp_flow.c | 7 ++++--- > udp_flow.h | 2 +- > 12 files changed, 67 insertions(+), 31 deletions(-) > > diff --git a/flow.c b/flow.c > index 8622242..6796f73 100644 > --- a/flow.c > +++ b/flow.c > @@ -137,10 +137,12 @@ static struct timespec flow_timer_run; > * @eport: Endpoint port > * @oaddr: Our address (pointer to in_addr or in6_addr) > * @oport: Our port > + * @ttl: TTL/hop limit for packets in flow > */ > static void flowside_from_af(struct flowside *side, sa_family_t af, > const void *eaddr, in_port_t eport, > - const void *oaddr, in_port_t oport) > + const void *oaddr, in_port_t oport, > + uint8_t ttl) > { > if (oaddr) > inany_from_af(&side->oaddr, af, oaddr); > @@ -153,6 +155,8 @@ static void flowside_from_af(struct flowside *side, sa_family_t af, > else > side->eaddr = inany_any6; > side->eport = eport; > + > + side->ttl = ttl; > } > > /** > @@ -376,17 +380,19 @@ static void flow_initiate_(union flow *flow, uint8_t pif) > * @sport: Endpoint port > * @daddr: Destination address (pointer to in_addr or in6_addr) > * @dport: Destination port > + * @ttl: TTL/hop_limit for packets in flow > * > * Return: pointer to the initiating flowside information > */ > const struct flowside *flow_initiate_af(union flow *flow, uint8_t pif, > sa_family_t af, > const void *saddr, in_port_t sport, > - const void *daddr, in_port_t dport) > + const void *daddr, in_port_t dport, > + uint8_t ttl) > { > struct flowside *ini = &flow->f.side[INISIDE]; > > - flowside_from_af(ini, af, saddr, sport, daddr, dport); > + flowside_from_af(ini, af, saddr, sport, daddr, dport, ttl); > flow_initiate_(flow, pif); > return ini; > } > @@ -731,17 +737,18 @@ static flow_sidx_t flowside_lookup(const struct ctx *c, uint8_t proto, > * @oaddr: Our guest side address 
(guest remote address) > * @eport: Guest side endpoint port (guest local port) > * @oport: Our guest side port (guest remote port) > + * @ttl: TTL/hop_limit of flow we are looking for > * > * Return: sidx of the matching flow & side, FLOW_SIDX_NONE if not found > */ > flow_sidx_t flow_lookup_af(const struct ctx *c, > uint8_t proto, uint8_t pif, sa_family_t af, > const void *eaddr, const void *oaddr, > - in_port_t eport, in_port_t oport) > + in_port_t eport, in_port_t oport, uint8_t ttl) > { > struct flowside side; > > - flowside_from_af(&side, af, eaddr, eport, oaddr, oport); > + flowside_from_af(&side, af, eaddr, eport, oaddr, oport, ttl); > return flowside_lookup(c, proto, pif, &side); > } > > diff --git a/flow.h b/flow.h > index dcf7645..2ba4a94 100644 > --- a/flow.h > +++ b/flow.h > @@ -143,12 +143,14 @@ extern const uint8_t flow_proto[]; > * @oaddr: Our address (local address from passt's PoV) > * @eport: Endpoint port > * @oport: Our port > + * @ttl: TTL/hop limit for this flow > */ > struct flowside { > union inany_addr oaddr; > union inany_addr eaddr; > in_port_t oport; > in_port_t eport; > + uint8_t ttl; > }; > > /** > @@ -163,7 +165,8 @@ static inline bool flowside_eq(const struct flowside *left, > return inany_equals(&left->eaddr, &right->eaddr) && > left->eport == right->eport && > inany_equals(&left->oaddr, &right->oaddr) && > - left->oport == right->oport; > + left->oport == right->oport && > + left->ttl == right->ttl; > } > > int flowside_sock_l4(const struct ctx *c, enum epoll_type type, uint8_t pif, > @@ -241,7 +244,8 @@ void flow_hash_remove(const struct ctx *c, flow_sidx_t sidx); > flow_sidx_t flow_lookup_af(const struct ctx *c, > uint8_t proto, uint8_t pif, sa_family_t af, > const void *eaddr, const void *oaddr, > - in_port_t eport, in_port_t oport); > + in_port_t eport, in_port_t oport, > + uint8_t ttl); > flow_sidx_t flow_lookup_sa(const struct ctx *c, uint8_t proto, uint8_t pif, > const void *esa, in_port_t oport); > > diff --git 
a/flow_table.h b/flow_table.h > index fd2c57b..0b5b431 100644 > --- a/flow_table.h > +++ b/flow_table.h > @@ -196,7 +196,8 @@ void flow_alloc_cancel(union flow *flow); > const struct flowside *flow_initiate_af(union flow *flow, uint8_t pif, > sa_family_t af, > const void *saddr, in_port_t sport, > - const void *daddr, in_port_t dport); > + const void *daddr, in_port_t dport, > + uint8_t ttl); > struct flowside *flow_initiate_sa(union flow *flow, uint8_t pif, > const union sockaddr_inany *ssa, > in_port_t dport); > diff --git a/icmp.c b/icmp.c > index 7e2b342..cbaa000 100644 > --- a/icmp.c > +++ b/icmp.c > @@ -162,12 +162,14 @@ static void icmp_ping_close(const struct ctx *c, > * @id: ICMP id for the new socket > * @saddr: Source address > * @daddr: Destination address > + * @ttl: TTL/hop_imit > * > * Return: Newly opened ping flow, or NULL on failure > */ > static struct icmp_ping_flow *icmp_ping_new(const struct ctx *c, > sa_family_t af, uint16_t id, > - const void *saddr, const void *daddr) > + const void *saddr, const void *daddr, > + uint8_t ttl) > { > uint8_t proto = af == AF_INET ? IPPROTO_ICMP : IPPROTO_ICMPV6; > uint8_t flowtype = af == AF_INET ? 
FLOW_PING4 : FLOW_PING6; > @@ -179,7 +181,7 @@ static struct icmp_ping_flow *icmp_ping_new(const struct ctx *c, > if (!flow) > return NULL; > > - flow_initiate_af(flow, PIF_TAP, af, saddr, id, daddr, id); > + flow_initiate_af(flow, PIF_TAP, af, saddr, id, daddr, ttl, id); > if (!(tgt = flow_target(c, flow, proto))) > goto cancel; > > @@ -235,7 +237,7 @@ cancel: > * Return: count of consumed packets (always 1, even if malformed) > */ > int icmp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, > - const void *saddr, const void *daddr, > + const void *saddr, const void *daddr, uint8_t ttl, > const struct pool *p, const struct timespec *now) > { > struct icmp_ping_flow *pingf; > @@ -286,11 +288,14 @@ int icmp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, > } > > flow = flow_at_sidx(flow_lookup_af(c, proto, PIF_TAP, > - af, saddr, daddr, id, id)); > + af, saddr, daddr, ttl, id, id)); > > if (flow) > pingf = &flow->ping; > - else if (!(pingf = icmp_ping_new(c, af, id, saddr, daddr))) > + else > + pingf = icmp_ping_new(c, af, id, saddr, daddr, ttl); > + > + if (!pingf) > return 1; > > tgt = &pingf->f.side[TGTSIDE]; > diff --git a/icmp.h b/icmp.h > index 5ce22b5..18168ab 100644 > --- a/icmp.h > +++ b/icmp.h > @@ -13,7 +13,7 @@ struct icmp_ping_flow; > > void icmp_sock_handler(const struct ctx *c, union epoll_ref ref); > int icmp_tap_handler(const struct ctx *c, uint8_t pif, sa_family_t af, > - const void *saddr, const void *daddr, > + const void *saddr, const void *daddr, uint8_t ttl, > const struct pool *p, const struct timespec *now); > void icmp_init(void); > > diff --git a/packet.h b/packet.h > index c94780a..e84e123 100644 > --- a/packet.h > +++ b/packet.h > @@ -11,6 +11,8 @@ > /* Maximum size of a single packet stored in pool, including headers */ > #define PACKET_MAX_LEN ((size_t)UINT16_MAX) > > +#define DEFAULT_TTL 64 > + > /** > * struct pool - Generic pool of packets stored in a buffer > * @buf: Buffer storing packet descriptors, > 
diff --git a/tap.c b/tap.c > index 3a6fcbe..c7d82ca 100644 > --- a/tap.c > +++ b/tap.c > @@ -563,6 +563,7 @@ PACKET_POOL_DECL(pool_l4, UIO_MAXIOV, pkt_buf); > * @dest: Destination port > * @saddr: Source address > * @daddr: Destination address > + * @ttl: Time to live > * @msg: Array of messages that can be handled in a single call > */ > static struct tap4_l4_t { > @@ -574,6 +575,8 @@ static struct tap4_l4_t { > struct in_addr saddr; > struct in_addr daddr; > > + uint8_t ttl; > + > struct pool_l4_t p; > } tap4_l4[TAP_SEQS /* Arbitrary: TAP_MSGS in theory, so limit in users */]; > > @@ -586,6 +589,7 @@ static struct tap4_l4_t { > * @dest: Destination port > * @saddr: Source address > * @daddr: Destination address > + * @hop_limit: Hop limiit > * @msg: Array of messages that can be handled in a single call > */ > static struct tap6_l4_t { > @@ -598,6 +602,8 @@ static struct tap6_l4_t { > struct in6_addr saddr; > struct in6_addr daddr; > > + uint8_t hop_limit; > + > struct pool_l4_t p; > } tap6_l4[TAP_SEQS /* Arbitrary: TAP_MSGS in theory, so limit in users */]; > > @@ -761,7 +767,7 @@ resume: > packet_add(pkt, l4len, l4h); > icmp_tap_handler(c, PIF_TAP, AF_INET, > &iph->saddr, &iph->daddr, > - pkt, now); > + iph->ttl, pkt, now); > continue; > } > > @@ -786,7 +792,8 @@ resume: > #define L4_MATCH(iph, uh, seq) \ > ((seq)->protocol == (iph)->protocol && \ > (seq)->source == (uh)->source && (seq)->dest == (uh)->dest && \ > - (seq)->saddr.s_addr == (iph)->saddr && (seq)->daddr.s_addr == (iph)->daddr) > + (seq)->saddr.s_addr == (iph)->saddr && \ > + (seq)->daddr.s_addr == (iph)->daddr && (seq)->ttl == (iph)->ttl) > > #define L4_SET(iph, uh, seq) \ > do { \ > @@ -795,6 +802,7 @@ resume: > (seq)->dest = (uh)->dest; \ > (seq)->saddr.s_addr = (iph)->saddr; \ > (seq)->daddr.s_addr = (iph)->daddr; \ > + (seq)->ttl = (iph)->ttl; \ > } while (0) > > if (seq && L4_MATCH(iph, uh, seq) && seq->p.count < UIO_MAXIOV) > @@ -843,7 +851,7 @@ append: > for (k = 0; k < p->count; ) > k += 
udp_tap_handler(c, PIF_TAP, AF_INET, > &seq->saddr, &seq->daddr, > - p, k, now); > + seq->ttl, p, k, now); > } > } > > @@ -878,6 +886,7 @@ resume: > const struct ethhdr *eh; > const struct udphdr *uh; > struct ipv6hdr *ip6h; > + uint8_t hop_limit; > uint8_t proto; > char *l4h; > > @@ -891,7 +900,7 @@ resume: > > saddr = &ip6h->saddr; > daddr = &ip6h->daddr; > - > + hop_limit = ip6h->hop_limit; > plen = ntohs(ip6h->payload_len); > if (plen != check) > continue; > @@ -938,7 +947,7 @@ resume: > tap_packet_debug(NULL, ip6h, NULL, proto, NULL, 1); > > icmp_tap_handler(c, PIF_TAP, AF_INET6, > - saddr, daddr, pkt, now); > + saddr, daddr, hop_limit, pkt, now); > continue; > } > > @@ -966,7 +975,8 @@ resume: > (seq)->dest == (uh)->dest && \ > (seq)->flow_lbl == ip6_get_flow_lbl(ip6h) && \ > IN6_ARE_ADDR_EQUAL(&(seq)->saddr, saddr) && \ > - IN6_ARE_ADDR_EQUAL(&(seq)->daddr, daddr)) > + IN6_ARE_ADDR_EQUAL(&(seq)->daddr, daddr) && \ > + (seq)->hop_limit == ip6h->hop_limit) > > #define L4_SET(ip6h, proto, uh, seq) \ > do { \ > @@ -976,6 +986,7 @@ resume: > (seq)->flow_lbl = ip6_get_flow_lbl(ip6h); \ > (seq)->saddr = *saddr; \ > (seq)->daddr = *daddr; \ > + (seq)->hop_limit = ip6h->hop_limit; \ > } while (0) > > if (seq && L4_MATCH(ip6h, proto, uh, seq) && > @@ -1026,7 +1037,7 @@ append: > for (k = 0; k < p->count; ) > k += udp_tap_handler(c, PIF_TAP, AF_INET6, > &seq->saddr, &seq->daddr, > - p, k, now); > + seq->hop_limit, p, k, now); > } > } > > diff --git a/tcp.c b/tcp.c > index fa1d885..5751d21 100644 > --- a/tcp.c > +++ b/tcp.c > @@ -1446,8 +1446,8 @@ static void tcp_conn_from_tap(const struct ctx *c, sa_family_t af, > if (!(flow = flow_alloc())) > return; > > - ini = flow_initiate_af(flow, PIF_TAP, > - af, saddr, srcport, daddr, dstport); > + ini = flow_initiate_af(flow, PIF_TAP, af, saddr, srcport, > + daddr, dstport, DEFAULT_TTL); > > if (!(tgt = flow_target(c, flow, IPPROTO_TCP))) > goto cancel; > @@ -1977,7 +1977,7 @@ int tcp_tap_handler(const struct ctx *c, uint8_t 
pif, sa_family_t af, > opts = packet_get(p, idx, sizeof(*th), optlen, NULL); > > sidx = flow_lookup_af(c, IPPROTO_TCP, PIF_TAP, af, saddr, daddr, > - ntohs(th->source), ntohs(th->dest)); > + ntohs(th->source), ntohs(th->dest), DEFAULT_TTL); > flow = flow_at_sidx(sidx); > > /* New connection from tap */ > diff --git a/udp.c b/udp.c > index 0c223b4..8a2c593 100644 > --- a/udp.c > +++ b/udp.c > @@ -847,6 +847,7 @@ fail: > * @af: Address family, AF_INET or AF_INET6 > * @saddr: Source address > * @daddr: Destination address > + * @ttl: TTL for packets to be sent in this call > * @p: Pool of UDP packets, with UDP headers > * @idx: Index of first packet to process > * @now: Current timestamp > @@ -857,7 +858,8 @@ fail: > */ > int udp_tap_handler(const struct ctx *c, uint8_t pif, > sa_family_t af, const void *saddr, const void *daddr, > - const struct pool *p, int idx, const struct timespec *now) > + uint8_t ttl, const struct pool *p, int idx, > + const struct timespec *now) > { > const struct flowside *toside; > struct mmsghdr mm[UIO_MAXIOV]; > @@ -883,7 +885,9 @@ int udp_tap_handler(const struct ctx *c, uint8_t pif, > src = ntohs(uh->source); > dst = ntohs(uh->dest); > > - tosidx = udp_flow_from_tap(c, pif, af, saddr, daddr, src, dst, now); > + tosidx = udp_flow_from_tap(c, pif, af, saddr, daddr, > + src, dst, ttl, now); > + > if (!(uflow = udp_at_sidx(tosidx))) { > char sstr[INET6_ADDRSTRLEN], dstr[INET6_ADDRSTRLEN]; > > diff --git a/udp.h b/udp.h > index de2df6d..041fad4 100644 > --- a/udp.h > +++ b/udp.h > @@ -15,7 +15,8 @@ void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref, > uint32_t events, const struct timespec *now); > int udp_tap_handler(const struct ctx *c, uint8_t pif, > sa_family_t af, const void *saddr, const void *daddr, > - const struct pool *p, int idx, const struct timespec *now); > + uint8_t ttl, const struct pool *p, int idx, > + const struct timespec *now); > int udp_sock_init(const struct ctx *c, int ns, const union inany_addr 
*addr, > const char *ifname, in_port_t port); > int udp_init(struct ctx *c); > diff --git a/udp_flow.c b/udp_flow.c > index bf4b896..db5f709 100644 > --- a/udp_flow.c > +++ b/udp_flow.c > @@ -236,6 +236,7 @@ flow_sidx_t udp_flow_from_sock(const struct ctx *c, union epoll_ref ref, > * @daddr: Destination address guest side > * @srcport: Source port on guest side > * @dstport: Destination port on guest side > + * @ttl: TTL for this flow > * > * Return: sidx for the destination side of the flow for this packet, or > * FLOW_SIDX_NONE if we couldn't find or create a flow. > @@ -244,7 +245,7 @@ flow_sidx_t udp_flow_from_tap(const struct ctx *c, > uint8_t pif, sa_family_t af, > const void *saddr, const void *daddr, > in_port_t srcport, in_port_t dstport, > - const struct timespec *now) > + uint8_t ttl, const struct timespec *now) > { > const struct flowside *ini; > struct udp_flow *uflow; > @@ -254,7 +255,7 @@ flow_sidx_t udp_flow_from_tap(const struct ctx *c, > ASSERT(pif == PIF_TAP); > > sidx = flow_lookup_af(c, IPPROTO_UDP, pif, af, saddr, daddr, > - srcport, dstport); > + srcport, dstport, ttl); > if ((uflow = udp_at_sidx(sidx))) { > uflow->ts = now->tv_sec; > return flow_sidx_opposite(sidx); > @@ -271,7 +272,7 @@ flow_sidx_t udp_flow_from_tap(const struct ctx *c, > } > > ini = flow_initiate_af(flow, PIF_TAP, af, saddr, srcport, > - daddr, dstport); > + daddr, dstport, ttl); > > if (inany_is_unspecified(&ini->eaddr) || ini->eport == 0 || > inany_is_unspecified(&ini->oaddr) || ini->oport == 0) { > diff --git a/udp_flow.h b/udp_flow.h > index 9a1b059..7b40594 100644 > --- a/udp_flow.h > +++ b/udp_flow.h > @@ -31,7 +31,7 @@ flow_sidx_t udp_flow_from_tap(const struct ctx *c, > uint8_t pif, sa_family_t af, > const void *saddr, const void *daddr, > in_port_t srcport, in_port_t dstport, > - const struct timespec *now); > + uint8_t ttl, const struct timespec *now); > void udp_flow_close(const struct ctx *c, struct udp_flow *uflow); > bool udp_flow_defer(const struct udp_flow 
*uflow); > bool udp_flow_timer(const struct ctx *c, struct udp_flow *uflow, -- David Gibson (he or they) | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you, not the other way | around. http://www.ozlabs.org/~dgibson