On Wed, Jul 10, 2024 at 12:32:02AM +0200, Stefano Brivio wrote: > Nits only, here: > > On Fri, 5 Jul 2024 12:07:17 +1000 > David Gibson wrote: > > > This implements the first steps of tracking UDP packets with the flow table > > rather than it's own (buggy) set of port maps. Specifically we create flow > > its Oops, fixed. > > table entries for datagrams received from a socket (PIF_HOST or > > PIF_SPLICE). > > > > When splitting datagrams from sockets into batches, we group by the flow > > as well as splicesrc. This may result in smaller batches, but makes things > > easier down the line. We can re-optimise this later if necessary. For now > > we don't do anything else with the flow, not even match reply packets to > > the same flow. > > > > Signed-off-by: David Gibson > > --- > > Makefile | 2 +- > > flow.c | 31 ++++++++++ > > flow.h | 4 ++ > > flow_table.h | 14 +++++ > > udp.c | 169 +++++++++++++++++++++++++++++++++++++++++++++++++-- > > udp_flow.h | 25 ++++++++ > > 6 files changed, 240 insertions(+), 5 deletions(-) > > create mode 100644 udp_flow.h > > > > diff --git a/Makefile b/Makefile > > index 09fc461d..92cbd5a6 100644 > > --- a/Makefile > > +++ b/Makefile > > @@ -57,7 +57,7 @@ PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h flow.h fwd.h \ > > flow_table.h icmp.h icmp_flow.h inany.h iov.h ip.h isolation.h \ > > lineread.h log.h ndp.h netlink.h packet.h passt.h pasta.h pcap.h pif.h \ > > siphash.h tap.h tcp.h tcp_buf.h tcp_conn.h tcp_internal.h tcp_splice.h \ > > - udp.h util.h > > + udp.h udp_flow.h util.h > > HEADERS = $(PASST_HEADERS) seccomp.h > > > > C := \#include \nstruct tcp_info x = { .tcpi_snd_wnd = 0 }; > > diff --git a/flow.c b/flow.c > > index 218033ae..0cb9495b 100644 > > --- a/flow.c > > +++ b/flow.c > > @@ -37,6 +37,7 @@ const char *flow_type_str[] = { > > [FLOW_TCP_SPLICE] = "TCP connection (spliced)", > > [FLOW_PING4] = "ICMP ping sequence", > > [FLOW_PING6] = "ICMPv6 ping sequence", > > + [FLOW_UDP] = "UDP flow", > > }; > > 
static_assert(ARRAY_SIZE(flow_type_str) == FLOW_NUM_TYPES, > > "flow_type_str[] doesn't match enum flow_type"); > > @@ -46,6 +47,7 @@ const uint8_t flow_proto[] = { > > [FLOW_TCP_SPLICE] = IPPROTO_TCP, > > [FLOW_PING4] = IPPROTO_ICMP, > > [FLOW_PING6] = IPPROTO_ICMPV6, > > + [FLOW_UDP] = IPPROTO_UDP, > > }; > > static_assert(ARRAY_SIZE(flow_proto) == FLOW_NUM_TYPES, > > "flow_proto[] doesn't match enum flow_type"); > > @@ -700,6 +702,31 @@ flow_sidx_t flow_lookup_af(const struct ctx *c, > > return flowside_lookup(c, proto, pif, &fside); > > } > > > > +/** > > + * flow_lookup_sa() - Look up a flow given and endpoint socket address > > s/and/an/ Fixed. > > + * @c: Execution context > > + * @proto: Protocol of the flow (IP L4 protocol number) > > + * @pif: Interface of the flow > > + * @esa: Socket address of the endpoint > > + * @fport: Forwarding port number > > + * > > + * Return: sidx of the matching flow & side, FLOW_SIDX_NONE if not found > > + */ > > +flow_sidx_t flow_lookup_sa(const struct ctx *c, uint8_t proto, uint8_t pif, > > + const void *esa, in_port_t fport) > > +{ > > + struct flowside fside = { > > And the "f" in "fside" stands for "forwarding"... I don't have any > quick fix in mind, and it's _kind of_ clear anyway, but this makes me > doubt a bit about the "forwarding" / "endpoint" choice of words. Heh, no, here "fside" is simply short for "flowside". Every flowside has both forwarding and endpoint elements. So it is confusing, but for a different reason. I need to find a different convention for naming struct flowside variables. I'd say 'side', but sometimes that's used for the 1-bit integer indicating which side in a flow. Hrm.. now that pif has been removed from here, maybe I could rename struct flowside back to 'flowaddrs' or 'sideaddrs' perhaps? 
> > + .fport = fport, > > + }; > > + > > + inany_from_sockaddr(&fside.eaddr, &fside.eport, esa); > > + if (inany_v4(&fside.eaddr)) > > + fside.faddr = inany_any4; > > + else > > + fside.faddr = inany_any6; > > The usual extra newline here? Done. > > + return flowside_lookup(c, proto, pif, &fside); > > +} > > + > > /** > > * flow_defer_handler() - Handler for per-flow deferred and timed tasks > > * @c: Execution context > > @@ -779,6 +806,10 @@ void flow_defer_handler(const struct ctx *c, const struct timespec *now) > > if (timer) > > closed = icmp_ping_timer(c, &flow->ping, now); > > break; > > + case FLOW_UDP: > > + if (timer) > > + closed = udp_flow_timer(c, &flow->udp, now); > > + break; > > default: > > /* Assume other flow types don't need any handling */ > > ; > > diff --git a/flow.h b/flow.h > > index e27f99be..3752e5ee 100644 > > --- a/flow.h > > +++ b/flow.h > > @@ -115,6 +115,8 @@ enum flow_type { > > FLOW_PING4, > > /* ICMPv6 echo requests from guest to host and matching replies back */ > > FLOW_PING6, > > + /* UDP pseudo-connection */ > > + FLOW_UDP, > > > > FLOW_NUM_TYPES, > > }; > > @@ -238,6 +240,8 @@ flow_sidx_t flow_lookup_af(const struct ctx *c, > > uint8_t proto, uint8_t pif, sa_family_t af, > > const void *eaddr, const void *faddr, > > in_port_t eport, in_port_t fport); > > +flow_sidx_t flow_lookup_sa(const struct ctx *c, uint8_t proto, uint8_t pif, > > + const void *esa, in_port_t fport); > > > > union flow; > > > > diff --git a/flow_table.h b/flow_table.h > > index 457f27b1..3fbc7c8d 100644 > > --- a/flow_table.h > > +++ b/flow_table.h > > @@ -9,6 +9,7 @@ > > > > #include "tcp_conn.h" > > #include "icmp_flow.h" > > +#include "udp_flow.h" > > > > /** > > * struct flow_free_cluster - Information about a cluster of free entries > > @@ -35,6 +36,7 @@ union flow { > > struct tcp_tap_conn tcp; > > struct tcp_splice_conn tcp_splice; > > struct icmp_ping_flow ping; > > + struct udp_flow udp; > > }; > > > > /* Global Flow Table */ > > @@ -78,6 +80,18 
@@ static inline union flow *flow_at_sidx(flow_sidx_t sidx) > > return FLOW(sidx.flow); > > } > > > > +/** flow_sidx_opposite - Get the other side of the same flow > > flow_sidx_opposite() Done. > > + * @sidx: Flow & side index > > + * > > + * Return: sidx for the other side of the same flow as @sidx > > + */ > > +static inline flow_sidx_t flow_sidx_opposite(flow_sidx_t sidx) > > +{ > > + if (!flow_sidx_valid(sidx)) > > + return FLOW_SIDX_NONE; > > Same here with the extra newline. Done. > > + return (flow_sidx_t){.flow = sidx.flow, .side = !sidx.side}; > > +} > > + > > /** flow_sidx_t - Index of one side of a flow from common structure > > * @f: Common flow fields pointer > > * @side: Which side to refer to (0 or 1) > > diff --git a/udp.c b/udp.c > > index 6427b9ce..daf4fe26 100644 > > --- a/udp.c > > +++ b/udp.c > > @@ -15,6 +15,30 @@ > > /** > > * DOC: Theory of Operation > > * > > + * UDP Flows > > + * ========= > > + * > > + * UDP doesn't have true connections, but many protocols use a connection-like > > + * format. The flow is initiated by a client sending a datagram from a port of > > + * its choosing (usually ephemeral) to a specific port (usually well known) on a > > + * server. Both client and server address must be unicast. The server sends > > + * replies using the same addresses & ports with src/dest swapped. > > + * > > + * We track pseudo-connections of this type as flow table entries of type > > + * FLOW_UDP. We store the time of the last traffic on the flow in uflow->ts, > > + * and let the flow expire if there is no traffic for UDP_CONN_TIMEOUT seconds. > > + * > > + * NOTE: This won't handle multicast protocols, or some protocols with different > > + * port usage. We'll need specific logic if we want to handle those. > > + * > > + * "Listening" sockets > > + * =================== > > + * > > + * UDP doesn't use listen(), but we consider long term sockets which are allowed > > + * to create new flows "listening" by analogy with TCP. 
> > + * > > + * Port tracking > > + * ============= > > * > > * For UDP, a reduced version of port-based connection tracking is implemented > > * with two purposes: > > @@ -121,6 +145,7 @@ > > #include "tap.h" > > #include "pcap.h" > > #include "log.h" > > +#include "flow_table.h" > > > > #define UDP_CONN_TIMEOUT 180 /* s, timeout for ephemeral or local bind */ > > #define UDP_MAX_FRAMES 32 /* max # of frames to receive at once */ > > @@ -199,6 +224,7 @@ static struct ethhdr udp6_eth_hdr; > > * @taph: Tap backend specific header > > * @s_in: Source socket address, filled in by recvmmsg() > > * @splicesrc: Source port for splicing, or -1 if not spliceable > > + * @tosidx: sidx for the destination side of this datagram's flow > > */ > > static struct udp_meta_t { > > struct ipv6hdr ip6h; > > @@ -207,6 +233,7 @@ static struct udp_meta_t { > > > > union sockaddr_inany s_in; > > int splicesrc; > > + flow_sidx_t tosidx; > > } > > #ifdef __AVX2__ > > __attribute__ ((aligned(32))) > > @@ -490,6 +517,115 @@ static int udp_mmh_splice_port(union epoll_ref ref, const struct mmsghdr *mmh) > > return -1; > > } > > > > +/** > > + * udp_at_sidx() - Get UDP specific flow at given sidx > > + * @sidx: Flow and side to retrieve > > + * > > + * Return: UDP specific flow at @sidx, or NULL if @sidx is invalid. Asserts if > > + * the flow at @sidx is not FLOW_UDP. 
> > + */ > > +struct udp_flow *udp_at_sidx(flow_sidx_t sidx) > > +{ > > + union flow *flow = flow_at_sidx(sidx); > > + > > + if (!flow) > > + return NULL; > > + > > + ASSERT(flow->f.type == FLOW_UDP); > > + return &flow->udp; > > +} > > + > > +/* > > + * udp_flow_close() - Close and clean up UDP flow > > + * @c: Execution context > > + * @uflow: UDP flow > > + */ > > +static void udp_flow_close(const struct ctx *c, const struct udp_flow *uflow) > > +{ > > + flow_hash_remove(c, FLOW_SIDX(uflow, INISIDE)); > > +} > > + > > +/** > > + * udp_flow_new() - Common setup for a new UDP flow > > + * @c: Execution context > > + * @flow: Initiated flow > > + * @now: Timestamp > > + * > > + * Return: UDP specific flow, if successful, NULL on failure > > + */ > > +static flow_sidx_t udp_flow_new(const struct ctx *c, union flow *flow, > > + const struct timespec *now) > > +{ > > + const struct flowside *ini = &flow->f.side[INISIDE]; > > + struct udp_flow *uflow = NULL; > > + > > + if (!inany_is_unicast(&ini->eaddr) || ini->eport == 0) { > > + flow_dbg(flow, "Invalid endpoint to initiate UDP flow"); > > Do we risk making debug logs unusable if we see multicast traffic? Um.. I'm not sure. > Maybe this could be flow_trace() instead. Sure, why not. -- David Gibson (he or they) | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you, not the other way | around. http://www.ozlabs.org/~dgibson