On Wed, Jun 26, 2024 at 12:23:59AM +0200, Stefano Brivio wrote: > On Fri, 14 Jun 2024 16:13:24 +1000 > David Gibson wrote: > > > Require the address and port information for the target (non > > initiating) side to be populated when a flow enters TGT state. > > Implement that for TCP and ICMP. For now this leaves some information > > redundantly recorded in both generic and type specific fields. We'll > > fix that in later patches. > > > > For TCP we now use the information from the flow to construct the > > destination socket address in both tcp_conn_from_tap() and > > tcp_splice_connect(). > > > > Signed-off-by: David Gibson > > --- > > flow.c | 38 ++++++++++++++++++------ > > flow_table.h | 5 +++- > > icmp.c | 3 +- > > inany.h | 30 ++++++++++++++++++- > > tcp.c | 83 ++++++++++++++++++++++++++++------------------------ > > tcp_splice.c | 45 +++++++++++----------------- > > 6 files changed, 126 insertions(+), 78 deletions(-) > > > > diff --git a/flow.c b/flow.c > > index 1819111d..39e046bf 100644 > > --- a/flow.c > > +++ b/flow.c > > @@ -165,8 +165,10 @@ void flow_log_(const struct flow_common *f, int pri, const char *fmt, ...) > > */ > > static void flow_set_state(struct flow_common *f, enum flow_state state) > > { > > - char estr[INANY_ADDRSTRLEN], fstr[INANY_ADDRSTRLEN]; > > + char estr0[INANY_ADDRSTRLEN], fstr0[INANY_ADDRSTRLEN]; > > + char estr1[INANY_ADDRSTRLEN], fstr1[INANY_ADDRSTRLEN]; > > const struct flowside *ini = &f->side[INISIDE]; > > + const struct flowside *tgt = &f->side[TGTSIDE]; > > uint8_t oldstate = f->state; > > > > ASSERT(state < FLOW_NUM_STATES); > > @@ -177,19 +179,24 @@ static void flow_set_state(struct flow_common *f, enum flow_state state) > > FLOW_STATE(f)); > > > > if (MAX(state, oldstate) >= FLOW_STATE_TGT) > > - flow_log_(f, LOG_DEBUG, "%s [%s]:%hu -> [%s]:%hu => %s", > > + flow_log_(f, LOG_DEBUG, > > + "%s [%s]:%hu -> [%s]:%hu => %s [%s]:%hu -> [%s]:%hu", > > pif_name(f->pif[INISIDE]), > > - inany_ntop(&ini->eaddr, estr, sizeof(estr)), > > + inany_ntop(&ini->eaddr, estr0, sizeof(estr0)), > > ini->eport, > > - inany_ntop(&ini->faddr, fstr, sizeof(fstr)), > > + inany_ntop(&ini->faddr, fstr0, sizeof(fstr0)), > > ini->fport, > > - pif_name(f->pif[TGTSIDE])); > > + pif_name(f->pif[TGTSIDE]), > > + inany_ntop(&tgt->faddr, fstr1, sizeof(fstr1)), > > + tgt->fport, > > + inany_ntop(&tgt->eaddr, estr1, sizeof(estr1)), > > + tgt->eport); > > else if (MAX(state, oldstate) >= FLOW_STATE_INI) > > flow_log_(f, LOG_DEBUG, "%s [%s]:%hu -> [%s]:%hu => ?", > > pif_name(f->pif[INISIDE]), > > - inany_ntop(&ini->eaddr, estr, sizeof(estr)), > > + inany_ntop(&ini->eaddr, estr0, sizeof(estr0)), > > ini->eport, > > - inany_ntop(&ini->faddr, fstr, sizeof(fstr)), > > + inany_ntop(&ini->faddr, fstr0, sizeof(fstr0)), > > ini->fport); > > } > > > > @@ -261,21 +268,34 @@ const struct flowside *flow_initiate_sa(union flow *flow, uint8_t pif, > > } > > > > /** > > - * flow_target() - Move flow to TGT, setting TGTSIDE details > > + * flow_target_af() - Move flow to TGT, setting TGTSIDE details > > * @flow: Flow to change state > > * @pif: pif of the target side > > + * @af: Address family for @eaddr and @faddr > > + * @saddr: Source address (pointer to in_addr or in6_addr) > > + * @sport: Endpoint port > > + * @daddr: Destination address (pointer to in_addr or in6_addr) > > + * @dport: Destination port > > + * > > + * Return: pointer to the target flowside information > > */ > > -void flow_target(union flow *flow, uint8_t pif) > > +const struct flowside *flow_target_af(union flow *flow, uint8_t pif, > > + sa_family_t af, > > + const void *saddr, in_port_t sport, > > + const void *daddr, in_port_t dport) > > { > > struct flow_common *f = &flow->f; > > + struct flowside *tgt = &f->side[TGTSIDE]; > > > > ASSERT(pif != PIF_NONE); > > ASSERT(flow_new_entry == flow && f->state == FLOW_STATE_INI); > > ASSERT(f->type == FLOW_TYPE_NONE); > > ASSERT(f->pif[INISIDE] != PIF_NONE && f->pif[TGTSIDE] == PIF_NONE); > > > > + flowside_from_af(tgt, af, daddr, dport, saddr, sport); > > f->pif[TGTSIDE] = pif; > > flow_set_state(f, FLOW_STATE_TGT); > > + return tgt; > > } > > > > /** > > diff --git a/flow_table.h b/flow_table.h > > index 2e912532..7af32c6a 100644 > > --- a/flow_table.h > > +++ b/flow_table.h > > @@ -114,7 +114,10 @@ const struct flowside *flow_initiate_af(union flow *flow, uint8_t pif, > > const struct flowside *flow_initiate_sa(union flow *flow, uint8_t pif, > > const union sockaddr_inany *ssa, > > in_port_t dport); > > -void flow_target(union flow *flow, uint8_t pif); > > +const struct flowside *flow_target_af(union flow *flow, uint8_t pif, > > + sa_family_t af, > > + const void *saddr, in_port_t sport, > > + const void *daddr, in_port_t dport); > > > > union flow *flow_set_type(union flow *flow, enum flow_type type); > > #define FLOW_SET_TYPE(flow_, t_, var_) (&flow_set_type((flow_), (t_))->var_) > > diff --git a/icmp.c b/icmp.c > > index 38d1deeb..c86c54c3 100644 > > --- a/icmp.c > > +++ b/icmp.c > > @@ -167,7 +167,8 @@ static struct icmp_ping_flow *icmp_ping_new(const struct ctx *c, > > return NULL; > > > > flow_initiate_af(flow, PIF_TAP, af, saddr, id, daddr, id); > > - flow_target(flow, PIF_HOST); > > + /* FIXME: Record outbound source address when known */ > > + flow_target_af(flow, PIF_HOST, af, NULL, 0, daddr, 0); > > pingf = FLOW_SET_TYPE(flow, flowtype, ping); > > > > pingf->seq = -1; > > diff --git a/inany.h b/inany.h > > index 47b66fa9..2bf3becf 100644 > > --- a/inany.h > > +++ b/inany.h > > @@ -187,7 +187,6 @@ static inline bool inany_is_unspecified(const union inany_addr *a) > > * > > * Return: true if @a is in fe80::/10 (IPv6 link local unicast) > > */ > > -/* cppcheck-suppress unusedFunction */ > > static inline bool inany_is_linklocal6(const union inany_addr *a) > > { > > return IN6_IS_ADDR_LINKLOCAL(&a->a6); > > @@ -273,4 +272,33 @@ static inline void inany_siphash_feed(struct siphash_state *state, > > > > const char *inany_ntop(const union inany_addr *src, char *dst, socklen_t size); > > > > +/** sockaddr_from_inany() - Construct a sockaddr from an inany > > + * @sa: Pointer to sockaddr to fill in > > + * @sl: Updated to relevant of length of initialised @sa > > + * @addr: IPv[46] address > > + * @port: Port (host byte order) > > + * @scope: Scope ID (ignored for IPv4 addresses) > > Perhaps worth specifying that it's the interface index for IPv6 > link-local addresses only (even if it's not the case as of this patch, > more below): > > * @scope: Scope ID for link-local IPv6 addresses only: interface index Right.. as it happens, I'm reworking this in a way that will obsolete this parameter anyway. > > + */ > > +static inline void sockaddr_from_inany(union sockaddr_inany *sa, socklen_t *sl, > > + const union inany_addr *addr, > > + in_port_t port, uint32_t scope) > > +{ > > + const struct in_addr *v4 = inany_v4(addr); > > + > > + if (v4) { > > + sa->sa_family = AF_INET; > > + sa->sa4.sin_addr = *v4; > > + sa->sa4.sin_port = htons(port); > > + memset(&sa->sa4.sin_zero, 0, sizeof(sa->sa4.sin_zero)); > > + *sl = sizeof(sa->sa4); > > + } else { > > + sa->sa_family = AF_INET6; > > + sa->sa6.sin6_addr = addr->a6; > > + sa->sa6.sin6_port = htons(port); > > + sa->sa6.sin6_scope_id = scope; > > + sa->sa6.sin6_flowinfo = 0; > > + *sl = sizeof(sa->sa6); > > + } > > +} > > + > > #endif /* INANY_H */ > > diff --git a/tcp.c b/tcp.c > > index 07a2eb1c..c6cd0c72 100644 > > --- a/tcp.c > > +++ b/tcp.c > > @@ -1610,18 +1610,10 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af, > > { > > in_port_t srcport = ntohs(th->source); > > in_port_t dstport = ntohs(th->dest); > > - struct sockaddr_in addr4 = { > > - .sin_family = AF_INET, > > - .sin_port = htons(dstport), > > - .sin_addr = *(struct in_addr *)daddr, > > - }; > > - struct sockaddr_in6 addr6 = { > > - .sin6_family = AF_INET6, > > - .sin6_port = htons(dstport), > > - .sin6_addr = *(struct in6_addr *)daddr, > > - }; > > - const struct sockaddr *sa; > > + const struct flowside *ini, *tgt; > > struct tcp_tap_conn *conn; > > + union inany_addr dstaddr; /* FIXME: Avoid bulky temporary */ > > + union sockaddr_inany sa; > > union flow *flow; > > int s = -1, mss; > > socklen_t sl; > > @@ -1629,7 +1621,8 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af, > > if (!(flow = flow_alloc())) > > return; > > > > - flow_initiate_af(flow, PIF_TAP, af, saddr, srcport, daddr, dstport); > > + ini = flow_initiate_af(flow, PIF_TAP, > > + af, saddr, srcport, daddr, dstport); > > > > if (af == AF_INET) { > > if (IN4_IS_ADDR_UNSPECIFIED(saddr) || > > @@ -1661,19 +1654,28 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af, > > dstport); > > goto cancel; > > } > > + } else { > > + ASSERT(0); > > } > > > > if ((s = tcp_conn_sock(c, af)) < 0) > > goto cancel; > > > > + dstaddr = ini->faddr; > > + > > if (!c->no_map_gw) { > > - if (af == AF_INET && IN4_ARE_ADDR_EQUAL(daddr, &c->ip4.gw)) > > - addr4.sin_addr.s_addr = htonl(INADDR_LOOPBACK); > > - if (af == AF_INET6 && IN6_ARE_ADDR_EQUAL(daddr, &c->ip6.gw)) > > - addr6.sin6_addr = in6addr_loopback; > > + if (inany_equals4(&dstaddr, &c->ip4.gw)) > > + dstaddr = inany_loopback4; > > + else if (inany_equals6(&dstaddr, &c->ip6.gw)) > > + dstaddr = inany_loopback6; > > } > > > > - if (af == AF_INET6 && IN6_IS_ADDR_LINKLOCAL(&addr6.sin6_addr)) { > > + /* FIXME: Record outbound source address when known */ > > + tgt = flow_target_af(flow, PIF_HOST, AF_INET6, > > + NULL, 0, /* Kernel decides source address */ > > + &dstaddr, dstport); > > + > > + if (inany_is_linklocal6(&tgt->eaddr)) { > > struct sockaddr_in6 addr6_ll = { > > .sin6_family = AF_INET6, > > .sin6_addr = c->ip6.addr_ll, > > @@ -1681,9 +1683,10 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af, > > }; > > if (bind(s, (struct sockaddr *)&addr6_ll, sizeof(addr6_ll))) > > goto cancel; > > + } else if (!inany_is_loopback(&tgt->eaddr)) { > > + tcp_bind_outbound(c, s, af); > > } > > > > - flow_target(flow, PIF_HOST); > > conn = FLOW_SET_TYPE(flow, FLOW_TCP, tcp); > > conn->sock = s; > > conn->timer = -1; > > @@ -1706,14 +1709,6 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af, > > > > inany_from_af(&conn->faddr, af, daddr); > > > > - if (af == AF_INET) { > > - sa = (struct sockaddr *)&addr4; > > - sl = sizeof(addr4); > > - } else { > > - sa = (struct sockaddr *)&addr6; > > - sl = sizeof(addr6); > > - } > > - > > conn->fport = dstport; > > conn->eport = srcport; > > > > @@ -1726,19 +1721,16 @@ static void tcp_conn_from_tap(struct ctx *c, sa_family_t af, > > > > tcp_hash_insert(c, conn); > > > > - if (!bind(s, sa, sl)) { > > + sockaddr_from_inany(&sa, &sl, &tgt->eaddr, tgt->eport, c->ifi6); > > ...so, I never tried to use a non-zero scope identifier for, say, > global unicast IPv6 addresses. Perhaps it's harmless. But I still think > that, logically, we should pass it as zero in that case. Right, that will be changed by the rework I have in mind anyway. > > + > > + if (!bind(s, &sa.sa, sl)) { > > tcp_rst(c, conn); /* Nobody is listening then */ > > goto cancel; > > } > > if (errno != EADDRNOTAVAIL && errno != EACCES) > > conn_flag(c, conn, LOCAL); > > This will have a semi-trivial conflict with 54a9d3801b95 ("tcp: Don't > rely on bind() to fail to decide that connection target is valid"). Yeah, I already did that rebase, it was surprisingly fiddly. > > > > > - if ((af == AF_INET && !IN4_IS_ADDR_LOOPBACK(&addr4.sin_addr)) || > > - (af == AF_INET6 && !IN6_IS_ADDR_LOOPBACK(&addr6.sin6_addr) && > > - !IN6_IS_ADDR_LINKLOCAL(&addr6.sin6_addr))) > > - tcp_bind_outbound(c, s, af); > > - > > - if (connect(s, sa, sl)) { > > + if (connect(s, &sa.sa, sl)) { > > if (errno != EINPROGRESS) { > > tcp_rst(c, conn); > > goto cancel; > > @@ -2236,9 +2228,25 @@ static void tcp_tap_conn_from_sock(struct ctx *c, in_port_t dstport, > > const union sockaddr_inany *sa, > > const struct timespec *now) > > { > > + union inany_addr saddr, daddr; /* FIXME: avoid bulky temporaries */ > > struct tcp_tap_conn *conn; > > + in_port_t srcport; > > + > > + inany_from_sockaddr(&saddr, &srcport, sa); > > + tcp_snat_inbound(c, &saddr); > > > > - flow_target(flow, PIF_TAP); > > + if (inany_v4(&saddr)) { > > + daddr = inany_from_v4(c->ip4.addr_seen); > > + } else { > > + if (inany_is_linklocal6(&saddr)) > > + daddr.a6 = c->ip6.addr_ll_seen; > > + else > > + daddr.a6 = c->ip6.addr_seen; > > + } > > + dstport += c->tcp.fwd_in.delta[dstport]; > > + > > + flow_target_af(flow, PIF_TAP, AF_INET6, > > + &saddr, srcport, &daddr, dstport); > > conn = FLOW_SET_TYPE(flow, FLOW_TCP, tcp); > > > > conn->sock = s; > > @@ -2246,10 +2254,9 @@ static void tcp_tap_conn_from_sock(struct ctx *c, in_port_t dstport, > > conn->ws_to_tap = conn->ws_from_tap = 0; > > conn_event(c, conn, SOCK_ACCEPTED); > > > > - inany_from_sockaddr(&conn->faddr, &conn->fport, sa); > > - conn->eport = dstport + c->tcp.fwd_in.delta[dstport]; > > - > > - tcp_snat_inbound(c, &conn->faddr); > > + conn->faddr = saddr; > > + conn->fport = srcport; > > + conn->eport = dstport; > > > > tcp_seq_init(c, conn, now); > > tcp_hash_insert(c, conn); > > diff --git a/tcp_splice.c b/tcp_splice.c > > index 5a406c63..bcb42c97 100644 > > --- a/tcp_splice.c > > +++ b/tcp_splice.c > > @@ -320,31 +320,20 @@ static int tcp_splice_connect_finish(const struct ctx *c, > > * tcp_splice_connect() - Create and connect socket for new spliced connection > > * @c: Execution context > > * @conn: Connection pointer > > - * @af: Address family > > - * @pif: pif on which to create socket > > - * @port: Destination port, host order > > * > > * Return: 0 for connect() succeeded or in progress, negative value on error > > */ > > -static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn, > > - sa_family_t af, uint8_t pif, in_port_t port) > > +static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn) > > { > > - struct sockaddr_in6 addr6 = { > > - .sin6_family = AF_INET6, > > - .sin6_port = htons(port), > > - .sin6_addr = IN6ADDR_LOOPBACK_INIT, > > - }; > > - struct sockaddr_in addr4 = { > > - .sin_family = AF_INET, > > - .sin_port = htons(port), > > - .sin_addr = IN4ADDR_LOOPBACK_INIT, > > - }; > > - const struct sockaddr *sa; > > + const struct flowside *tgt = &conn->f.side[TGTSIDE]; > > + sa_family_t af = inany_v4(&tgt->eaddr) ? AF_INET : AF_INET6; > > + uint8_t tgtpif = conn->f.pif[TGTSIDE]; > > + union sockaddr_inany sa; > > socklen_t sl; > > > > - if (pif == PIF_HOST) > > + if (tgtpif == PIF_HOST) > > conn->s[1] = tcp_conn_sock(c, af); > > - else if (pif == PIF_SPLICE) > > + else if (tgtpif == PIF_SPLICE) > > conn->s[1] = tcp_conn_sock_ns(c, af); > > else > > ASSERT(0); > > @@ -358,15 +347,9 @@ static int tcp_splice_connect(const struct ctx *c, struct tcp_splice_conn *conn, > > conn->s[1]); > > } > > > > - if (CONN_V6(conn)) { > > - sa = (struct sockaddr *)&addr6; > > - sl = sizeof(addr6); > > - } else { > > - sa = (struct sockaddr *)&addr4; > > - sl = sizeof(addr4); > > - } > > + sockaddr_from_inany(&sa, &sl, &tgt->eaddr, tgt->eport, 0); > > > > - if (connect(conn->s[1], sa, sl)) { > > + if (connect(conn->s[1], &sa.sa, sl)) { > > if (errno != EINPROGRESS) { > > flow_trace(conn, "Couldn't connect socket for splice: %s", > > strerror(errno)); > > @@ -471,7 +454,13 @@ bool tcp_splice_conn_from_sock(const struct ctx *c, > > return false; > > } > > > > - flow_target(flow, tgtpif); > > + /* FIXME: Record outbound source address when known */ > > + if (af == AF_INET) > > + flow_target_af(flow, tgtpif, AF_INET, > > + NULL, 0, &in4addr_loopback, dstport); > > + else > > + flow_target_af(flow, tgtpif, AF_INET6, > > + NULL, 0, &in6addr_loopback, dstport); > > conn = FLOW_SET_TYPE(flow, FLOW_TCP_SPLICE, tcp_splice); > > > > conn->flags = af == AF_INET ? 0 : SPLICE_V6; > > @@ -483,7 +472,7 @@ bool tcp_splice_conn_from_sock(const struct ctx *c, > > if (setsockopt(s0, SOL_TCP, TCP_QUICKACK, &((int){ 1 }), sizeof(int))) > > flow_trace(conn, "failed to set TCP_QUICKACK on %i", s0); > > > > - if (tcp_splice_connect(c, conn, af, tgtpif, dstport)) > > + if (tcp_splice_connect(c, conn)) > > conn_flag(c, conn, CLOSING); > > > > FLOW_ACTIVATE(conn); > -- David Gibson (he or they) | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you, not the other way | around. http://www.ozlabs.org/~dgibson