On Fri, Sep 13, 2024 at 06:20:34PM +0200, Laurent Vivier wrote: > add virtio and vhost-user functions to connect with QEMU. > > $ ./passt --vhost-user > > and > > # qemu-system-x86_64 ... -m 4G \ > -object memory-backend-memfd,id=memfd0,share=on,size=4G \ > -numa node,memdev=memfd0 \ > -chardev socket,id=chr0,path=/tmp/passt_1.socket \ > -netdev vhost-user,id=netdev0,chardev=chr0 \ > -device virtio-net,mac=9a:2b:2c:2d:2e:2f,netdev=netdev0 \ > ... > > Signed-off-by: Laurent Vivier > --- > Makefile | 6 +- > checksum.c | 1 - > conf.c | 23 +- > epoll_type.h | 4 + > isolation.c | 17 +- > packet.c | 11 + > packet.h | 8 +- > passt.1 | 10 +- > passt.c | 26 +- > passt.h | 6 + > pcap.c | 1 - > tap.c | 111 +++++++-- > tap.h | 5 +- > tcp.c | 31 ++- > tcp_buf.c | 8 +- > tcp_internal.h | 3 +- > tcp_vu.c | 647 +++++++++++++++++++++++++++++++++++++++++++++++++ > tcp_vu.h | 12 + > udp.c | 78 +++--- > udp.h | 8 +- > udp_internal.h | 34 +++ > udp_vu.c | 397 ++++++++++++++++++++++++++++++ > udp_vu.h | 13 + > vhost_user.c | 32 +-- > virtio.c | 1 - > vu_common.c | 36 +++ > vu_common.h | 34 +++ > 27 files changed, 1457 insertions(+), 106 deletions(-) > create mode 100644 tcp_vu.c > create mode 100644 tcp_vu.h > create mode 100644 udp_internal.h > create mode 100644 udp_vu.c > create mode 100644 udp_vu.h > create mode 100644 vu_common.c > create mode 100644 vu_common.h > > diff --git a/Makefile b/Makefile > index 0e8ed60a0da1..1e8910dda1f4 100644 > --- a/Makefile > +++ b/Makefile > @@ -54,7 +54,8 @@ FLAGS += -DDUAL_STACK_SOCKETS=$(DUAL_STACK_SOCKETS) > PASST_SRCS = arch.c arp.c checksum.c conf.c dhcp.c dhcpv6.c flow.c fwd.c \ > icmp.c igmp.c inany.c iov.c ip.c isolation.c lineread.c log.c mld.c \ > ndp.c netlink.c packet.c passt.c pasta.c pcap.c pif.c tap.c tcp.c \ > - tcp_buf.c tcp_splice.c udp.c udp_flow.c util.c vhost_user.c virtio.c > + tcp_buf.c tcp_splice.c tcp_vu.c udp.c udp_flow.c udp_vu.c util.c \ > + vhost_user.c virtio.c vu_common.c > QRAP_SRCS = qrap.c > SRCS = $(PASST_SRCS) 
$(QRAP_SRCS) > > @@ -64,7 +65,8 @@ PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h flow.h fwd.h \ > flow_table.h icmp.h icmp_flow.h inany.h iov.h ip.h isolation.h \ > lineread.h log.h ndp.h netlink.h packet.h passt.h pasta.h pcap.h pif.h \ > siphash.h tap.h tcp.h tcp_buf.h tcp_conn.h tcp_internal.h tcp_splice.h \ > - udp.h udp_flow.h util.h vhost_user.h virtio.h > + tcp_vu.h udp.h udp_flow.h udp_internal.h udp_vu.h util.h vhost_user.h \ > + virtio.h vu_common.h > HEADERS = $(PASST_HEADERS) seccomp.h > > C := \#include \nstruct tcp_info x = { .tcpi_snd_wnd = 0 }; > diff --git a/checksum.c b/checksum.c > index 006614fcbb28..aa5b7ae1cb66 100644 > --- a/checksum.c > +++ b/checksum.c > @@ -501,7 +501,6 @@ uint16_t csum(const void *buf, size_t len, uint32_t init) > * > * Return: 16-bit folded, complemented checksum > */ > -/* cppcheck-suppress unusedFunction */ > uint16_t csum_iov(const struct iovec *iov, size_t n, uint32_t init) > { > unsigned int i; > diff --git a/conf.c b/conf.c > index b27588649af3..eb8e1685713a 100644 > --- a/conf.c > +++ b/conf.c > @@ -45,6 +45,7 @@ > #include "lineread.h" > #include "isolation.h" > #include "log.h" > +#include "vhost_user.h" > > /** > * next_chunk - Return the next piece of a string delimited by a character > @@ -769,9 +770,14 @@ static void usage(const char *name, FILE *f, int status) > " default: same interface name as external one\n"); > } else { > fprintf(f, > - " -s, --socket PATH UNIX domain socket path\n" > + " -s, --socket, --socket-path PATH UNIX domain socket path\n" > " default: probe free path starting from " > UNIX_SOCK_PATH "\n", 1); > + fprintf(f, > + " --vhost-user Enable vhost-user mode\n" > + " UNIX domain socket is provided by -s option\n" > + " --print-capabilities print back-end capabilities in JSON format,\n" > + " only meaningful for vhost-user mode\n"); > } > > fprintf(f, > @@ -1291,6 +1297,10 @@ void conf(struct ctx *c, int argc, char **argv) > {"netns-only", no_argument, NULL, 20 }, > 
{"map-host-loopback", required_argument, NULL, 21 }, > {"map-guest-addr", required_argument, NULL, 22 }, > + {"vhost-user", no_argument, NULL, 23 }, > + /* vhost-user backend program convention */ > + {"print-capabilities", no_argument, NULL, 24 }, > + {"socket-path", required_argument, NULL, 's' }, > { 0 }, > }; > const char *logname = (c->mode == MODE_PASTA) ? "pasta" : "passt"; > @@ -1429,7 +1439,6 @@ void conf(struct ctx *c, int argc, char **argv) > sizeof(c->ip6.ifname_out), "%s", optarg); > if (ret <= 0 || ret >= (int)sizeof(c->ip6.ifname_out)) > die("Invalid interface name: %s", optarg); > - Unrelated change. > break; > case 17: > if (c->mode != MODE_PASTA) > @@ -1468,6 +1477,16 @@ void conf(struct ctx *c, int argc, char **argv) > conf_nat(optarg, &c->ip4.map_guest_addr, > &c->ip6.map_guest_addr, NULL); > break; > + case 23: > + if (c->mode == MODE_PASTA) { > + err("--vhost-user is for passt mode only"); > + usage(argv[0], stdout, EXIT_SUCCESS); > + } > + c->mode = MODE_VU; > + break; > + case 24: > + vu_print_capabilities(); > + break; > case 'd': > c->debug = 1; > c->quiet = 0; > diff --git a/epoll_type.h b/epoll_type.h > index 0ad1efa0ccec..f3ef41584757 100644 > --- a/epoll_type.h > +++ b/epoll_type.h > @@ -36,6 +36,10 @@ enum epoll_type { > EPOLL_TYPE_TAP_PASST, > /* socket listening for qemu socket connections */ > EPOLL_TYPE_TAP_LISTEN, > + /* vhost-user command socket */ > + EPOLL_TYPE_VHOST_CMD, > + /* vhost-user kick event socket */ > + EPOLL_TYPE_VHOST_KICK, > > EPOLL_NUM_TYPES, > }; > diff --git a/isolation.c b/isolation.c > index 45fba1e68b9d..3d5fd60fde46 100644 > --- a/isolation.c > +++ b/isolation.c > @@ -377,14 +377,21 @@ void isolate_postfork(const struct ctx *c) > { > struct sock_fprog prog; > > - prctl(PR_SET_DUMPABLE, 0); > + //prctl(PR_SET_DUMPABLE, 0); Useful during testing, but probably doesn't belong in your final patch. 
> - if (c->mode == MODE_PASTA) { > - prog.len = (unsigned short)ARRAY_SIZE(filter_pasta); > - prog.filter = filter_pasta; > - } else { > + switch (c->mode) { > + case MODE_PASST: > prog.len = (unsigned short)ARRAY_SIZE(filter_passt); > prog.filter = filter_passt; > + break; > + case MODE_PASTA: > + prog.len = (unsigned short)ARRAY_SIZE(filter_pasta); > + prog.filter = filter_pasta; > + break; > + case MODE_VU: > + prog.len = (unsigned short)ARRAY_SIZE(filter_vu); > + prog.filter = filter_vu; > + break; > } > > if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) || > diff --git a/packet.c b/packet.c > index 37489961a37e..e5a78d079231 100644 > --- a/packet.c > +++ b/packet.c > @@ -36,6 +36,17 @@ > static int packet_check_range(const struct pool *p, size_t offset, size_t len, > const char *start, const char *func, int line) > { > + if (p->buf_size == 0) { > + int ret; > + > + ret = vu_packet_check_range((void *)p->buf, offset, len, start); > + > + if (ret == -1) > + trace("cannot find region, %s:%i", func, line); > + > + return ret; > + } > + > if (start < p->buf) { > trace("packet start %p before buffer start %p, " > "%s:%i", (void *)start, (void *)p->buf, func, line); > diff --git a/packet.h b/packet.h > index 8377dcf678bb..3f70e949c066 100644 > --- a/packet.h > +++ b/packet.h > @@ -8,8 +8,10 @@ > > /** > * struct pool - Generic pool of packets stored in a buffer > - * @buf: Buffer storing packet descriptors > - * @buf_size: Total size of buffer > + * @buf: Buffer storing packet descriptors, > + * a struct vu_dev_region array for passt vhost-user mode > + * @buf_size: Total size of buffer, > + * 0 for passt vhost-user mode > * @size: Number of usable descriptors for the pool > * @count: Number of used descriptors for the pool > * @pkt: Descriptors: see macros below > @@ -22,6 +24,8 @@ struct pool { > struct iovec pkt[1]; > }; > > +int vu_packet_check_range(void *buf, size_t offset, size_t len, > + const char *start); > void packet_add_do(struct pool *p, size_t len, const 
char *start, > const char *func, int line); > void *packet_get_do(const struct pool *p, const size_t idx, > diff --git a/passt.1 b/passt.1 > index 79d134dbe098..822714147be8 100644 > --- a/passt.1 > +++ b/passt.1 > @@ -378,12 +378,20 @@ interface address are configured on a given host interface. > .SS \fBpasst\fR-only options > > .TP > -.BR \-s ", " \-\-socket " " \fIpath > +.BR \-s ", " \-\-socket-path ", " \-\-socket " " \fIpath > Path for UNIX domain socket used by \fBqemu\fR(1) or \fBqrap\fR(1) to connect to > \fBpasst\fR. > Default is to probe a free socket, not accepting connections, starting from > \fI/tmp/passt_1.socket\fR to \fI/tmp/passt_64.socket\fR. > > +.TP > +.BR \-\-vhost-user > +Enable vhost-user. The vhost-user command socket is provided by \fB--socket\fR. > + > +.TP > +.BR \-\-print-capabilities > +Print back-end capabilities in JSON format, only meaningful for vhost-user mode. > + > .TP > .BR \-F ", " \-\-fd " " \fIFD > Pass a pre-opened, connected socket to \fBpasst\fR. 
Usually the socket is opened > diff --git a/passt.c b/passt.c > index ad6f0bc32df6..b64efeaf346c 100644 > --- a/passt.c > +++ b/passt.c > @@ -74,6 +74,8 @@ char *epoll_type_str[] = { > [EPOLL_TYPE_TAP_PASTA] = "/dev/net/tun device", > [EPOLL_TYPE_TAP_PASST] = "connected qemu socket", > [EPOLL_TYPE_TAP_LISTEN] = "listening qemu socket", > + [EPOLL_TYPE_VHOST_CMD] = "vhost-user command socket", > + [EPOLL_TYPE_VHOST_KICK] = "vhost-user kick socket", > }; > static_assert(ARRAY_SIZE(epoll_type_str) == EPOLL_NUM_TYPES, > "epoll_type_str[] doesn't match enum epoll_type"); > @@ -206,6 +208,7 @@ int main(int argc, char **argv) > struct rlimit limit; > struct timespec now; > struct sigaction sa; > + struct vu_dev vdev; > > clock_gettime(CLOCK_MONOTONIC, &log_start); > > @@ -262,6 +265,8 @@ int main(int argc, char **argv) > pasta_netns_quit_init(&c); > > tap_sock_init(&c); > + if (c.mode == MODE_VU) > + vu_init(&c, &vdev); vhost-user is the "tap" interface in vhost-user mode, so I think the vu_init() could be another branch within tap_sock_init(), rather than invoked from the top level here. Feel free to update the name 'tap_sock_init' to something less inaccurate while you're at it.. 
> > secret_init(&c); > > @@ -352,14 +357,31 @@ loop: > tcp_timer_handler(&c, ref); > break; > case EPOLL_TYPE_UDP_LISTEN: > - udp_listen_sock_handler(&c, ref, eventmask, &now); > + if (c.mode == MODE_VU) { > + udp_vu_listen_sock_handler(&c, ref, eventmask, > + &now); > + } else { > + udp_buf_listen_sock_handler(&c, ref, eventmask, > + &now); > + } > break; > case EPOLL_TYPE_UDP_REPLY: > - udp_reply_sock_handler(&c, ref, eventmask, &now); > + if (c.mode == MODE_VU) > + udp_vu_reply_sock_handler(&c, ref, eventmask, > + &now); > + else > + udp_buf_reply_sock_handler(&c, ref, eventmask, > + &now); > break; > case EPOLL_TYPE_PING: > icmp_sock_handler(&c, ref); > break; > + case EPOLL_TYPE_VHOST_CMD: > + vu_control_handler(&vdev, c.fd_tap, eventmask); > + break; > + case EPOLL_TYPE_VHOST_KICK: > + vu_kick_cb(&vdev, ref, &now); > + break; > default: > /* Can't happen */ > ASSERT(0); > diff --git a/passt.h b/passt.h > index 031c9b669cc4..a98f043c7e64 100644 > --- a/passt.h > +++ b/passt.h > @@ -25,6 +25,8 @@ union epoll_ref; > #include "fwd.h" > #include "tcp.h" > #include "udp.h" > +#include "udp_vu.h" > +#include "vhost_user.h" > > /* Default address for our end on the tap interface. Bit 0 of byte 0 must be 0 > * (unicast) and bit 1 of byte 1 must be 1 (locally administered). Otherwise > @@ -94,6 +96,7 @@ struct fqdn { > enum passt_modes { > MODE_PASST, > MODE_PASTA, > + MODE_VU, > }; > > /** > @@ -227,6 +230,7 @@ struct ip6_ctx { > * @no_ra: Disable router advertisements > * @low_wmem: Low probed net.core.wmem_max > * @low_rmem: Low probed net.core.rmem_max > + * @vdev: vhost-user device > */ > struct ctx { > enum passt_modes mode; > @@ -287,6 +291,8 @@ struct ctx { > > int low_wmem; > int low_rmem; > + > + struct vu_dev *vdev; At some point I'd like to split off all the tap backend related fields and put them in a struct tap_ctx or similar. Or, I guess a union for the different tap-types. 
> }; > > void proto_update_l2_buf(const unsigned char *eth_d, > diff --git a/pcap.c b/pcap.c > index 46cc4b0d72b6..7e9c56090041 100644 > --- a/pcap.c > +++ b/pcap.c > @@ -140,7 +140,6 @@ void pcap_multiple(const struct iovec *iov, size_t frame_parts, unsigned int n, > * containing packet data to write, including L2 header > * @iovcnt: Number of buffers (@iov entries) > */ > -/* cppcheck-suppress unusedFunction */ > void pcap_iov(const struct iovec *iov, size_t iovcnt) > { > struct timespec now; > diff --git a/tap.c b/tap.c > index 41af6a6d0c85..3e1b3c13c321 100644 > --- a/tap.c > +++ b/tap.c > @@ -58,6 +58,7 @@ > #include "packet.h" > #include "tap.h" > #include "log.h" > +#include "vhost_user.h" > > /* IPv4 (plus ARP) and IPv6 message batches from tap/guest to IP handlers */ > static PACKET_POOL_NOINIT(pool_tap4, TAP_MSGS, pkt_buf); > @@ -78,16 +79,22 @@ void tap_send_single(const struct ctx *c, const void *data, size_t l2len) > struct iovec iov[2]; > size_t iovcnt = 0; > > - if (c->mode == MODE_PASST) { > + switch (c->mode) { > + case MODE_PASST: > iov[iovcnt] = IOV_OF_LVALUE(vnet_len); > iovcnt++; > - } > - > - iov[iovcnt].iov_base = (void *)data; > - iov[iovcnt].iov_len = l2len; > - iovcnt++; > + /* fall through */ > + case MODE_PASTA: > + iov[iovcnt].iov_base = (void *)data; > + iov[iovcnt].iov_len = l2len; > + iovcnt++; > > - tap_send_frames(c, iov, iovcnt, 1); > + tap_send_frames(c, iov, iovcnt, 1); > + break; > + case MODE_VU: > + vu_send(c->vdev, data, l2len); I'm a bit uneasy re-introducing a parallel send function for the slow path, rather than using a common tap_send_frames() interface. Any chance you can unify those sensibly? Bearing in mind that this _is_ the slow path, so if you have to copy a bunch of stuff, that's ok. 
> + break; > + } > } > > /** > @@ -406,10 +413,18 @@ size_t tap_send_frames(const struct ctx *c, const struct iovec *iov, > if (!nframes) > return 0; > > - if (c->mode == MODE_PASTA) > + switch (c->mode) { > + case MODE_PASTA: > m = tap_send_frames_pasta(c, iov, bufs_per_frame, nframes); > - else > + break; > + case MODE_PASST: > m = tap_send_frames_passt(c, iov, bufs_per_frame, nframes); > + break; > + case MODE_VU: > + /* fall through */ > + default: > + ASSERT(0); > + } > > if (m < nframes) > debug("tap: failed to send %zu frames of %zu", > @@ -968,7 +983,7 @@ void tap_add_packet(struct ctx *c, ssize_t l2len, char *p) > * tap_sock_reset() - Handle closing or failure of connect AF_UNIX socket > * @c: Execution context > */ > -static void tap_sock_reset(struct ctx *c) > +void tap_sock_reset(struct ctx *c) > { > info("Client connection closed%s", c->one_off ? ", exiting" : ""); > > @@ -979,6 +994,8 @@ static void tap_sock_reset(struct ctx *c) > epoll_ctl(c->epollfd, EPOLL_CTL_DEL, c->fd_tap, NULL); > close(c->fd_tap); > c->fd_tap = -1; > + if (c->mode == MODE_VU) > + vu_cleanup(c->vdev); > } > > /** > @@ -1196,11 +1213,17 @@ static void tap_sock_unix_init(struct ctx *c) > ev.data.u64 = ref.u64; > epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap_listen, &ev); > > - info("\nYou can now start qemu (>= 7.2, with commit 13c6be96618c):"); > - info(" kvm ... -device virtio-net-pci,netdev=s -netdev stream,id=s,server=off,addr.type=unix,addr.path=%s", > - c->sock_path); > - info("or qrap, for earlier qemu versions:"); > - info(" ./qrap 5 kvm ... -net socket,fd=5 -net nic,model=virtio"); > + if (c->mode == MODE_VU) { > + info("You can start qemu with:"); > + info(" kvm ... 
-chardev socket,id=chr0,path=%s -netdev vhost-user,id=netdev0,chardev=chr0 -device virtio-net,netdev=netdev0 -object memory-backend-memfd,id=memfd0,share=on,size=$RAMSIZE -numa node,memdev=memfd0\n", > + c->sock_path); > + } else { > + info("\nYou can now start qemu (>= 7.2, with commit 13c6be96618c):"); > + info(" kvm ... -device virtio-net-pci,netdev=s -netdev stream,id=s,server=off,addr.type=unix,addr.path=%s", > + c->sock_path); > + info("or qrap, for earlier qemu versions:"); > + info(" ./qrap 5 kvm ... -net socket,fd=5 -net nic,model=virtio"); > + } > } > > /** > @@ -1210,8 +1233,8 @@ static void tap_sock_unix_init(struct ctx *c) > */ > void tap_listen_handler(struct ctx *c, uint32_t events) > { > - union epoll_ref ref = { .type = EPOLL_TYPE_TAP_PASST }; > struct epoll_event ev = { 0 }; > + union epoll_ref ref; > int v = INT_MAX / 2; > struct ucred ucred; > socklen_t len; > @@ -1251,6 +1274,10 @@ void tap_listen_handler(struct ctx *c, uint32_t events) > trace("tap: failed to set SO_SNDBUF to %i", v); > > ref.fd = c->fd_tap; > + if (c->mode == MODE_VU) > + ref.type = EPOLL_TYPE_VHOST_CMD; > + else > + ref.type = EPOLL_TYPE_TAP_PASST; > ev.events = EPOLLIN | EPOLLRDHUP; > ev.data.u64 = ref.u64; > epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap, &ev); > @@ -1312,21 +1339,52 @@ static void tap_sock_tun_init(struct ctx *c) > epoll_ctl(c->epollfd, EPOLL_CTL_ADD, c->fd_tap, &ev); > } > > +/** > + * tap_sock_update_buf() - Set the buffer base and size for the pool of packets > + * @base: Buffer base > + * @size Buffer size > + */ > +void tap_sock_update_buf(void *base, size_t size) > +{ > + int i; > + > + pool_tap4_storage.buf = base; > + pool_tap4_storage.buf_size = size; > + pool_tap6_storage.buf = base; > + pool_tap6_storage.buf_size = size; > + > + for (i = 0; i < TAP_SEQS; i++) { > + tap4_l4[i].p.buf = base; > + tap4_l4[i].p.buf_size = size; > + tap6_l4[i].p.buf = base; > + tap6_l4[i].p.buf_size = size; > + } > +} > + > /** > * tap_sock_init() - Create and set up 
AF_UNIX socket or tuntap file descriptor > * @c: Execution context > */ > void tap_sock_init(struct ctx *c) > { > - size_t sz = sizeof(pkt_buf); > + size_t sz; > + char *buf; > int i; > > - pool_tap4_storage = PACKET_INIT(pool_tap4, TAP_MSGS, pkt_buf, sz); > - pool_tap6_storage = PACKET_INIT(pool_tap6, TAP_MSGS, pkt_buf, sz); > + if (c->mode == MODE_VU) { > + buf = NULL; > + sz = 0; > + } else { > + buf = pkt_buf; > + sz = sizeof(pkt_buf); > + } > + > + pool_tap4_storage = PACKET_INIT(pool_tap4, TAP_MSGS, buf, sz); > + pool_tap6_storage = PACKET_INIT(pool_tap6, TAP_MSGS, buf, sz); > > for (i = 0; i < TAP_SEQS; i++) { > - tap4_l4[i].p = PACKET_INIT(pool_l4, UIO_MAXIOV, pkt_buf, sz); > - tap6_l4[i].p = PACKET_INIT(pool_l4, UIO_MAXIOV, pkt_buf, sz); > + tap4_l4[i].p = PACKET_INIT(pool_l4, UIO_MAXIOV, buf, sz); > + tap6_l4[i].p = PACKET_INIT(pool_l4, UIO_MAXIOV, buf, sz); Any chance you could re-use tap_sock_update_buf() for this path that's very similar? > } > > if (c->fd_tap != -1) { /* Passed as --fd */ > @@ -1335,10 +1393,17 @@ void tap_sock_init(struct ctx *c) > > ASSERT(c->one_off); > ref.fd = c->fd_tap; > - if (c->mode == MODE_PASST) > + switch (c->mode) { > + case MODE_PASST: > ref.type = EPOLL_TYPE_TAP_PASST; > - else > + break; > + case MODE_PASTA: > ref.type = EPOLL_TYPE_TAP_PASTA; > + break; > + case MODE_VU: > + ref.type = EPOLL_TYPE_VHOST_CMD; > + break; > + } > > ev.events = EPOLLIN | EPOLLRDHUP; > ev.data.u64 = ref.u64; > diff --git a/tap.h b/tap.h > index ec9e2acec460..c5447f7077eb 100644 > --- a/tap.h > +++ b/tap.h > @@ -40,7 +40,8 @@ static inline struct iovec tap_hdr_iov(const struct ctx *c, > */ > static inline void tap_hdr_update(struct tap_hdr *thdr, size_t l2len) > { > - thdr->vnet_len = htonl(l2len); > + if (thdr) > + thdr->vnet_len = htonl(l2len); > } > > void tap_udp4_send(const struct ctx *c, struct in_addr src, in_port_t sport, > @@ -68,6 +69,8 @@ void tap_handler_pasta(struct ctx *c, uint32_t events, > void tap_handler_passt(struct ctx *c, 
uint32_t events, > const struct timespec *now); > int tap_sock_unix_open(char *sock_path); > +void tap_sock_reset(struct ctx *c); > +void tap_sock_update_buf(void *base, size_t size); > void tap_sock_init(struct ctx *c); > void tap_flush_pools(void); > void tap_handler(struct ctx *c, const struct timespec *now); > diff --git a/tcp.c b/tcp.c > index f9fe1b9a1330..b4b8864799a8 100644 > --- a/tcp.c > +++ b/tcp.c > @@ -304,6 +304,7 @@ > #include "flow_table.h" > #include "tcp_internal.h" > #include "tcp_buf.h" > +#include "tcp_vu.h" > > /* MSS rounding: see SET_MSS() */ > #define MSS_DEFAULT 536 > @@ -903,6 +904,7 @@ static void tcp_fill_header(struct tcphdr *th, > * @dlen: TCP payload length > * @check: Checksum, if already known > * @seq: Sequence number for this segment > + * @no_tcp_csum: Do not set TCP checksum > * > * Return: The IPv4 payload length, host order > */ > @@ -910,7 +912,7 @@ static size_t tcp_fill_headers4(const struct tcp_tap_conn *conn, > struct tap_hdr *taph, > struct iphdr *iph, struct tcphdr *th, > size_t dlen, const uint16_t *check, > - uint32_t seq) > + uint32_t seq, bool no_tcp_csum) > { > const struct flowside *tapside = TAPFLOW(conn); > const struct in_addr *src4 = inany_v4(&tapside->oaddr); > @@ -929,7 +931,10 @@ static size_t tcp_fill_headers4(const struct tcp_tap_conn *conn, > > tcp_fill_header(th, conn, seq); > > - tcp_update_check_tcp4(iph, th); > + if (no_tcp_csum) > + th->check = 0; > + else > + tcp_update_check_tcp4(iph, th); It's at least theoretically possible we could have other use cases for skipping checksums than vhost-user, so I'd kind of like to see this change split out to simplify the huge vhost-user patch a bit. 
> > tap_hdr_update(taph, l3len + sizeof(struct ethhdr)); > > @@ -945,13 +950,14 @@ static size_t tcp_fill_headers4(const struct tcp_tap_conn *conn, > * @dlen: TCP payload length > * @check: Checksum, if already known > * @seq: Sequence number for this segment > + * @no_tcp_csum: Do not set TCP checksum > * > * Return: The IPv6 payload length, host order > */ > static size_t tcp_fill_headers6(const struct tcp_tap_conn *conn, > struct tap_hdr *taph, > struct ipv6hdr *ip6h, struct tcphdr *th, > - size_t dlen, uint32_t seq) > + size_t dlen, uint32_t seq, bool no_tcp_csum) > { > const struct flowside *tapside = TAPFLOW(conn); > size_t l4len = dlen + sizeof(*th); > @@ -970,7 +976,10 @@ static size_t tcp_fill_headers6(const struct tcp_tap_conn *conn, > > tcp_fill_header(th, conn, seq); > > - tcp_update_check_tcp6(ip6h, th); > + if (no_tcp_csum) > + th->check = 0; > + else > + tcp_update_check_tcp6(ip6h, th); > > tap_hdr_update(taph, l4len + sizeof(*ip6h) + sizeof(struct ethhdr)); > > @@ -984,12 +993,14 @@ static size_t tcp_fill_headers6(const struct tcp_tap_conn *conn, > * @dlen: TCP payload length > * @check: Checksum, if already known > * @seq: Sequence number for this segment > + * @no_tcp_csum: Do not set TCP checksum > * > * Return: IP payload length, host order > */ > size_t tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn, > struct iovec *iov, size_t dlen, > - const uint16_t *check, uint32_t seq) > + const uint16_t *check, uint32_t seq, > + bool no_tcp_csum) > { > const struct flowside *tapside = TAPFLOW(conn); > const struct in_addr *a4 = inany_v4(&tapside->oaddr); > @@ -998,13 +1009,13 @@ size_t tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn, > return tcp_fill_headers4(conn, iov[TCP_IOV_TAP].iov_base, > iov[TCP_IOV_IP].iov_base, > iov[TCP_IOV_PAYLOAD].iov_base, dlen, > - check, seq); > + check, seq, no_tcp_csum); > } > > return tcp_fill_headers6(conn, iov[TCP_IOV_TAP].iov_base, > iov[TCP_IOV_IP].iov_base, > iov[TCP_IOV_PAYLOAD].iov_base, dlen, > - 
seq); > + seq, no_tcp_csum); > } > > /** > @@ -1237,6 +1248,9 @@ int tcp_prepare_flags(struct ctx *c, struct tcp_tap_conn *conn, > */ > int tcp_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) > { > + if (c->mode == MODE_VU) > + return tcp_vu_send_flag(c, conn, flags); > + > return tcp_buf_send_flag(c, conn, flags); > } > > @@ -1630,6 +1644,9 @@ static int tcp_sock_consume(const struct tcp_tap_conn *conn, uint32_t ack_seq) > */ > static int tcp_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) > { > + if (c->mode == MODE_VU) > + return tcp_vu_data_from_sock(c, conn); > + > return tcp_buf_data_from_sock(c, conn); > } > > diff --git a/tcp_buf.c b/tcp_buf.c > index 1a398461a34b..10a663bdfc26 100644 > --- a/tcp_buf.c > +++ b/tcp_buf.c > @@ -320,7 +320,7 @@ int tcp_buf_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) > return ret; > } > > - l4len = tcp_l2_buf_fill_headers(conn, iov, optlen, NULL, seq); > + l4len = tcp_l2_buf_fill_headers(conn, iov, optlen, NULL, seq, false); > iov[TCP_IOV_PAYLOAD].iov_len = l4len; > > if (flags & DUP_ACK) { > @@ -381,7 +381,8 @@ static void tcp_data_to_tap(struct ctx *c, struct tcp_tap_conn *conn, > tcp4_frame_conns[tcp4_payload_used] = conn; > > iov = tcp4_l2_iov[tcp4_payload_used++]; > - l4len = tcp_l2_buf_fill_headers(conn, iov, dlen, check, seq); > + l4len = tcp_l2_buf_fill_headers(conn, iov, dlen, check, seq, > + false); > iov[TCP_IOV_PAYLOAD].iov_len = l4len; > if (tcp4_payload_used > TCP_FRAMES_MEM - 1) > tcp_payload_flush(c); > @@ -389,7 +390,8 @@ static void tcp_data_to_tap(struct ctx *c, struct tcp_tap_conn *conn, > tcp6_frame_conns[tcp6_payload_used] = conn; > > iov = tcp6_l2_iov[tcp6_payload_used++]; > - l4len = tcp_l2_buf_fill_headers(conn, iov, dlen, NULL, seq); > + l4len = tcp_l2_buf_fill_headers(conn, iov, dlen, NULL, seq, > + false); > iov[TCP_IOV_PAYLOAD].iov_len = l4len; > if (tcp6_payload_used > TCP_FRAMES_MEM - 1) > tcp_payload_flush(c); > diff --git a/tcp_internal.h 
b/tcp_internal.h > index aa8bb64f1f33..e7fe735bfcb4 100644 > --- a/tcp_internal.h > +++ b/tcp_internal.h > @@ -91,7 +91,8 @@ void tcp_rst_do(struct ctx *c, struct tcp_tap_conn *conn); > > size_t tcp_l2_buf_fill_headers(const struct tcp_tap_conn *conn, > struct iovec *iov, size_t dlen, > - const uint16_t *check, uint32_t seq); > + const uint16_t *check, uint32_t seq, > + bool no_tcp_csum); > int tcp_update_seqack_wnd(const struct ctx *c, struct tcp_tap_conn *conn, > int force_seq, struct tcp_info *tinfo); > int tcp_prepare_flags(struct ctx *c, struct tcp_tap_conn *conn, int flags, > diff --git a/tcp_vu.c b/tcp_vu.c > new file mode 100644 > index 000000000000..e3e32d628524 > --- /dev/null > +++ b/tcp_vu.c > @@ -0,0 +1,647 @@ > +// SPDX-License-Identifier: GPL-2.0-or-later > +/* tcp_vu.c - TCP L2 vhost-user management functions > + * > + * Copyright Red Hat > + * Author: Laurent Vivier > + */ > + > +#include > +#include > +#include > + > +#include > + > +#include > + > +#include > +#include > + > +#include "util.h" > +#include "ip.h" > +#include "passt.h" > +#include "siphash.h" > +#include "inany.h" > +#include "vhost_user.h" > +#include "tcp.h" > +#include "pcap.h" > +#include "flow.h" > +#include "tcp_conn.h" > +#include "flow_table.h" > +#include "tcp_vu.h" > +#include "tcp_internal.h" > +#include "checksum.h" > +#include "vu_common.h" > + > +/** > + * struct tcp_payload_t - TCP header and data to send segments with payload > + * @th: TCP header > + * @data: TCP data > + */ > +struct tcp_payload_t { > + struct tcphdr th; > + uint8_t data[IP_MAX_MTU - sizeof(struct tcphdr)]; > +}; Can you not share this with the tcp_buf.c version? Surely this one also needs to be ((packed)), yes? > + > +/** > + * struct tcp_flags_t - TCP header and data to send zero-length > + * segments (flags) > + * @th: TCP header > + * @opts TCP options > + */ > +struct tcp_flags_t { > + struct tcphdr th; > + char opts[OPT_MSS_LEN + OPT_WS_LEN + 1]; > +}; Ditto. 
> +static struct iovec iov_vu[VIRTQUEUE_MAX_SIZE]; IIUC the code below, iov_vu[0] is always the discard buffer, the remainder corresponds to each element in elem[]. So... shouldn't this have VIRTQUEUE_MAX_SIZE+1 elements? > +static struct vu_virtq_element elem[VIRTQUEUE_MAX_SIZE]; I think "elem" is too brief and vague a name for a global. > + > +/** > + * tcp_vu_l2_hdrlen() - return the size of the header in level 2 frame (TCP) > + * @v6: Set for IPv6 packet > + * > + * Return: Return the size of the header > + */ > +static size_t tcp_vu_l2_hdrlen(bool v6) I don't love the name here, since the returned size is not just of the L2 header, but the total of the L4, L3, L2 and backend specific headers. > +{ > + size_t l2_hdrlen; > + > + l2_hdrlen = sizeof(struct virtio_net_hdr_mrg_rxbuf) + sizeof(struct ethhdr) + > + sizeof(struct tcphdr); > + > + if (v6) > + l2_hdrlen += sizeof(struct ipv6hdr); > + else > + l2_hdrlen += sizeof(struct iphdr); > + > + return l2_hdrlen; > +} > + > +/** > + * tcp_vu_pcap() - Capture a single frame to pcap file (TCP) Why does this need to be TCP specific? That seems odd.. > + * @c: Execution context > + * @tapside: Address information for one side of the flow > + * @iov: Pointer to the array of IO vectors > + * @iov_used: Length of the array > + * @l4len: IPv4 Payload length Is l4len implied by the total of the lengths in the iov? > + */ > +static void tcp_vu_pcap(const struct ctx *c, const struct flowside *tapside, > + struct iovec *iov, int iov_used, size_t l4len) > +{ > + const struct in_addr *src = inany_v4(&tapside->oaddr); > + const struct in_addr *dst = inany_v4(&tapside->eaddr); I think calling these 'src4' and 'dst4' would be less misleading. > + char *base = iov[0].iov_base; > + size_t size = iov[0].iov_len; IIUC, this is assuming that all the headers are within the first IOV. Is that safe? 
> + struct tcp_payload_t *bp; > + uint32_t sum; > + > + if (!*c->pcap) > + return; > + > + if (src && dst) { > + bp = vu_payloadv4(base); > + sum = proto_ipv4_header_psum(l4len, IPPROTO_TCP, > + *src, *dst); > + } else { > + bp = vu_payloadv6(base); > + sum = proto_ipv6_header_psum(l4len, IPPROTO_TCP, > + &tapside->oaddr.a6, > + &tapside->eaddr.a6); > + } > + iov[0].iov_base = &bp->th; > + iov[0].iov_len = size - ((char *)iov[0].iov_base - base); > + bp->th.check = 0; > + bp->th.check = csum_iov(iov, iov_used, sum); Patching the checksum in here seems messy. Couldn't we disable the skipping of the checksum if c->pcap instead? > + /* set iov for pcap logging */ > + iov[0].iov_base = base + sizeof(struct virtio_net_hdr_mrg_rxbuf); > + iov[0].iov_len = size - sizeof(struct virtio_net_hdr_mrg_rxbuf); > + > + pcap_iov(iov, iov_used); > + > + /* restore iov[0] */ > + iov[0].iov_base = base; > + iov[0].iov_len = size; > +} > + > +/** > + * tcp_vu_send_flag() - Send segment with flags to vhost-user (no payload) > + * @c: Execution context > + * @conn: Connection pointer > + * @flags: TCP flags: if not set, send segment only if ACK is due > + * > + * Return: negative error code on connection reset, 0 otherwise > + */ > +int tcp_vu_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags) > +{ > + struct vu_dev *vdev = c->vdev; > + struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE]; > + const struct flowside *tapside = TAPFLOW(conn); > + struct virtio_net_hdr_mrg_rxbuf *vh; > + struct iovec l2_iov[TCP_NUM_IOVS]; > + size_t l2len, l4len, optlen; > + struct iovec in_sg; > + struct ethhdr *eh; > + int nb_ack; > + int ret; > + > + elem[0].out_num = 0; > + elem[0].out_sg = NULL; > + elem[0].in_num = 1; > + elem[0].in_sg = &in_sg; Is there a reason to use part of the global array, rather than a local here? 
> + ret = vu_queue_pop(vdev, vq, &elem[0]); > + if (ret < 0) > + return 0; > + > + if (elem[0].in_num < 1) { > + debug("virtio-net receive queue contains no in buffers"); > + vu_queue_rewind(vq, 1); > + return 0; > + } > + > + vh = elem[0].in_sg[0].iov_base; > + > + vh->hdr = VU_HEADER; > + if (vu_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF)) > + vh->num_buffers = htole16(1); > + > + l2_iov[TCP_IOV_TAP].iov_base = NULL; > + l2_iov[TCP_IOV_TAP].iov_len = 0; So.. to me it would seem logical to use TCP_IOV_TAP to cover the virtio_net_hdr_mrg_rxbuf. Is there a reason not to? > + l2_iov[TCP_IOV_ETH].iov_base = (char *)elem[0].in_sg[0].iov_base + sizeof(struct virtio_net_hdr_mrg_rxbuf); You could use vu_eth() here, no? > + l2_iov[TCP_IOV_ETH].iov_len = sizeof(struct ethhdr); > + > + eh = l2_iov[TCP_IOV_ETH].iov_base; I think you could do this more neatly by setting eh (with vu_eth()) first, then using IOV_OF_LVALUE(*eh). > + > + memcpy(eh->h_dest, c->guest_mac, sizeof(eh->h_dest)); > + memcpy(eh->h_source, c->our_tap_mac, sizeof(eh->h_source)); I wonder if it would make sense just to have a single memcpy() of tcp[46]_eth_src, since it's sitting around anyway. > + > + if (CONN_V4(conn)) { > + struct tcp_flags_t *payload; > + struct iphdr *iph; > + uint32_t seq; > + > + l2_iov[TCP_IOV_IP].iov_base = (char *)l2_iov[TCP_IOV_ETH].iov_base + > + l2_iov[TCP_IOV_ETH].iov_len; > + l2_iov[TCP_IOV_IP].iov_len = sizeof(struct iphdr); > + l2_iov[TCP_IOV_PAYLOAD].iov_base = (char *)l2_iov[TCP_IOV_IP].iov_base + > + l2_iov[TCP_IOV_IP].iov_len; > + > + eh->h_proto = htons(ETH_P_IP); > + > + iph = l2_iov[TCP_IOV_IP].iov_base; > + *iph = (struct iphdr)L2_BUF_IP4_INIT(IPPROTO_TCP); Likewise I think you can make this neater by setting iph first, then the iov using IOV_OF_LVALUE(*iph). Again, this is assuming that all the headers - and in this case the options too - all fit in the single contiguous buffer we've pulled off the queue. Is that safe? 
> + > + payload = l2_iov[TCP_IOV_PAYLOAD].iov_base; > + payload->th = (struct tcphdr){ > + .doff = offsetof(struct tcp_flags_t, opts) / 4, > + .ack = 1 > + }; Technically this will have some redundant assignments (for all the non-specified fields) with tcp_l2_buf_fill_headers(). It'll be cache hot, so probably not a big deal, but maybe worth thinking about. > + > + seq = conn->seq_to_tap; > + ret = tcp_prepare_flags(c, conn, flags, &payload->th, payload->opts, &optlen); > + if (ret <= 0) { > + vu_queue_rewind(vq, 1); > + return ret; > + } > + > + l4len = tcp_l2_buf_fill_headers(conn, l2_iov, optlen, NULL, seq, > + true); > + /* keep the following assignment for clarity */ > + /* cppcheck-suppress unreadVariable */ > + l2_iov[TCP_IOV_PAYLOAD].iov_len = l4len; > + > + l2len = l4len + sizeof(*iph) + sizeof(struct ethhdr); > + } else { > + struct tcp_flags_t *payload; > + struct ipv6hdr *ip6h; > + uint32_t seq; > + > + l2_iov[TCP_IOV_IP].iov_base = (char *)l2_iov[TCP_IOV_ETH].iov_base + > + l2_iov[TCP_IOV_ETH].iov_len; > + l2_iov[TCP_IOV_IP].iov_len = sizeof(struct ipv6hdr); > + l2_iov[TCP_IOV_PAYLOAD].iov_base = (char *)l2_iov[TCP_IOV_IP].iov_base + > + l2_iov[TCP_IOV_IP].iov_len; > + > + eh->h_proto = htons(ETH_P_IPV6); > + > + ip6h = l2_iov[TCP_IOV_IP].iov_base; > + *ip6h = (struct ipv6hdr)L2_BUF_IP6_INIT(IPPROTO_TCP); > + > + payload = l2_iov[TCP_IOV_PAYLOAD].iov_base; > + payload->th = (struct tcphdr){ > + .doff = offsetof(struct tcp_flags_t, opts) / 4, > + .ack = 1 > + }; > + > + seq = conn->seq_to_tap; > + ret = tcp_prepare_flags(c, conn, flags, &payload->th, payload->opts, &optlen); > + if (ret <= 0) { > + vu_queue_rewind(vq, 1); > + return ret; > + } > + > + l4len = tcp_l2_buf_fill_headers(conn, l2_iov, optlen, NULL, seq, > + true); > + /* keep the following assignment for clarity */ > + /* cppcheck-suppress unreadVariable */ > + l2_iov[TCP_IOV_PAYLOAD].iov_len = l4len; > + > + l2len = l4len + sizeof(*ip6h) + sizeof(struct ethhdr); > + } > + l2len += 
sizeof(struct virtio_net_hdr_mrg_rxbuf); The ethhdr and l4len components could also be added in this common line. > + ASSERT(l2len <= elem[0].in_sg[0].iov_len); Hrm.. if you hit this assert, you've already clobbered memory you shouldn't have. > + elem[0].in_sg[0].iov_len = l2len; > + tcp_vu_pcap(c, tapside, &elem[0].in_sg[0], 1, l4len); > + > + vu_queue_fill(vq, &elem[0], l2len, 0); > + nb_ack = 1; > + > + if (flags & DUP_ACK) { > + struct iovec in_sg_dup; > + > + elem[1].out_num = 0; > + elem[1].out_sg = NULL; > + elem[1].in_num = 1; > + elem[1].in_sg = &in_sg_dup; > + ret = vu_queue_pop(vdev, vq, &elem[1]); > + if (ret == 0) { > + if (elem[1].in_num < 1 || elem[1].in_sg[0].iov_len < l2len) { > + vu_queue_rewind(vq, 1); > + } else { > + memcpy(elem[1].in_sg[0].iov_base, vh, l2len); > + nb_ack++; > + > + tcp_vu_pcap(c, tapside, &elem[1].in_sg[0], 1, > + l4len); > + > + vu_queue_fill(vq, &elem[1], l2len, 1); > + } > + } > + } > + > + vu_queue_flush(vq, nb_ack); > + vu_queue_notify(vdev, vq); Is there a reason to do this here as we queue each packet, rather than deferring to the same point we call tcp_payload_flush() in the non-VU path? 
> + > + return 0; > +} > + > +/** tcp_vu_sock_recv() - Receive datastream from socket into vhost-user buffers > + * @c: Execution context > + * @conn: Connection pointer > + * @v4: Set for IPv4 connections > + * @fillsize: Number of bytes we can receive > + * @datalen: Size of received data (output) > + * > + * Return: Number of iov entries used to store the data > + */ > +static ssize_t tcp_vu_sock_recv(struct ctx *c, > + struct tcp_tap_conn *conn, bool v4, > + size_t fillsize, ssize_t *data_len) > +{ > + struct vu_dev *vdev = c->vdev; > + struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE]; > + static struct iovec in_sg[VIRTQUEUE_MAX_SIZE]; > + struct msghdr mh_sock = { 0 }; > + uint16_t mss = MSS_GET(conn); > + static int in_sg_count; > + int s = conn->sock; > + size_t l2_hdrlen; > + int segment_size; > + int iov_cnt; > + ssize_t ret; > + > + l2_hdrlen = tcp_vu_l2_hdrlen(!v4); > + > + iov_cnt = 0; > + in_sg_count = 0; > + segment_size = 0; I'm finding it pretty hard to figure out what segment_size represents. It seems to be mostly a flag indicating if you're in the middle of a single packet or not? > + *data_len = 0; > + while (fillsize > 0 && iov_cnt < VIRTQUEUE_MAX_SIZE - 1 && > + in_sg_count < ARRAY_SIZE(in_sg)) { > + > + elem[iov_cnt].out_num = 0; > + elem[iov_cnt].out_sg = NULL; > + elem[iov_cnt].in_num = ARRAY_SIZE(in_sg) - in_sg_count; > + elem[iov_cnt].in_sg = &in_sg[in_sg_count]; > + ret = vu_queue_pop(vdev, vq, &elem[iov_cnt]); > + if (ret < 0) > + break; > + > + if (elem[iov_cnt].in_num < 1) { > + warn("virtio-net receive queue contains no in buffers"); > + break; > + } > + > + in_sg_count += elem[iov_cnt].in_num; > + > + ASSERT(elem[iov_cnt].in_num == 1); This seems odd to me. If vu_queue_pop() always returns a single buffer, why does its interface seem set up to return multiple? 
> + ASSERT(elem[iov_cnt].in_sg[0].iov_len >= l2_hdrlen); It should be safe, but logically I think you only want this in the case you're putting the headers in this buffer (segment_size == 0?) > + if (segment_size == 0) { > + iov_vu[iov_cnt + 1].iov_base = > + (char *)elem[iov_cnt].in_sg[0].iov_base + l2_hdrlen; > + iov_vu[iov_cnt + 1].iov_len = > + elem[iov_cnt].in_sg[0].iov_len - l2_hdrlen; > + } else { > + iov_vu[iov_cnt + 1].iov_base = elem[iov_cnt].in_sg[0].iov_base; > + iov_vu[iov_cnt + 1].iov_len = elem[iov_cnt].in_sg[0].iov_len; > + } > + > + if (iov_vu[iov_cnt + 1].iov_len > fillsize) > + iov_vu[iov_cnt + 1].iov_len = fillsize; > + > + segment_size += iov_vu[iov_cnt + 1].iov_len; > + if (!vu_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF)) { > + segment_size = 0; > + } else if (segment_size >= mss) { > + iov_vu[iov_cnt + 1].iov_len -= segment_size - mss; > + segment_size = 0; > + } > + fillsize -= iov_vu[iov_cnt + 1].iov_len; > + > + iov_cnt++; > + } > + if (iov_cnt == 0) > + return 0; > + > + mh_sock.msg_iov = iov_vu; > + mh_sock.msg_iovlen = iov_cnt + 1; > + > + do > + ret = recvmsg(s, &mh_sock, MSG_PEEK); > + while (ret < 0 && errno == EINTR); > + > + if (ret < 0) { > + vu_queue_rewind(vq, iov_cnt); > + if (errno != EAGAIN && errno != EWOULDBLOCK) { > + ret = -errno; > + tcp_rst(c, conn); > + } > + return ret; > + } > + if (!ret) { > + vu_queue_rewind(vq, iov_cnt); > + > + if ((conn->events & (SOCK_FIN_RCVD | TAP_FIN_SENT)) == SOCK_FIN_RCVD) { > + int retf = tcp_vu_send_flag(c, conn, FIN | ACK); > + if (retf) { > + tcp_rst(c, conn); > + return retf; > + } > + > + conn_event(c, conn, TAP_FIN_SENT); > + } > + return 0; > + } > + > + *data_len = ret; > + return iov_cnt; > +} > + > +/** > + * tcp_vu_prepare() - Prepare the packet header > + * @c: Execution context > + * @conn: Connection pointer > + * @first: Pointer to the array of IO vectors > + * @data_len: Packet data length > + * @check: Checksum, if already known > + * > + * Return: Level-4 length > + */ > 
+static size_t tcp_vu_prepare(const struct ctx *c, > + struct tcp_tap_conn *conn, struct iovec *first, > + size_t data_len, const uint16_t **check) > +{ > + const struct flowside *toside = TAPFLOW(conn); > + struct iovec l2_iov[TCP_NUM_IOVS]; > + char *base = first->iov_base; > + struct ethhdr *eh; > + size_t l4len; > + > + /* we guess the first iovec provided by the guest can embed > + * all the headers needed by L2 frame > + */ > + > + l2_iov[TCP_IOV_TAP].iov_base = NULL; > + l2_iov[TCP_IOV_TAP].iov_len = 0; > + l2_iov[TCP_IOV_ETH].iov_base = base + sizeof(struct virtio_net_hdr_mrg_rxbuf); > + l2_iov[TCP_IOV_ETH].iov_len = sizeof(struct ethhdr); > + > + eh = l2_iov[TCP_IOV_ETH].iov_base; > + > + memcpy(eh->h_dest, c->guest_mac, sizeof(eh->h_dest)); > + memcpy(eh->h_source, c->our_tap_mac, sizeof(eh->h_source)); > + > + /* initialize header */ > + if (inany_v4(&toside->eaddr) && inany_v4(&toside->oaddr)) { > + struct tcp_payload_t *payload; > + struct iphdr *iph; > + > + ASSERT(first[0].iov_len >= sizeof(struct virtio_net_hdr_mrg_rxbuf) + > + sizeof(struct ethhdr) + sizeof(struct iphdr) + > + sizeof(struct tcphdr)); > + > + l2_iov[TCP_IOV_IP].iov_base = (char *)l2_iov[TCP_IOV_ETH].iov_base + > + l2_iov[TCP_IOV_ETH].iov_len; > + l2_iov[TCP_IOV_IP].iov_len = sizeof(struct iphdr); > + l2_iov[TCP_IOV_PAYLOAD].iov_base = (char *)l2_iov[TCP_IOV_IP].iov_base + > + l2_iov[TCP_IOV_IP].iov_len; > + > + > + eh->h_proto = htons(ETH_P_IP); > + > + iph = l2_iov[TCP_IOV_IP].iov_base; > + *iph = (struct iphdr)L2_BUF_IP4_INIT(IPPROTO_TCP); > + payload = l2_iov[TCP_IOV_PAYLOAD].iov_base; > + payload->th = (struct tcphdr){ > + .doff = offsetof(struct tcp_payload_t, data) / 4, > + .ack = 1 > + }; > + > + l4len = tcp_l2_buf_fill_headers(conn, l2_iov, data_len, *check, > + conn->seq_to_tap, true); > + /* keep the following assignment for clarity */ > + /* cppcheck-suppress unreadVariable */ > + l2_iov[TCP_IOV_PAYLOAD].iov_len = l4len; > + > + *check = &iph->check; > + } else { > + 
struct tcp_payload_t *payload; > + struct ipv6hdr *ip6h; > + > + ASSERT(first[0].iov_len >= sizeof(struct virtio_net_hdr_mrg_rxbuf) + > + sizeof(struct ethhdr) + sizeof(struct ipv6hdr) + > + sizeof(struct tcphdr)); > + > + l2_iov[TCP_IOV_IP].iov_base = (char *)l2_iov[TCP_IOV_ETH].iov_base + > + l2_iov[TCP_IOV_ETH].iov_len; > + l2_iov[TCP_IOV_IP].iov_len = sizeof(struct ipv6hdr); > + l2_iov[TCP_IOV_PAYLOAD].iov_base = (char *)l2_iov[TCP_IOV_IP].iov_base + > + l2_iov[TCP_IOV_IP].iov_len; > + > + > + eh->h_proto = htons(ETH_P_IPV6); > + > + ip6h = l2_iov[TCP_IOV_IP].iov_base; > + *ip6h = (struct ipv6hdr)L2_BUF_IP6_INIT(IPPROTO_TCP); > + > + payload = l2_iov[TCP_IOV_PAYLOAD].iov_base; > + payload->th = (struct tcphdr){ > + .doff = offsetof(struct tcp_payload_t, data) / 4, > + .ack = 1 > + }; > +; > + l4len = tcp_l2_buf_fill_headers(conn, l2_iov, data_len, NULL, > + conn->seq_to_tap, true); > + /* keep the following assignment for clarity */ > + /* cppcheck-suppress unreadVariable */ > + l2_iov[TCP_IOV_PAYLOAD].iov_len = l4len; > + } > + > + return l4len; > +} > + > +/** > + * tcp_vu_data_from_sock() - Handle new data from socket, queue to vhost-user, > + * in window > + * @c: Execution context > + * @conn: Connection pointer > + * > + * Return: Negative on connection reset, 0 otherwise > + */ > +int tcp_vu_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn) > +{ > + uint32_t wnd_scaled = conn->wnd_from_tap << conn->ws_from_tap; > + struct vu_dev *vdev = c->vdev; > + struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE]; > + const struct flowside *tapside = TAPFLOW(conn); > + uint16_t mss = MSS_GET(conn); > + size_t l2_hdrlen, fillsize; > + int i, iov_cnt, iov_used; > + int v4 = CONN_V4(conn); > + uint32_t already_sent = 0; > + const uint16_t *check; > + struct iovec *first; > + int segment_size; > + int num_buffers; > + ssize_t len; > + > + if (!vu_queue_enabled(vq) || !vu_queue_started(vq)) { > + flow_err(conn, > + "Got packet, but RX virtqueue not usable yet"); > 
+ return 0; > + } > + > + already_sent = conn->seq_to_tap - conn->seq_ack_from_tap; > + > + if (SEQ_LT(already_sent, 0)) { > + /* RFC 761, section 2.1. */ > + flow_trace(conn, "ACK sequence gap: ACK for %u, sent: %u", > + conn->seq_ack_from_tap, conn->seq_to_tap); > + conn->seq_to_tap = conn->seq_ack_from_tap; > + already_sent = 0; > + } > + > + if (!wnd_scaled || already_sent >= wnd_scaled) { > + conn_flag(c, conn, STALLED); > + conn_flag(c, conn, ACK_FROM_TAP_DUE); > + return 0; > + } > + > + /* Set up buffer descriptors we'll fill completely and partially. */ > + > + fillsize = wnd_scaled; > + > + if (peek_offset_cap) > + already_sent = 0; > + > + iov_vu[0].iov_base = tcp_buf_discard; > + iov_vu[0].iov_len = already_sent; > + fillsize -= already_sent; I think this line needs to go before the peek_offset_cap check. If we have PEEK_OFFSET, we do reduce the amount we read(peek) from the socket, but that unacknowledged data still needs to count against the amount of the window we have left. > + /* collect the buffers from vhost-user and fill them with the > + * data from the socket > + */ > + iov_cnt = tcp_vu_sock_recv(c, conn, v4, fillsize, &len); > + if (iov_cnt <= 0) > + return iov_cnt; > + > + len -= already_sent; > + if (len <= 0) { > + conn_flag(c, conn, STALLED); > + vu_queue_rewind(vq, iov_cnt); > + return 0; > + } > + > + conn_flag(c, conn, ~STALLED); > + > + /* Likely, some new data was acked too. 
*/ > + tcp_update_seqack_wnd(c, conn, 0, NULL); > + > + /* initialize headers */ > + l2_hdrlen = tcp_vu_l2_hdrlen(!v4); > + iov_used = 0; > + num_buffers = 0; > + check = NULL; > + segment_size = 0; > + > + /* iov_vu is an array of buffers and the buffer size can be > + * smaller than the segment size we want to use but with > + * num_buffer we can merge several virtio iov buffers in one packet > + * we need only to set the packet headers in the first iov and > + * num_buffer to the number of iov entries > + */ > + for (i = 0; i < iov_cnt && len; i++) { > + > + if (segment_size == 0) > + first = &iov_vu[i + 1]; > + > + if (iov_vu[i + 1].iov_len > (size_t)len) > + iov_vu[i + 1].iov_len = len; > + > + len -= iov_vu[i + 1].iov_len; > + iov_used++; > + > + segment_size += iov_vu[i + 1].iov_len; > + num_buffers++; > + > + if (segment_size >= mss || len == 0 || > + i + 1 == iov_cnt || !vu_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF)) { > + struct virtio_net_hdr_mrg_rxbuf *vh; > + size_t l4len; > + > + if (i + 1 == iov_cnt) > + check = NULL; > + > + /* restore first iovec base: point to vnet header */ > + first->iov_base = (char *)first->iov_base - l2_hdrlen; > + first->iov_len = first->iov_len + l2_hdrlen; > + > + vh = first->iov_base; > + > + vh->hdr = VU_HEADER; > + if (vu_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF)) > + vh->num_buffers = htole16(num_buffers); > + > + l4len = tcp_vu_prepare(c, conn, first, segment_size, &check); > + > + tcp_vu_pcap(c, tapside, first, num_buffers, l4len); > + > + conn->seq_to_tap += segment_size; > + > + segment_size = 0; > + num_buffers = 0; > + } > + } > + > + /* release unused buffers */ > + vu_queue_rewind(vq, iov_cnt - iov_used); > + > + /* send packets */ > + vu_send_frame(vdev, vq, elem, &iov_vu[1], iov_used); I think that would be better called vu_send_frames(), since it can send multiple frames IIUC (and that matches tap_send_frames()). 
> + > + conn_flag(c, conn, ACK_FROM_TAP_DUE); > + > + return 0; > +} > diff --git a/tcp_vu.h b/tcp_vu.h > new file mode 100644 > index 000000000000..b433c3e0d06f > --- /dev/null > +++ b/tcp_vu.h > @@ -0,0 +1,12 @@ > +// SPDX-License-Identifier: GPL-2.0-or-later > +/* Copyright Red Hat > + * Author: Laurent Vivier > + */ > + > +#ifndef TCP_VU_H > +#define TCP_VU_H > + > +int tcp_vu_send_flag(struct ctx *c, struct tcp_tap_conn *conn, int flags); > +int tcp_vu_data_from_sock(struct ctx *c, struct tcp_tap_conn *conn); > + > +#endif /*TCP_VU_H */ > diff --git a/udp.c b/udp.c > index 2ba00c9c20a8..f7b5b5eb6421 100644 > --- a/udp.c > +++ b/udp.c > @@ -109,8 +109,7 @@ > #include "pcap.h" > #include "log.h" > #include "flow_table.h" > - > -#define UDP_MAX_FRAMES 32 /* max # of frames to receive at once */ > +#include "udp_internal.h" > > /* "Spliced" sockets indexed by bound port (host order) */ > static int udp_splice_ns [IP_VERSIONS][NUM_PORTS]; > @@ -118,20 +117,8 @@ static int udp_splice_init[IP_VERSIONS][NUM_PORTS]; > > /* Static buffers */ > > -/** > - * struct udp_payload_t - UDP header and data for inbound messages > - * @uh: UDP header > - * @data: UDP data > - */ > -static struct udp_payload_t { > - struct udphdr uh; > - char data[USHRT_MAX - sizeof(struct udphdr)]; > -#ifdef __AVX2__ > -} __attribute__ ((packed, aligned(32))) > -#else > -} __attribute__ ((packed, aligned(__alignof__(unsigned int)))) > -#endif > -udp_payload[UDP_MAX_FRAMES]; > +/* UDP header and data for inbound messages */ > +static struct udp_payload_t udp_payload[UDP_MAX_FRAMES]; > > /* Ethernet header for IPv4 frames */ > static struct ethhdr udp4_eth_hdr; > @@ -298,11 +285,13 @@ static void udp_splice_send(const struct ctx *c, size_t start, size_t n, > * @bp: Pointer to udp_payload_t to update > * @toside: Flowside for destination side > * @dlen: Length of UDP payload > + * @no_udp_csum: Do not set UPD checksum s/UPD/UDP/ > * > * Return: size of IPv4 payload (UDP header + data) > */ > -static 
size_t udp_update_hdr4(struct iphdr *ip4h, struct udp_payload_t *bp, > - const struct flowside *toside, size_t dlen) > +size_t udp_update_hdr4(struct iphdr *ip4h, struct udp_payload_t *bp, > + const struct flowside *toside, size_t dlen, > + bool no_udp_csum) > { > const struct in_addr *src = inany_v4(&toside->oaddr); > const struct in_addr *dst = inany_v4(&toside->eaddr); > @@ -319,7 +308,10 @@ static size_t udp_update_hdr4(struct iphdr *ip4h, struct udp_payload_t *bp, > bp->uh.source = htons(toside->oport); > bp->uh.dest = htons(toside->eport); > bp->uh.len = htons(l4len); > - csum_udp4(&bp->uh, *src, *dst, bp->data, dlen); > + if (no_udp_csum) > + bp->uh.check = 0; > + else > + csum_udp4(&bp->uh, *src, *dst, bp->data, dlen); As with TCP, I think splitting out the checksum suppression into a preliminary patch would make things easier to follow. > > return l4len; > } > @@ -330,11 +322,13 @@ static size_t udp_update_hdr4(struct iphdr *ip4h, struct udp_payload_t *bp, > * @bp: Pointer to udp_payload_t to update > * @toside: Flowside for destination side > * @dlen: Length of UDP payload > + * @no_udp_csum: Do not set UPD checksum UPD > * > * Return: size of IPv6 payload (UDP header + data) > */ > -static size_t udp_update_hdr6(struct ipv6hdr *ip6h, struct udp_payload_t *bp, > - const struct flowside *toside, size_t dlen) > +size_t udp_update_hdr6(struct ipv6hdr *ip6h, struct udp_payload_t *bp, > + const struct flowside *toside, size_t dlen, > + bool no_udp_csum) > { > uint16_t l4len = dlen + sizeof(bp->uh); > > @@ -348,7 +342,16 @@ static size_t udp_update_hdr6(struct ipv6hdr *ip6h, struct udp_payload_t *bp, > bp->uh.source = htons(toside->oport); > bp->uh.dest = htons(toside->eport); > bp->uh.len = ip6h->payload_len; > - csum_udp6(&bp->uh, &toside->oaddr.a6, &toside->eaddr.a6, bp->data, dlen); > + if (no_udp_csum) { > + /* O is an invalid checksum for UDP IPv6 and dropped by > + * the kernel stack, even if the checksum is disabled by virtio > + * flags. 
We need to put any non-zero value here. > + */ > + bp->uh.check = 0xffff; > + } else { > + csum_udp6(&bp->uh, &toside->oaddr.a6, &toside->eaddr.a6, > + bp->data, dlen); > + } > > return l4len; > } > @@ -358,9 +361,11 @@ static size_t udp_update_hdr6(struct ipv6hdr *ip6h, struct udp_payload_t *bp, > * @mmh: Receiving mmsghdr array > * @idx: Index of the datagram to prepare > * @toside: Flowside for destination side > + * @no_udp_csum: Do not set UPD checksum > */ > -static void udp_tap_prepare(const struct mmsghdr *mmh, unsigned idx, > - const struct flowside *toside) > +static void udp_tap_prepare(const struct mmsghdr *mmh, > + unsigned idx, const struct flowside *toside, > + bool no_udp_csum) > { > struct iovec (*tap_iov)[UDP_NUM_IOVS] = &udp_l2_iov[idx]; > struct udp_payload_t *bp = &udp_payload[idx]; > @@ -368,13 +373,15 @@ static void udp_tap_prepare(const struct mmsghdr *mmh, unsigned idx, > size_t l4len; > > if (!inany_v4(&toside->eaddr) || !inany_v4(&toside->oaddr)) { > - l4len = udp_update_hdr6(&bm->ip6h, bp, toside, mmh[idx].msg_len); > + l4len = udp_update_hdr6(&bm->ip6h, bp, toside, > + mmh[idx].msg_len, no_udp_csum); > tap_hdr_update(&bm->taph, l4len + sizeof(bm->ip6h) + > sizeof(udp6_eth_hdr)); > (*tap_iov)[UDP_IOV_ETH] = IOV_OF_LVALUE(udp6_eth_hdr); > (*tap_iov)[UDP_IOV_IP] = IOV_OF_LVALUE(bm->ip6h); > } else { > - l4len = udp_update_hdr4(&bm->ip4h, bp, toside, mmh[idx].msg_len); > + l4len = udp_update_hdr4(&bm->ip4h, bp, toside, > + mmh[idx].msg_len, no_udp_csum); > tap_hdr_update(&bm->taph, l4len + sizeof(bm->ip4h) + > sizeof(udp4_eth_hdr)); > (*tap_iov)[UDP_IOV_ETH] = IOV_OF_LVALUE(udp4_eth_hdr); > @@ -447,7 +454,7 @@ static int udp_sock_recverr(int s) > * > * Return: Number of errors handled, or < 0 if we have an unrecoverable error > */ > -static int udp_sock_errs(const struct ctx *c, int s, uint32_t events) > +int udp_sock_errs(const struct ctx *c, int s, uint32_t events) > { > unsigned n_err = 0; > socklen_t errlen; > @@ -524,7 +531,7 @@ static 
int udp_sock_recv(const struct ctx *c, int s, uint32_t events, > } > > /** > - * udp_listen_sock_handler() - Handle new data from socket > + * udp_buf_listen_sock_handler() - Handle new data from socket > * @c: Execution context > * @ref: epoll reference > * @events: epoll events bitmap > @@ -532,8 +539,8 @@ static int udp_sock_recv(const struct ctx *c, int s, uint32_t events, > * > * #syscalls recvmmsg > */ > -void udp_listen_sock_handler(const struct ctx *c, union epoll_ref ref, > - uint32_t events, const struct timespec *now) > +void udp_buf_listen_sock_handler(const struct ctx *c, union epoll_ref ref, > + uint32_t events, const struct timespec *now) > { > const socklen_t sasize = sizeof(udp_meta[0].s_in); > int n, i; > @@ -565,7 +572,8 @@ void udp_listen_sock_handler(const struct ctx *c, union epoll_ref ref, > udp_splice_prepare(udp_mh_recv, i); > } else if (batchpif == PIF_TAP) { > udp_tap_prepare(udp_mh_recv, i, > - flowside_at_sidx(batchsidx)); > + flowside_at_sidx(batchsidx), > + false); > } > > if (++i >= n) > @@ -599,7 +607,7 @@ void udp_listen_sock_handler(const struct ctx *c, union epoll_ref ref, > } > > /** > - * udp_reply_sock_handler() - Handle new data from flow specific socket > + * udp_buf_reply_sock_handler() - Handle new data from flow specific socket > * @c: Execution context > * @ref: epoll reference > * @events: epoll events bitmap > @@ -607,8 +615,8 @@ void udp_listen_sock_handler(const struct ctx *c, union epoll_ref ref, > * > * #syscalls recvmmsg > */ > -void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref, > - uint32_t events, const struct timespec *now) > +void udp_buf_reply_sock_handler(const struct ctx *c, union epoll_ref ref, > + uint32_t events, const struct timespec *now) > { > flow_sidx_t tosidx = flow_sidx_opposite(ref.flowside); > const struct flowside *toside = flowside_at_sidx(tosidx); > @@ -636,7 +644,7 @@ void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref, > if (pif_is_socket(topif)) > 
udp_splice_prepare(udp_mh_recv, i); > else if (topif == PIF_TAP) > - udp_tap_prepare(udp_mh_recv, i, toside); > + udp_tap_prepare(udp_mh_recv, i, toside, false); > /* Restore sockaddr length clobbered by recvmsg() */ > udp_mh_recv[i].msg_hdr.msg_namelen = sizeof(udp_meta[i].s_in); > } > diff --git a/udp.h b/udp.h > index a8e76bfe8f37..ea23fb36b637 100644 > --- a/udp.h > +++ b/udp.h > @@ -9,10 +9,10 @@ > #define UDP_TIMER_INTERVAL 1000 /* ms */ > > void udp_portmap_clear(void); > -void udp_listen_sock_handler(const struct ctx *c, union epoll_ref ref, > - uint32_t events, const struct timespec *now); > -void udp_reply_sock_handler(const struct ctx *c, union epoll_ref ref, > - uint32_t events, const struct timespec *now); > +void udp_buf_listen_sock_handler(const struct ctx *c, union epoll_ref ref, > + uint32_t events, const struct timespec *now); > +void udp_buf_reply_sock_handler(const struct ctx *c, union epoll_ref ref, > + uint32_t events, const struct timespec *now); > int udp_tap_handler(const struct ctx *c, uint8_t pif, > sa_family_t af, const void *saddr, const void *daddr, > const struct pool *p, int idx, const struct timespec *now); > diff --git a/udp_internal.h b/udp_internal.h > new file mode 100644 > index 000000000000..cc80e3055423 > --- /dev/null > +++ b/udp_internal.h > @@ -0,0 +1,34 @@ > +/* SPDX-License-Identifier: GPL-2.0-or-later > + * Copyright (c) 2021 Red Hat GmbH > + * Author: Stefano Brivio > + */ > + > +#ifndef UDP_INTERNAL_H > +#define UDP_INTERNAL_H > + > +#include "tap.h" /* needed by udp_meta_t */ > + > +#define UDP_MAX_FRAMES 32 /* max # of frames to receive at once */ > + > +/** > + * struct udp_payload_t - UDP header and data for inbound messages > + * @uh: UDP header > + * @data: UDP data > + */ > +struct udp_payload_t { > + struct udphdr uh; > + char data[USHRT_MAX - sizeof(struct udphdr)]; > +#ifdef __AVX2__ > +} __attribute__ ((packed, aligned(32))); > +#else > +} __attribute__ ((packed, aligned(__alignof__(unsigned int)))); > 
+#endif > + > +size_t udp_update_hdr4(struct iphdr *ip4h, struct udp_payload_t *bp, > + const struct flowside *toside, size_t dlen, > + bool no_udp_csum); > +size_t udp_update_hdr6(struct ipv6hdr *ip6h, struct udp_payload_t *bp, > + const struct flowside *toside, size_t dlen, > + bool no_udp_csum); > +int udp_sock_errs(const struct ctx *c, int s, uint32_t events); > +#endif /* UDP_INTERNAL_H */ > diff --git a/udp_vu.c b/udp_vu.c > new file mode 100644 > index 000000000000..fa390dec994a > --- /dev/null > +++ b/udp_vu.c > @@ -0,0 +1,397 @@ > +// SPDX-License-Identifier: GPL-2.0-or-later > +/* udp_vu.c - UDP L2 vhost-user management functions > + * > + * Copyright Red Hat > + * Author: Laurent Vivier > + */ > + > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > + > +#include "checksum.h" > +#include "util.h" > +#include "ip.h" > +#include "siphash.h" > +#include "inany.h" > +#include "passt.h" > +#include "pcap.h" > +#include "log.h" > +#include "vhost_user.h" > +#include "udp_internal.h" > +#include "flow.h" > +#include "flow_table.h" > +#include "udp_flow.h" > +#include "udp_vu.h" > +#include "vu_common.h" > + > +static struct iovec iov_vu [VIRTQUEUE_MAX_SIZE]; > +static struct vu_virtq_element elem [VIRTQUEUE_MAX_SIZE]; > +static struct iovec in_sg[VIRTQUEUE_MAX_SIZE]; > +static int in_sg_count; > + > +/** > + * udp_vu_l2_hdrlen() - return the size of the header in level 2 frame (UDP) > + * @v6: Set for IPv6 packet > + * > + * Return: Return the size of the header > + */ > +static size_t udp_vu_l2_hdrlen(bool v6) > +{ > + size_t l2_hdrlen; > + > + l2_hdrlen = sizeof(struct virtio_net_hdr_mrg_rxbuf) + sizeof(struct ethhdr) + > + sizeof(struct udphdr); > + > + if (v6) > + l2_hdrlen += sizeof(struct ipv6hdr); > + else > + l2_hdrlen += sizeof(struct iphdr); > + > + return l2_hdrlen; > +} > + > +static int udp_vu_sock_init(int s, union sockaddr_inany *s_in) > +{ > + struct msghdr msg = { 
> + .msg_name = s_in, > + .msg_namelen = sizeof(union sockaddr_inany), > + }; > + > + return recvmsg(s, &msg, MSG_PEEK | MSG_DONTWAIT); > +} > + > +/** > + * udp_vu_sock_recv() - Receive datagrams from socket into vhost-user buffers > + * @c: Execution context > + * @s: Socket to receive from > + * @events: epoll events bitmap > + * @v6: Set for IPv6 connections > + * @datalen: Size of received data (output) > + * > + * Return: Number of iov entries used to store the datagram > + */ > +static int udp_vu_sock_recv(const struct ctx *c, int s, uint32_t events, > + bool v6, ssize_t *data_len) > +{ > + struct vu_dev *vdev = c->vdev; > + struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE]; > + int virtqueue_max, iov_cnt, idx, iov_used; > + size_t fillsize, size, off, l2_hdrlen; > + struct virtio_net_hdr_mrg_rxbuf *vh; > + struct msghdr msg = { 0 }; > + char *base; > + > + ASSERT(!c->no_udp); > + > + if (!(events & EPOLLIN)) > + return 0; > + > + /* compute L2 header length */ > + > + if (vu_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF)) > + virtqueue_max = VIRTQUEUE_MAX_SIZE; > + else > + virtqueue_max = 1; > + > + l2_hdrlen = udp_vu_l2_hdrlen(v6); > + > + fillsize = USHRT_MAX; > + iov_cnt = 0; > + in_sg_count = 0; > + while (fillsize && iov_cnt < virtqueue_max && > + in_sg_count < ARRAY_SIZE(in_sg)) { > + int ret; > + > + elem[iov_cnt].out_num = 0; > + elem[iov_cnt].out_sg = NULL; > + elem[iov_cnt].in_num = ARRAY_SIZE(in_sg) - in_sg_count; > + elem[iov_cnt].in_sg = &in_sg[in_sg_count]; > + ret = vu_queue_pop(vdev, vq, &elem[iov_cnt]); > + if (ret < 0) > + break; > + in_sg_count += elem[iov_cnt].in_num; > + > + if (elem[iov_cnt].in_num < 1) { > + err("virtio-net receive queue contains no in buffers"); > + vu_queue_rewind(vq, iov_cnt); > + return 0; > + } > + ASSERT(elem[iov_cnt].in_num == 1); > + ASSERT(elem[iov_cnt].in_sg[0].iov_len >= l2_hdrlen); > + > + if (iov_cnt == 0) { > + base = elem[iov_cnt].in_sg[0].iov_base; > + size = elem[iov_cnt].in_sg[0].iov_len; > + > + /* 
keep space for the headers */ > + iov_vu[0].iov_base = base + l2_hdrlen; > + iov_vu[0].iov_len = size - l2_hdrlen; > + } else { > + iov_vu[iov_cnt].iov_base = elem[iov_cnt].in_sg[0].iov_base; > + iov_vu[iov_cnt].iov_len = elem[iov_cnt].in_sg[0].iov_len; > + } > + > + if (iov_vu[iov_cnt].iov_len > fillsize) > + iov_vu[iov_cnt].iov_len = fillsize; > + > + fillsize -= iov_vu[iov_cnt].iov_len; > + > + iov_cnt++; > + } > + if (iov_cnt == 0) > + return 0; > + > + msg.msg_iov = iov_vu; > + msg.msg_iovlen = iov_cnt; > + > + *data_len = recvmsg(s, &msg, 0); > + if (*data_len < 0) { > + vu_queue_rewind(vq, iov_cnt); > + return 0; > + } > + > + /* restore original values */ > + iov_vu[0].iov_base = base; > + iov_vu[0].iov_len = size; > + > + /* count the numbers of buffer filled by recvmsg() */ > + idx = iov_skip_bytes(iov_vu, iov_cnt, l2_hdrlen + *data_len, > + &off); > + /* adjust last iov length */ > + if (idx < iov_cnt) > + iov_vu[idx].iov_len = off; > + iov_used = idx + !!off; > + > + /* release unused buffers */ > + vu_queue_rewind(vq, iov_cnt - iov_used); > + > + vh = (struct virtio_net_hdr_mrg_rxbuf *)base; > + vh->hdr = VU_HEADER; > + if (vu_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF)) > + vh->num_buffers = htole16(iov_used); IIUC in the !VIRTIO_NET_F_MRG_RXBUF case, we need the guest supplied buffers to be big enough to hold an entire datagram + headers. We should probably have a warning somewhere above if that's not the case, yes? And we need to make sure we drop the packet in that case, not truncate it. 
> + return iov_used; > +} > + > +/** > + * udp_vu_prepare() - Prepare the packet header > + * @c: Execution context > + * @toside: Address information for one side of the flow > + * @datalen: Packet data length > + * > + * Return:i Level-4 length > + */ > +static size_t udp_vu_prepare(const struct ctx *c, > + const struct flowside *toside, ssize_t data_len) > +{ > + struct ethhdr *eh; > + size_t l4len; > + > + /* ethernet header */ > + eh = vu_eth(iov_vu[0].iov_base); > + > + memcpy(eh->h_dest, c->guest_mac, sizeof(eh->h_dest)); > + memcpy(eh->h_source, c->our_tap_mac, sizeof(eh->h_source)); > + > + /* initialize header */ > + if (inany_v4(&toside->eaddr) && inany_v4(&toside->oaddr)) { > + struct iphdr *iph = vu_ip(iov_vu[0].iov_base); > + struct udp_payload_t *bp = vu_payloadv4(iov_vu[0].iov_base); > + > + eh->h_proto = htons(ETH_P_IP); > + > + *iph = (struct iphdr)L2_BUF_IP4_INIT(IPPROTO_UDP); > + > + l4len = udp_update_hdr4(iph, bp, toside, data_len, true); > + } else { > + struct ipv6hdr *ip6h = vu_ip(iov_vu[0].iov_base); > + struct udp_payload_t *bp = vu_payloadv6(iov_vu[0].iov_base); > + > + eh->h_proto = htons(ETH_P_IPV6); > + > + *ip6h = (struct ipv6hdr)L2_BUF_IP6_INIT(IPPROTO_UDP); > + > + l4len = udp_update_hdr6(ip6h, bp, toside, data_len, true); > + } > + > + return l4len; > +} > + > +/** > + * udp_vu_pcap() - Capture a single frame to pcap file (UDP) > + * @c: Execution context > + * @toside: ddress information for one side of the flow > + * @l4len: IPv4 Payload length > + * @iov_used: Length of the array > + */ > +static void udp_vu_pcap(const struct ctx *c, const struct flowside *toside, > + size_t l4len, int iov_used) > +{ > + const struct in_addr *src4 = inany_v4(&toside->oaddr); > + const struct in_addr *dst4 = inany_v4(&toside->eaddr); > + char *base = iov_vu[0].iov_base; > + size_t size = iov_vu[0].iov_len; > + struct udp_payload_t *bp; > + uint32_t sum; > + > + if (!*c->pcap) > + return; > + > + if (src4 && dst4) { > + bp = vu_payloadv4(base); > 
+ sum = proto_ipv4_header_psum(l4len, IPPROTO_UDP, *src4, *dst4); > + } else { > + bp = vu_payloadv6(base); > + sum = proto_ipv6_header_psum(l4len, IPPROTO_UDP, > + &toside->oaddr.a6, > + &toside->eaddr.a6); > + bp->uh.check = 0; /* by default, set to 0xffff */ > + } > + > + iov_vu[0].iov_base = &bp->uh; > + iov_vu[0].iov_len = size - ((char *)iov_vu[0].iov_base - base); > + > + bp->uh.check = csum_iov(iov_vu, iov_used, sum); Similar comments here to the TCP case. > + /* set iov for pcap logging */ > + iov_vu[0].iov_base = base + sizeof(struct virtio_net_hdr_mrg_rxbuf); > + iov_vu[0].iov_len = size - sizeof(struct virtio_net_hdr_mrg_rxbuf); > + pcap_iov(iov_vu, iov_used); > + > + /* restore iov_vu[0] */ > + iov_vu[0].iov_base = base; > + iov_vu[0].iov_len = size; > +} > + > +/** > + * udp_vu_listen_sock_handler() - Handle new data from socket > + * @c: Execution context > + * @ref: epoll reference > + * @events: epoll events bitmap > + * @now: Current timestamp > + */ > +void udp_vu_listen_sock_handler(const struct ctx *c, union epoll_ref ref, > + uint32_t events, const struct timespec *now) > +{ > + struct vu_dev *vdev = c->vdev; > + struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE]; > + const struct flowside *toside; > + union sockaddr_inany s_in; > + flow_sidx_t batchsidx; > + uint8_t batchpif; > + bool v6; > + int i; > + > + if (udp_sock_errs(c, ref.fd, events) < 0) { > + err("UDP: Unrecoverable error on listening socket:" > + " (%s port %hu)", pif_name(ref.udp.pif), ref.udp.port); > + return; > + } > + > + if (udp_vu_sock_init(ref.fd, &s_in) < 0) > + return; Hrm, it would be nice if we could avoid this additional MSG_PEEK just to initialise the batch. In fact, I think this has to change somehow. In the loop below, you're assuming that everything belongs to the same flow, taken from this first packet. For a listening socket that might not be the case. You need to check the address on each datagram to see which flow it belongs to. 
> + batchsidx = udp_flow_from_sock(c, ref, &s_in, now); > + batchpif = pif_at_sidx(batchsidx); > + > + if (batchpif != PIF_TAP) { > + if (flow_sidx_valid(batchsidx)) { > + flow_sidx_t fromsidx = flow_sidx_opposite(batchsidx); > + struct udp_flow *uflow = udp_at_sidx(batchsidx); > + > + flow_err(uflow, > + "No support for forwarding UDP from %s to %s", > + pif_name(pif_at_sidx(fromsidx)), > + pif_name(batchpif)); > + } else { > + debug("Discarding 1 datagram without flow"); Ah.. except.. we haven't actually discarded the datagram. We've PEEKed it but never read it "for real". So we could start spinning on a flowless packet if we ever got one. > + } > + > + return; > + } > + > + toside = flowside_at_sidx(batchsidx); > + > + v6 = !(inany_v4(&toside->eaddr) && inany_v4(&toside->oaddr)); > + > + for (i = 0; i < UDP_MAX_FRAMES; i++) { > + ssize_t data_len; > + size_t l4len; > + int iov_used; > + > + iov_used = udp_vu_sock_recv(c, ref.fd, events, v6, &data_len); > + if (iov_used <= 0) > + return; Pity we have to go packet by packet rather than using recvmmsg(). Although.. while that's the case, there's probably not any point to the "batch" stuff - that's only there so we can consolidate multiple packets together; really only useful for qemu socket, where it really can become a single big sendmsg(). So ditching that, you should be able to also avoid the MSG_PEEK, just doing the flow calculation for each packet separately. Uh.. except.. no, because you need to know if it's v4 or v6 before you can allocate the buffers for the recvmsg(). Ouch.. I'm not sure how to deal with that. There might also be a way to allow a recvmmsg() in at least some cases: you could look ahead in the vu queue to see how many buffers you can grab that are large enough to hold a max-size UDP packet. You could then recvmmsg() that many datagrams, one into each buffer. Still has the v4/v6 problem though.
> + l4len = udp_vu_prepare(c, toside, data_len); > + udp_vu_pcap(c, toside, l4len, iov_used); > + vu_send_frame(vdev, vq, elem, iov_vu, iov_used); > + } > +} > + > +/** > + * udp_vu_reply_sock_handler() - Handle new data from flow specific socket > + * @c: Execution context > + * @ref: epoll reference > + * @events: epoll events bitmap > + * @now: Current timestamp > + */ > +void udp_vu_reply_sock_handler(const struct ctx *c, union epoll_ref ref, > + uint32_t events, const struct timespec *now) > +{ > + flow_sidx_t tosidx = flow_sidx_opposite(ref.flowside); > + const struct flowside *toside = flowside_at_sidx(tosidx); > + struct vu_dev *vdev = c->vdev; > + struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE]; > + struct udp_flow *uflow = udp_at_sidx(ref.flowside); > + int from_s = uflow->s[ref.flowside.sidei]; > + uint8_t topif = pif_at_sidx(tosidx); > + bool v6; > + int i; > + > + ASSERT(!c->no_udp); > + ASSERT(uflow); > + > + if (udp_sock_errs(c, from_s, events) < 0) { > + flow_err(uflow, "Unrecoverable error on reply socket"); > + flow_err_details(uflow); > + udp_flow_close(c, uflow); > + return; > + } > + > + if (topif != PIF_TAP) { > + uint8_t frompif = pif_at_sidx(ref.flowside); > + > + flow_err(uflow, > + "No support for forwarding UDP from %s to %s", > + pif_name(frompif), pif_name(topif)); > + return; > + } > + > + v6 = !(inany_v4(&toside->eaddr) && inany_v4(&toside->oaddr)); > + > + for (i = 0; i < UDP_MAX_FRAMES; i++) { > + ssize_t data_len; > + size_t l4len; > + int iov_used; > + > + iov_used = udp_vu_sock_recv(c, from_s, events, v6, &data_len); > + if (iov_used <= 0) > + return; There's a subtle difference between the "buf" and vu logic that might bite us here. In buf we read the datagrams first, then if we can't fit them into the tap device we just discard them. It's UDP, that's fine. VU, necessarily, tries to grab buffers from the tap side before it even reads the datagrams. If it can't we'll abort here. 
But that means the datagrams are still queued on the socket side, which could lead to rolling epoll events. I guess there's a good chance we'll just manage to send them on the next cycle, but I wonder if for the benefit of flow control we should explicitly discard them instead. > + flow_trace(uflow, "Received 1 datagram on reply socket"); > + uflow->ts = now->tv_sec; > + > + l4len = udp_vu_prepare(c, toside, data_len); > + udp_vu_pcap(c, toside, l4len, iov_used); > + vu_send_frame(vdev, vq, elem, iov_vu, iov_used); > + } > +} > diff --git a/udp_vu.h b/udp_vu.h > new file mode 100644 > index 000000000000..ba7018d3bf01 > --- /dev/null > +++ b/udp_vu.h > @@ -0,0 +1,13 @@ > +// SPDX-License-Identifier: GPL-2.0-or-later > +/* Copyright Red Hat > + * Author: Laurent Vivier > + */ > + > +#ifndef UDP_VU_H > +#define UDP_VU_H > + > +void udp_vu_listen_sock_handler(const struct ctx *c, union epoll_ref ref, > + uint32_t events, const struct timespec *now); > +void udp_vu_reply_sock_handler(const struct ctx *c, union epoll_ref ref, > + uint32_t events, const struct timespec *now); > +#endif /* UDP_VU_H */ > diff --git a/vhost_user.c b/vhost_user.c > index 3b38e06f268e..0f98ee7fa7c3 100644 > --- a/vhost_user.c > +++ b/vhost_user.c > @@ -52,7 +52,6 @@ > * this is part of the vhost-user backend > * convention. 
> */ > -/* cppcheck-suppress unusedFunction */ > void vu_print_capabilities(void) > { > info("{"); > @@ -162,9 +161,7 @@ static void vmsg_close_fds(const struct vhost_user_msg *vmsg) > */ > static void vu_remove_watch(const struct vu_dev *vdev, int fd) > { > - /* Placeholder to add passt related code */ > - (void)vdev; > - (void)fd; > + epoll_ctl(vdev->context->epollfd, EPOLL_CTL_DEL, fd, NULL); > } > > /** > @@ -425,7 +422,6 @@ static bool map_ring(struct vu_dev *vdev, struct vu_virtq *vq) > * > * Return: 0 if the zone is in a mapped memory region, -1 otherwise > */ > -/* cppcheck-suppress unusedFunction */ > int vu_packet_check_range(void *buf, size_t offset, size_t len, > const char *start) > { > @@ -515,6 +511,14 @@ static bool vu_set_mem_table_exec(struct vu_dev *vdev, > } > } > > + /* As vu_packet_check_range() has no access to the number of > + * memory regions, mark the end of the array with mmap_addr = 0 > + */ > + ASSERT(vdev->nregions < VHOST_USER_MAX_RAM_SLOTS - 1); > + vdev->regions[vdev->nregions].mmap_addr = 0; > + > + tap_sock_update_buf(vdev->regions, 0); > + > return false; > } > > @@ -643,9 +647,12 @@ static bool vu_get_vring_base_exec(struct vu_dev *vdev, > */ > static void vu_set_watch(const struct vu_dev *vdev, int fd) > { > - /* Placeholder to add passt related code */ > - (void)vdev; > - (void)fd; > + union epoll_ref ref = { .type = EPOLL_TYPE_VHOST_KICK, .fd = fd }; > + struct epoll_event ev = { 0 }; > + > + ev.data.u64 = ref.u64; > + ev.events = EPOLLIN; > + epoll_ctl(vdev->context->epollfd, EPOLL_CTL_ADD, fd, &ev); > } > > /** > @@ -685,7 +692,6 @@ static int vu_wait_queue(const struct vu_virtq *vq) > * > * Return: number of bytes sent, -1 if there is an error > */ > -/* cppcheck-suppress unusedFunction */ > int vu_send(struct vu_dev *vdev, const void *buf, size_t size) > { > struct vu_virtq *vq = &vdev->vq[VHOST_USER_RX_QUEUE]; > @@ -869,7 +875,6 @@ static void vu_handle_tx(struct vu_dev *vdev, int index, > * @ref: epoll reference 
information > * @now: Current timestamp > */ > -/* cppcheck-suppress unusedFunction */ > void vu_kick_cb(struct vu_dev *vdev, union epoll_ref ref, > const struct timespec *now) > { > @@ -1104,11 +1109,11 @@ static bool vu_set_vring_enable_exec(struct vu_dev *vdev, > * @c: execution context > * @vdev: vhost-user device > */ > -/* cppcheck-suppress unusedFunction */ > void vu_init(struct ctx *c, struct vu_dev *vdev) > { > int i; > > + c->vdev = vdev; > vdev->context = c; > for (i = 0; i < VHOST_USER_MAX_QUEUES; i++) { > vdev->vq[i] = (struct vu_virtq){ > @@ -1124,7 +1129,6 @@ void vu_init(struct ctx *c, struct vu_dev *vdev) > * vu_cleanup() - Reset vhost-user device > * @vdev: vhost-user device > */ > -/* cppcheck-suppress unusedFunction */ > void vu_cleanup(struct vu_dev *vdev) > { > unsigned int i; > @@ -1171,8 +1175,7 @@ void vu_cleanup(struct vu_dev *vdev) > */ > static void vu_sock_reset(struct vu_dev *vdev) > { > - /* Placeholder to add passt related code */ > - (void)vdev; > + tap_sock_reset(vdev->context); > } > > static bool (*vu_handle[VHOST_USER_MAX])(struct vu_dev *vdev, > @@ -1200,7 +1203,6 @@ static bool (*vu_handle[VHOST_USER_MAX])(struct vu_dev *vdev, > * @fd: vhost-user message socket > * @events: epoll events > */ > -/* cppcheck-suppress unusedFunction */ > void vu_control_handler(struct vu_dev *vdev, int fd, uint32_t events) > { > struct vhost_user_msg msg = { 0 }; > diff --git a/virtio.c b/virtio.c > index 237395396606..31e56def2c23 100644 > --- a/virtio.c > +++ b/virtio.c > @@ -562,7 +562,6 @@ void vu_queue_unpop(struct vu_virtq *vq) > * @vq: Virtqueue > * @num: Number of element to unpop > */ > -/* cppcheck-suppress unusedFunction */ > bool vu_queue_rewind(struct vu_virtq *vq, unsigned int num) > { > if (num > vq->inuse) > diff --git a/vu_common.c b/vu_common.c > new file mode 100644 > index 000000000000..7a9caae17f42 > --- /dev/null > +++ b/vu_common.c > @@ -0,0 +1,36 @@ > +// SPDX-License-Identifier: GPL-2.0-or-later > +/* Copyright Red Hat > 
+ * Author: Laurent Vivier > + * > + * common_vu.c - vhost-user common UDP and TCP functions > + */ > + > +#include > +#include > +#include > + > +#include "util.h" > +#include "passt.h" > +#include "vhost_user.h" > +#include "vu_common.h" > + > +/** > + * vu_send_frame() - Send one frame to the vhost-user interface Is it necessarily one frame? I thought it could be multiple, depending on what num_bufs values are set in the buffers. > + * @vdev: vhost-user device > + * @vq: vhost-user virtqueue > + * @elem: virtqueue element array to send back to the virqueue > + * @iov_vu: iovec array containing the data to send > + * @iov_used: Length of the array > + */ > +void vu_send_frame(const struct vu_dev *vdev, struct vu_virtq *vq, > + struct vu_virtq_element *elem, const struct iovec *iov_vu, > + int iov_used) > +{ > + int i; > + > + for (i = 0; i < iov_used; i++) > + vu_queue_fill(vq, &elem[i], iov_vu[i].iov_len, i); > + > + vu_queue_flush(vq, iov_used); > + vu_queue_notify(vdev, vq); > +} > diff --git a/vu_common.h b/vu_common.h > new file mode 100644 > index 000000000000..20950b44493c > --- /dev/null > +++ b/vu_common.h > @@ -0,0 +1,34 @@ > +/* SPDX-License-Identifier: GPL-2.0-or-later > + * Copyright Red Hat > + * Author: Laurent Vivier > + * > + * vhost-user common UDP and TCP functions > + */ > + > +#ifndef VU_COMMON_H > +#define VU_COMMON_H > + > +static inline void *vu_eth(void *base) > +{ > + return ((char *)base + sizeof(struct virtio_net_hdr_mrg_rxbuf)); > +} > + > +static inline void *vu_ip(void *base) > +{ > + return (struct ethhdr *)vu_eth(base) + 1; > +} > + > +static inline void *vu_payloadv4(void *base) > +{ > + return (struct iphdr *)vu_ip(base) + 1; > +} > + > +static inline void *vu_payloadv6(void *base) > +{ > + return (struct ipv6hdr *)vu_ip(base) + 1; > +} > + > +void vu_send_frame(const struct vu_dev *vdev, struct vu_virtq *vq, > + struct vu_virtq_element *elem, const struct iovec *iov_vu, > + int iov_used); > +#endif /* VU_COMMON_H */ -- David 
Gibson (he or they) | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you, not the other way | around. http://www.ozlabs.org/~dgibson