public inbox for passt-dev@passt.top
 help / color / mirror / code / Atom feed
From: Eugenio Perez Martin <eperezma@redhat.com>
To: David Gibson <david@gibson.dropbear.id.au>
Cc: passt-dev@passt.top, jasowang@redhat.com
Subject: Re: [RFC v2 10/11] tap: add poll(2) to used_idx
Date: Mon, 28 Jul 2025 19:03:12 +0200	[thread overview]
Message-ID: <CAJaqyWdgJgDxkiGsGJjt6jbOxTLqQR=RPy+AgvywdRKuYS=djg@mail.gmail.com> (raw)
In-Reply-To: <aIGKbH3mzWTF6hDF@zatzit>

On Thu, Jul 24, 2025 at 3:21 AM David Gibson
<david@gibson.dropbear.id.au> wrote:
>
> On Wed, Jul 09, 2025 at 07:47:47PM +0200, Eugenio Pérez wrote:
> > From ~13Gbit/s to ~11.5Gbit/s.
>
> Again, I really don't know what you're comparing to what here.
>

When the buffer is full I'm using poll() to wait until vhost free some
buffers, instead of actively checking the used index. This is the cost
of the syscall.

We could mitigate some of this by making the tx queue larger though.

> >
> > TODO: Maybe we can reuse epoll for this, not needing to introduce a new
> > syscall.
>
> I really hope so.
>
> >
> > Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
> > ---
> >  tap.c     | 59 +++++++++++++++++++++++++++++++++++++++++++++----------
> >  tap.h     |  2 +-
> >  tcp_buf.c |  6 +++---
> >  3 files changed, 53 insertions(+), 14 deletions(-)
> >
> > diff --git a/tap.c b/tap.c
> > index 55357e3..93a8c12 100644
> > --- a/tap.c
> > +++ b/tap.c
> > @@ -19,6 +19,7 @@
> >  #include <stdio.h>
> >  #include <errno.h>
> >  #include <limits.h>
> > +#include <poll.h>
> >  #include <string.h>
> >  #include <net/ethernet.h>
> >  #include <net/if.h>
> > @@ -120,7 +121,7 @@ static PACKET_POOL_NOINIT(pool_tap6, TAP_MSGS_IP6, pkt_buf);
> >  #define VHOST_NDESCS (PKT_BUF_BYTES / 65520)
> >  static_assert(!(VHOST_NDESCS & (VHOST_NDESCS - 1)),
> >                        "Number of vhost descs must be a power of two by standard");
> > -static struct {
> > +static struct vhost_virtqueue {
> >       /* Descriptor index we're using. This is not the same as avail idx in
> >        * split: this takes into account the chained descs */
> >       uint16_t vring_idx;
> > @@ -472,26 +473,63 @@ static void vhost_kick(struct vring_used *used, int kick_fd) {
> >               eventfd_write(kick_fd, 1);
> >  }
> >
> > +/*
> > + * #syscalls:pasta read poll
> > + */
> > +static uint16_t used_idx(struct vhost_virtqueue *vq,
> > +                      struct vring_avail *avail,
> > +                      const struct vring_used *used, int pollfd)
> > +{
> > +     struct pollfd fds = { .fd = pollfd, .events = POLLIN };
> > +     int r;
> > +
> > +     if (vq->shadow_used_idx == vq->last_used_idx)
> > +             vq->shadow_used_idx = le16toh(used->idx);
> > +
> > +     if (vq->shadow_used_idx != vq->last_used_idx || pollfd < 0)
> > +             return vq->shadow_used_idx;
> > +
> > +     avail->flags &= ~htole16(1ULL << VRING_AVAIL_F_NO_INTERRUPT);
> > +     /* trusting syscall for smp_wb() */
> > +     r = read(pollfd, (uint64_t[]){0}, sizeof(uint64_t));
> > +     assert((r < 0 && errno == EAGAIN) || r == 8);
> > +
> > +     /* Another oportunity before syscalls */
> > +     vq->shadow_used_idx = le16toh(used->idx);
> > +     if (vq->shadow_used_idx != vq->last_used_idx) {
> > +             return vqs->shadow_used_idx;
> > +     }
> > +
> > +     r = poll(&fds, 1, -1);
> > +     assert (0 < r);
> > +     avail->flags |= htole16(1ULL << VRING_AVAIL_F_NO_INTERRUPT);
> > +     vq->shadow_used_idx = le16toh(used->idx);
> > +     return vq->shadow_used_idx;
>
> I don't understand what this is accomplishing.  It seems like it's
> blocking waiting for a buffer to be freed, which seems like exactly
> what we don't want to do.
>
> > +}
> > +
> >  /* n = target */
> > -void tap_free_old_xmit(size_t n)
> > +size_t tap_free_old_xmit(const struct ctx *c, size_t n)
> >  {
> >       size_t r = 0;
> > +     int pollfd = (n == (size_t)-1) ? -1 : c->vq[1].call_fd;
> >
> >       while (r < n) {
> > -             uint16_t used_idx = vqs[1].last_used_idx;
> > -             if (vqs[1].shadow_used_idx == used_idx) {
> > -                    vqs[1].shadow_used_idx = le16toh(*(volatile uint16_t*)&vring_used_1.used.idx);
> > -
> > -                    if (vqs[1].shadow_used_idx == used_idx)
> > -                            continue;
> > +             uint16_t last_used = vqs[1].last_used_idx;
> > +             if (used_idx(&vqs[1], &vring_avail_1.avail, &vring_used_1.used, pollfd) == last_used) {
> > +                     assert(pollfd == -1);
> > +                     return r;
> >               }
> >
> > +             /* Only get used array entries after they have been exposed by vhost. */
> > +             smp_rmb();
> >               /* assert in-order */
> > -             assert(vring_used_1.used.ring[used_idx % VHOST_NDESCS].id == vring_avail_1.avail.ring[used_idx % VHOST_NDESCS]);
> > -             vqs[1].num_free += vqs[1].ndescs[used_idx % VHOST_NDESCS];
> > +             assert(vring_used_1.used.ring[last_used % VHOST_NDESCS].id == vring_avail_1.avail.ring[last_used % VHOST_NDESCS]);
> > +             vqs[1].num_free += vqs[1].ndescs[last_used % VHOST_NDESCS];
> >               vqs[1].last_used_idx++;
> >               r++;
> >       }
> > +
> > +     return r;
> >  }
> >
> >  /**
> > @@ -1687,6 +1725,7 @@ static int tap_ns_tun(void *arg)
> >                       if (rc < 0)
> >                               die_perror("Failed to add call eventfd to epoll");
> >               }
> > +             fprintf(stderr, "[eperezma %s:%d][i=%d][call_fd=%d]\n", __func__, __LINE__, i, file.fd);
> >               c->vq[i].call_fd = file.fd;
> >
> >               file.fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
> > diff --git a/tap.h b/tap.h
> > index 7ca0fb0..7004116 100644
> > --- a/tap.h
> > +++ b/tap.h
> > @@ -112,7 +112,7 @@ void tap_icmp6_send(const struct ctx *c,
> >                   const struct in6_addr *src, const struct in6_addr *dst,
> >                   const void *in, size_t l4len);
> >  void tap_send_single(const struct ctx *c, const void *data, size_t l2len, bool vhost);
> > -void tap_free_old_xmit(size_t n);
> > +size_t tap_free_old_xmit(const struct ctx *c, size_t n);
> >  size_t tap_send_frames(const struct ctx *c, const struct iovec *iov,
> >                      size_t bufs_per_frame, size_t nframes, bool vhost);
> >  void eth_update_mac(struct ethhdr *eh,
> > diff --git a/tcp_buf.c b/tcp_buf.c
> > index 0437120..f74d22d 100644
> > --- a/tcp_buf.c
> > +++ b/tcp_buf.c
> > @@ -137,10 +137,10 @@ static void tcp_revert_seq(const struct ctx *c, struct tcp_tap_conn **conns,
> >       }
> >  }
> >
> > -static void tcp_buf_free_old_tap_xmit(void)
> > +static void tcp_buf_free_old_tap_xmit(const struct ctx *c)
> >  {
> >       while (tcp_payload_tap_used) {
> > -             tap_free_old_xmit(tcp_payload_tap_used);
> > +             tap_free_old_xmit(c, tcp_payload_tap_used);
> >
> >               tcp_payload_tap_used = 0;
> >               tcp_payload_sock_used = 0;
> > @@ -162,7 +162,7 @@ void tcp_payload_flush(const struct ctx *c)
> >                              tcp_payload_sock_used - m);
> >       }
> >       tcp_payload_tap_used += m;
> > -     tcp_buf_free_old_tap_xmit();
> > +     tcp_buf_free_old_tap_xmit(c);
> >  }
> >
> >  /**
>
> --
> David Gibson (he or they)       | I'll have my music baroque, and my code
> david AT gibson.dropbear.id.au  | minimalist, thank you, not the other way
>                                 | around.
> http://www.ozlabs.org/~dgibson


  reply	other threads:[~2025-07-28 17:03 UTC|newest]

Thread overview: 42+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-07-09 17:47 [RFC v2 00/11] Add vhost-net kernel support Eugenio Pérez
2025-07-09 17:47 ` [RFC v2 01/11] tap: implement vhost_call_cb Eugenio Pérez
2025-07-23  6:56   ` David Gibson
2025-07-28 16:33     ` Eugenio Perez Martin
2025-07-29  0:11       ` David Gibson
2025-07-09 17:47 ` [RFC v2 02/11] tap: add die() on vhost error Eugenio Pérez
2025-07-23  6:58   ` David Gibson
2025-07-09 17:47 ` [RFC v2 03/11] tap: replace tx tap hdr with virtio_nethdr_mrg_rxbuf Eugenio Pérez
2025-07-24  0:17   ` David Gibson
2025-07-28 16:37     ` Eugenio Perez Martin
2025-07-09 17:47 ` [RFC v2 04/11] tcp: export memory regions to vhost Eugenio Pérez
2025-07-23  7:06   ` David Gibson
2025-07-28 16:41     ` Eugenio Perez Martin
2025-07-29  0:25       ` David Gibson
2025-07-09 17:47 ` [RFC v2 05/11] virtio: Fill .next in tx queue Eugenio Pérez
2025-07-23  7:07   ` David Gibson
2025-07-28 16:44     ` Eugenio Perez Martin
2025-07-09 17:47 ` [RFC v2 06/11] tap: move static iov_sock to tcp_buf_data_from_sock Eugenio Pérez
2025-07-23  7:09   ` David Gibson
2025-07-28 16:43     ` Eugenio Perez Martin
2025-07-29  0:28       ` David Gibson
2025-07-09 17:47 ` [RFC v2 07/11] tap: support tx through vhost Eugenio Pérez
2025-07-24  0:24   ` David Gibson
2025-07-24 14:30     ` Stefano Brivio
2025-07-25  0:23       ` David Gibson
2025-07-09 17:47 ` [RFC v2 08/11] tap: add tap_free_old_xmit Eugenio Pérez
2025-07-24  0:32   ` David Gibson
2025-07-28 16:45     ` Eugenio Perez Martin
2025-07-09 17:47 ` [RFC v2 09/11] tcp: start conversion to circular buffer Eugenio Pérez
2025-07-24  1:03   ` David Gibson
2025-07-28 16:55     ` Eugenio Perez Martin
2025-07-29  0:30       ` David Gibson
2025-07-09 17:47 ` [RFC v2 10/11] tap: add poll(2) to used_idx Eugenio Pérez
2025-07-24  1:20   ` David Gibson
2025-07-28 17:03     ` Eugenio Perez Martin [this message]
2025-07-29  0:32       ` David Gibson
2025-07-29  7:04         ` Eugenio Perez Martin
2025-07-30  0:32           ` David Gibson
2025-07-09 17:47 ` [RFC v2 11/11] tcp_buf: adding TCP tx circular buffer Eugenio Pérez
2025-07-24  1:33   ` David Gibson
2025-07-28 17:04     ` Eugenio Perez Martin
2025-07-10  9:46 ` [RFC v2 00/11] Add vhost-net kernel support Eugenio Perez Martin

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to='CAJaqyWdgJgDxkiGsGJjt6jbOxTLqQR=RPy+AgvywdRKuYS=djg@mail.gmail.com' \
    --to=eperezma@redhat.com \
    --cc=david@gibson.dropbear.id.au \
    --cc=jasowang@redhat.com \
    --cc=passt-dev@passt.top \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this public inbox

	https://passt.top/passt

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for IMAP folder(s).