Hi,

I compared the perf results of this patch against those of a patch changing tap_send_frames_passt() to:

static size_t tap_send_frames_passt(const struct ctx *c, const struct iovec *iov, size_t bufs_per_frame, size_t nframes)
{
	struct msghdr mh = {
		.msg_iovlen = bufs_per_frame,
	};
	size_t buf_offset;
	unsigned int i;
	ssize_t sent;

	for (i = 0; i < nframes; i++) {
		unsigned int j;

		if (bufs_per_frame > 1) {
			/* if we have more than 1 iovec, the first one is vnet_len */
			uint32_t *p = iov[i * bufs_per_frame].iov_base;
			uint32_t vnet_len = 0;

			for (j = 1; j < bufs_per_frame; j++)
				vnet_len += iov[i * bufs_per_frame + j].iov_len;
			vnet_len = htonl(vnet_len);
			*p = vnet_len;
		}

		mh.msg_iov = (void *)&iov[i * bufs_per_frame];

		sent = sendmsg(c->fd_tap, &mh, MSG_NOSIGNAL | MSG_DONTWAIT);
		if (sent < 0)
			return i;

		/* Check for any partial frames due to short send */
		j = iov_skip_bytes(&iov[i * bufs_per_frame], bufs_per_frame, sent, &buf_offset);

		if (buf_offset && j < bufs_per_frame) {
			if (write_remainder(c->fd_tap, &iov[i * bufs_per_frame + j], bufs_per_frame - j, buf_offset) < 0) {
				err("tap: partial frame send: %s", strerror(errno));
				return i;
			}
		}
	}

	return i;
}

The output of 'perf record -e cache-misses' is:

slow:

  83.95%  passt.avx2  passt.avx2  [.] csum_avx2
   4.39%  passt.avx2  passt.avx2  [.] tap4_handler
   2.37%  passt.avx2  libc.so.6   [.] __printf_buffer
   0.84%  passt.avx2  passt.avx2  [.] udp_timer

fast:

  22.15%  passt.avx2  passt.avx2  [.] csum_avx2
  14.91%  passt.avx2  passt.avx2  [.] udp_timer
   7.60%  passt.avx2  libc.so.6   [.] __printf_buffer
   5.10%  passt.avx2  passt.avx2  [.] ffsl

Thanks,
Laurent