From: jfiusdq <jfiusdq@proton.me>
To: "Cédric Le Goater" <clg@kaod.org>
Cc: Laurent Vivier <lvivier@redhat.com>, passt-dev@passt.top
Subject: Re: checksum: add VSX fast path for POWER8/POWER9
Date: Wed, 11 Feb 2026 11:55:22 +0000 [thread overview]
Message-ID: <tgTA0kvk7kcj3hnj-7Uf5x14xfl1-sMp4gaOoy8sLDRLpTssiRPmAlX2Z5wbcdduj_ezsmTMhlYcI4fbgQ1W35R4IBLko0VQF9Xu6M_A8S8=@proton.me> (raw)
In-Reply-To: <d48a8f3c-e5c1-4c82-97ed-caa05473d1a0@kaod.org>
[-- Attachment #1.1: Type: text/plain, Size: 8085 bytes --]
// bench_checksum.c
// Run from the source directory:
// gcc -O3 -D_GNU_SOURCE -mcpu=power9 -mvsx -maltivec \
// -I . -o /tmp/bench_vsx bench_checksum.c checksum.c
//
// gcc -O3 -D_GNU_SOURCE -mcpu=power9 -mno-vsx -mno-altivec -fno-tree-vectorize \
// -U__POWER8_VECTOR__ -U__POWER9_VECTOR__ \
// -I . -o /tmp/bench_scalar bench_checksum.c checksum.c
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <time.h>
#include "checksum.h"
/**
 * now_sec() - Read CLOCK_MONOTONIC as a floating-point number of seconds
 *
 * Return: whole seconds plus nanoseconds scaled to a fraction of a second
 */
static double now_sec(void)
{
	struct timespec now;
	double whole, frac;

	clock_gettime(CLOCK_MONOTONIC, &now);

	whole = (double)now.tv_sec;
	frac = (double)now.tv_nsec * 1e-9;

	return whole + frac;
}
/**
 * main() - Time csum_unfolded() over a range of buffer lengths
 *
 * For each length in sizes[], run a warm-up pass followed by a timed pass over
 * the same buffer, then print one CSV row: length, timed iteration count,
 * elapsed seconds, throughput in GB/s (10^9 bytes per second) and the value
 * csum_unfolded() returned for that length.
 *
 * Return: 0 on success, 1 if the test buffer cannot be allocated
 */
int main(void)
{
	static const size_t sizes[] = { 64, 256, 1500, 16 * 1024, 64 * 1024 };
	const size_t nsizes = sizeof(sizes) / sizeof(sizes[0]);
	const size_t max_len = sizes[nsizes - 1]; /* last entry is the largest */
	const size_t target_bytes = 512UL * 1024UL * 1024UL; /* 512 MiB per size */
	volatile uint32_t sink = 0; /* volatile: results must not be discarded */
	void *mem = NULL;
	uint8_t *buf;
	size_t i;
	int rc;

	/* Allocate through a real void * rather than casting &buf to void **.
	 * posix_memalign() reports failure through its return value and does
	 * not set errno, so perror() would print an unrelated message.
	 */
	rc = posix_memalign(&mem, 64, max_len);
	if (rc) {
		fprintf(stderr, "posix_memalign: %s\n", strerror(rc));
		return 1;
	}
	buf = mem;

	/* Deterministic, non-constant fill pattern */
	for (i = 0; i < max_len; i++)
		buf[i] = (uint8_t)(i * 131u + 7u);

	printf("len, iters, seconds, GBps, checksum\n");

	for (i = 0; i < nsizes; i++) {
		size_t len = sizes[i];
		size_t iters = target_bytes / len;
		size_t warm = iters / 10;
		double start, end, secs, gbps;
		uint32_t csum = 0;
		size_t j;

		/* Lower bounds so that large buffers still get enough rounds */
		if (iters < 10000)
			iters = 10000;
		if (warm < 1000)
			warm = 1000;

		for (j = 0; j < warm; j++)
			sink ^= csum_unfolded(buf, len, 0);

		start = now_sec();
		for (j = 0; j < iters; j++) {
			csum = csum_unfolded(buf, len, 0);
			sink ^= csum;
		}
		end = now_sec();

		secs = end - start;
		gbps = ((double)len * (double)iters) / (secs * 1e9);

		/* Print the last returned value, not the XOR accumulator:
		 * XOR-ing the same result an even number of times cancels out,
		 * so 'sink' says nothing about the checksum for this length.
		 */
		printf("%zu, %zu, %.6f, %.3f, 0x%08x\n", len, iters, secs, gbps,
		       (unsigned int)csum);
	}

	free(buf);
	return 0;
}
On Tuesday, February 10th, 2026 at 1:37 PM, Cédric Le Goater <clg@kaod.org> wrote:
> Hi,
>
> On 2/7/26 23:31, jfiusdq wrote:
> > Microbenchmark of the checksum function vs C version at different buffer sizes:
> >
> >
> > Results (GB/s, higher is better; speedup = VSX / scalar):
> >
> > 64B: VSX 4.61 vs scalar 5.91 -> 0.78x (VSX slower for tiny buffers)
> > 256B: VSX 10.91 vs scalar 7.57 -> 1.44x
> > 1500B: VSX 13.88 vs scalar 6.89 -> 2.02x
> > 16KB: VSX 14.53 vs scalar 6.96 -> 2.09x
> > 64KB: VSX 15.15 vs scalar 6.85 -> 2.21x
>
> Could you please share the microbenchmark?
>
> Thanks,
>
> C.
>
>
> > On Friday, February 6th, 2026 at 3:17 PM, Laurent Vivier <lvivier@redhat.com> wrote:
> >
> >>
> >
> >>
> >
> >> On Thu, 05 Feb 2026 06:14:40 +0000, jfiusdq <jfiusdq@proton.me> wrote:
> >>
> >
> >>> Tested with podman on Debian 13 for a while and works ok. It's
> >>> difficult to run all the tests on POWER but 505-networking-pasta.bats
> >>> test suite passes.
> >>> ---
> >>> checksum.c | 110 +++++++++++++++++++++++++++++++++++++++++++++++++++--
> >>> 1 file changed, 107 insertions(+), 3 deletions(-)
> >>>
> >
> >>> diff --git a/checksum.c b/checksum.c
> >>> index 0c3837c..828f9ec 100644
> >>> --- a/checksum.c
> >>> +++ b/checksum.c
> >>> @@ -281,7 +281,7 @@ void csum_icmp6(struct icmp6hdr *icmp6hr,
> >>> icmp6hr->icmp6_cksum = csum(payload, dlen, psum);
> >>> }
> >>>
> >
> >>> -#ifdef __AVX2__
> >>> +#if defined(__AVX2__)
> >>> #include <immintrin.h>
> >>>
> >
> >>> /**
> >>> @@ -479,7 +479,111 @@ uint32_t csum_unfolded(const void *buf, size_t len, uint32_t init)
> >>>
> >
> >>> return init;
> >>> }
> >>> -#else /* __AVX2__ */
> >>> +#elif defined(__POWER9_VECTOR__) || defined(__POWER8_VECTOR__)
> >>> +#include <altivec.h>
> >>> +
> >>> +/*
> >>> + * csum_vsx() - Compute 32-bit checksum using VSX SIMD instructions
> >>> + * @buf: Input buffer
> >>> + * @len: Input length
> >>> + * @init: Initial 32-bit checksum, 0 for no pre-computed checksum
> >>> + *
> >>> + * Return: 32-bit checksum, not complemented, not folded
> >>> + */
> >>> +/* NOLINTNEXTLINE(clang-diagnostic-unknown-attributes) */
> >>> +__attribute__((optimize("-fno-strict-aliasing"))) /* See csum_16b() */
> >>> +static uint32_t csum_vsx(const void *buf, size_t len, uint32_t init)
> >>> +{
> >>> + const uint8_t *p = buf;
> >>> + vector unsigned int sum_even = vec_splat_u32(0);
> >>> + vector unsigned int sum_odd = vec_splat_u32(0);
> >>> + const vector unsigned short ones = vec_splat_u16(1);
> >>> + uint64_t sum64 = init;
> >>> +
> >>> +#ifdef __POWER9_VECTOR__
> >>> + while (len >= 64) {
> >>> + vector unsigned char v0b = vec_vsx_ld(0, p);
> >>> + vector unsigned char v1b = vec_vsx_ld(16, p);
> >>> + vector unsigned char v2b = vec_vsx_ld(32, p);
> >>> + vector unsigned char v3b = vec_vsx_ld(48, p);
> >>> + vector unsigned short v0 = (vector unsigned short)v0b;
> >>> + vector unsigned short v1 = (vector unsigned short)v1b;
> >>> + vector unsigned short v2 = (vector unsigned short)v2b;
> >>> + vector unsigned short v3 = (vector unsigned short)v3b;
> >>> +
> >>> + sum_even = vec_add(sum_even, vec_mule(v0, ones));
> >>> + sum_odd = vec_add(sum_odd, vec_mulo(v0, ones));
> >>> + sum_even = vec_add(sum_even, vec_mule(v1, ones));
> >>> + sum_odd = vec_add(sum_odd, vec_mulo(v1, ones));
> >>> + sum_even = vec_add(sum_even, vec_mule(v2, ones));
> >>> + sum_odd = vec_add(sum_odd, vec_mulo(v2, ones));
> >>> + sum_even = vec_add(sum_even, vec_mule(v3, ones));
> >>> + sum_odd = vec_add(sum_odd, vec_mulo(v3, ones));
> >>> +
> >>> + p += 64;
> >>> + len -= 64;
> >>> + }
> >>> +#endif
> >>> +
> >>> + while (len >= 32) {
> >>> + vector unsigned char v0b = vec_vsx_ld(0, p);
> >>> + vector unsigned char v1b = vec_vsx_ld(16, p);
> >>> + vector unsigned short v0 = (vector unsigned short)v0b;
> >>> + vector unsigned short v1 = (vector unsigned short)v1b;
> >>> +
> >>> + sum_even = vec_add(sum_even, vec_mule(v0, ones));
> >>> + sum_odd = vec_add(sum_odd, vec_mulo(v0, ones));
> >>> + sum_even = vec_add(sum_even, vec_mule(v1, ones));
> >>> + sum_odd = vec_add(sum_odd, vec_mulo(v1, ones));
> >>> +
> >>> + p += 32;
> >>> + len -= 32;
> >>> + }
> >>> +
> >>> + while (len >= 16) {
> >>> + vector unsigned char v0b = vec_vsx_ld(0, p);
> >>> + vector unsigned short v0 = (vector unsigned short)v0b;
> >>> +
> >>> + sum_even = vec_add(sum_even, vec_mule(v0, ones));
> >>> + sum_odd = vec_add(sum_odd, vec_mulo(v0, ones));
> >>> +
> >>> + p += 16;
> >>> + len -= 16;
> >>> + }
> >>> +
> >>> + {
> >>> + vector unsigned int sum32 = vec_add(sum_even, sum_odd);
> >>> + uint32_t partial[4] __attribute__((aligned(16)));
> >>> +
> >>> + vec_st(sum32, 0, partial);
> >>> + sum64 += (uint64_t)partial[0] + partial[1] +
> >>> + partial[2] + partial[3];
> >>> + }
> >>> +
> >>> + sum64 += sum_16b(p, len);
> >>> +
> >>> + sum64 = (sum64 >> 32) + (sum64 & 0xffffffff);
> >>> + sum64 += sum64 >> 32;
> >>> +
> >>> + return (uint32_t)sum64;
> >>> +}
> >>> +
> >>> +/**
> >>> + * csum_unfolded() - Calculate the unfolded checksum of a data buffer.
> >>> + *
> >>> + * @buf: Input buffer
> >>> + * @len: Input length
> >>> + * @init: Initial 32-bit checksum, 0 for no pre-computed checksum
> >>> + *
> >>> + * Return: 32-bit unfolded checksum
> >>> + */
> >>> +/* NOLINTNEXTLINE(clang-diagnostic-unknown-attributes) */
> >>> +__attribute__((optimize("-fno-strict-aliasing"))) /* See csum_16b() */
> >>> +uint32_t csum_unfolded(const void *buf, size_t len, uint32_t init)
> >>> +{
> >>> + return csum_vsx(buf, len, init);
> >>> +}
> >>> +#else /* !__AVX2__ && !__POWER9_VECTOR__ && !__POWER8_VECTOR__ */
> >>> /**
> >>> * csum_unfolded() - Calculate the unfolded checksum of a data buffer.
> >>> *
> >>> @@ -495,7 +599,7 @@ uint32_t csum_unfolded(const void *buf, size_t len, uint32_t init)
> >>> {
> >>> return sum_16b(buf, len) + init;
> >>> }
> >>> -#endif /* !__AVX2__ */
> >>> +#endif /* !__AVX2__ && !__POWER9_VECTOR__ && !__POWER8_VECTOR__ */
> >>>
> >
> >>> /**
> >>> * csum_iov_tail() - Calculate unfolded checksum for the tail of an IO vector
> >>> --
> >>> 2.52.0
> >>
> >
> >>
> >
> >> Reviewed-by: Laurent Vivier <lvivier@redhat.com>
>
>
[-- Attachment #1.2: publickey - jfiusdq@proton.me - 0x344F580A.asc --]
[-- Type: application/pgp-keys, Size: 832 bytes --]
[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 343 bytes --]
prev parent reply other threads:[~2026-02-11 11:55 UTC|newest]
Thread overview: 6+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-02-05 6:14 jfiusdq
2026-02-05 18:03 ` Stefano Brivio
2026-02-06 15:17 ` Laurent Vivier
2026-02-07 22:31 ` jfiusdq
2026-02-10 13:36 ` Cédric Le Goater
2026-02-11 11:55 ` jfiusdq [this message]
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to='tgTA0kvk7kcj3hnj-7Uf5x14xfl1-sMp4gaOoy8sLDRLpTssiRPmAlX2Z5wbcdduj_ezsmTMhlYcI4fbgQ1W35R4IBLko0VQF9Xu6M_A8S8=@proton.me' \
--to=jfiusdq@proton.me \
--cc=clg@kaod.org \
--cc=lvivier@redhat.com \
--cc=passt-dev@passt.top \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
Code repositories for project(s) associated with this public inbox
https://passt.top/passt
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for IMAP folder(s).