// bench_checksum.c // Run from the source directory: // gcc -O3 -D_GNU_SOURCE -mcpu=power9 -mvsx -maltivec \ // -I . -o /tmp/bench_vsx bench_checksum.c checksum.c // // gcc -O3 -D_GNU_SOURCE -mcpu=power9 -mno-vsx -mno-altivec -fno-tree-vectorize \ // -U__POWER8_VECTOR__ -U__POWER9_VECTOR__ \ // -I . -o /tmp/bench_scalar bench_checksum.c checksum.c #include #include #include #include #include "checksum.h" static double now_sec(void) { struct timespec ts; clock_gettime(CLOCK_MONOTONIC, &ts); return (double)ts.tv_sec + (double)ts.tv_nsec * 1e-9; } int main(void) { static const size_t sizes[] = { 64, 256, 1500, 16 * 1024, 64 * 1024 }; const size_t target_bytes = 512UL * 1024UL * 1024UL; /* 512 MiB per size */ uint8_t *buf = NULL; size_t i; volatile uint32_t sink = 0; if (posix_memalign((void **)&buf, 64, sizes[sizeof(sizes) / sizeof(sizes[0]) - 1])) { perror("posix_memalign"); return 1; } for (i = 0; i < sizes[sizeof(sizes) / sizeof(sizes[0]) - 1]; i++) buf[i] = (uint8_t)(i * 131u + 7u); printf("len, iters, seconds, GBps, checksum\n"); for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) { size_t len = sizes[i]; size_t iters = target_bytes / len; size_t warm = iters / 10; double start, end, secs, gbps; if (iters < 10000) iters = 10000; if (warm < 1000) warm = 1000; for (size_t j = 0; j < warm; j++) sink ^= csum_unfolded(buf, len, 0); start = now_sec(); for (size_t j = 0; j < iters; j++) sink ^= csum_unfolded(buf, len, 0); end = now_sec(); secs = end - start; gbps = ((double)len * (double)iters) / (secs * 1e9); printf("%zu, %zu, %.6f, %.3f, 0x%08x\n", len, iters, secs, gbps, sink); } free((void *)buf); return 0; } On Tuesday, February 10th, 2026 at 1:37 PM, Cédric Le Goater wrote: > Hi, > > On 2/7/26 23:31, jfiusdq wrote: > > Microbenchmark of the checksum function vs C version at different buffer sizes: > > > > > > Results (GB/s, higher is better; speedup = VSX / scalar): > > > > 64B: VSX 4.61 vs scalar 5.91 -> 0.78x (VSX slower for tiny buffers) > > 256B: VSX 10.91 
vs scalar 7.57 -> 1.44x > > 1500B: VSX 13.88 vs scalar 6.89 -> 2.02x > > 16KB: VSX 14.53 vs scalar 6.96 -> 2.09x > > 64KB: VSX 15.15 vs scalar 6.85 -> 2.21x > > Could you please share Microbenchmark ? > > Thanks, > > C. > > > > On Friday, February 6th, 2026 at 3:17 PM, Laurent Vivier wrote: > > > >> > > > >> > > > >> On Thu, 05 Feb 2026 06:14:40 +0000, jfiusdq <jfiusdq@proton.me> wrote: > >> > > > >>> Tested with podman on Debian 13 for a while and works ok. It's > >>> difficult to run all the tests on POWER but 505-networking-pasta.bats > >>> test suite passes. > >>> --- > >>> checksum.c | 110 +++++++++++++++++++++++++++++++++++++++++++++++++++-- > >>> 1 file changed, 107 insertions(+), 3 deletions(-) > >>> > > > >>> diff --git a/checksum.c b/checksum.c > >>> index 0c3837c..828f9ec 100644 > >>> --- a/checksum.c > >>> +++ b/checksum.c > >>> @@ -281,7 +281,7 @@ void csum_icmp6(struct icmp6hdr *icmp6hr, > >>> icmp6hr->icmp6_cksum = csum(payload, dlen, psum); > >>> } > >>> > > > >>> -#ifdef __AVX2__ > >>> +#if defined(__AVX2__) > >>> #include <immintrin.h> > >>> > > > >>> /** > >>> @@ -479,7 +479,111 @@ uint32_t csum_unfolded(const void *buf, size_t len, uint32_t init) > >>> > > > >>> return init; > >>> } > >>> -#else /* __AVX2__ */ > >>> +#elif defined(__POWER9_VECTOR__) || defined(__POWER8_VECTOR__) > >>> +#include <altivec.h> > >>> + > >>> +/** > >>> + * csum_vsx() - Compute 32-bit checksum using VSX SIMD instructions > >>> + * @buf: Input buffer > >>> + * @len: Input length > >>> + * @init: Initial 32-bit checksum, 0 for no pre-computed checksum > >>> + * > >>> + * Return: 32-bit checksum, not complemented, not folded > >>> + */ > >>> +/* NOLINTNEXTLINE(clang-diagnostic-unknown-attributes) */ > >>> +__attribute__((optimize("-fno-strict-aliasing"))) /* See csum_16b() */ > >>> +static uint32_t csum_vsx(const void *buf, size_t len, uint32_t init) > >>> +{ > >>> + const uint8_t *p = buf; > >>> + vector unsigned int sum_even = vec_splat_u32(0); > >>> + vector unsigned int sum_odd = vec_splat_u32(0); > >>> + const vector 
unsigned short ones = vec_splat_u16(1); > >>> + uint64_t sum64 = init; > >>> + > >>> +#ifdef __POWER9_VECTOR__ > >>> + while (len >= 64) { > >>> + vector unsigned char v0b = vec_vsx_ld(0, p); > >>> + vector unsigned char v1b = vec_vsx_ld(16, p); > >>> + vector unsigned char v2b = vec_vsx_ld(32, p); > >>> + vector unsigned char v3b = vec_vsx_ld(48, p); > >>> + vector unsigned short v0 = (vector unsigned short)v0b; > >>> + vector unsigned short v1 = (vector unsigned short)v1b; > >>> + vector unsigned short v2 = (vector unsigned short)v2b; > >>> + vector unsigned short v3 = (vector unsigned short)v3b; > >>> + > >>> + sum_even = vec_add(sum_even, vec_mule(v0, ones)); > >>> + sum_odd = vec_add(sum_odd, vec_mulo(v0, ones)); > >>> + sum_even = vec_add(sum_even, vec_mule(v1, ones)); > >>> + sum_odd = vec_add(sum_odd, vec_mulo(v1, ones)); > >>> + sum_even = vec_add(sum_even, vec_mule(v2, ones)); > >>> + sum_odd = vec_add(sum_odd, vec_mulo(v2, ones)); > >>> + sum_even = vec_add(sum_even, vec_mule(v3, ones)); > >>> + sum_odd = vec_add(sum_odd, vec_mulo(v3, ones)); > >>> + > >>> + p += 64; > >>> + len -= 64; > >>> + } > >>> +#endif > >>> + > >>> + while (len >= 32) { > >>> + vector unsigned char v0b = vec_vsx_ld(0, p); > >>> + vector unsigned char v1b = vec_vsx_ld(16, p); > >>> + vector unsigned short v0 = (vector unsigned short)v0b; > >>> + vector unsigned short v1 = (vector unsigned short)v1b; > >>> + > >>> + sum_even = vec_add(sum_even, vec_mule(v0, ones)); > >>> + sum_odd = vec_add(sum_odd, vec_mulo(v0, ones)); > >>> + sum_even = vec_add(sum_even, vec_mule(v1, ones)); > >>> + sum_odd = vec_add(sum_odd, vec_mulo(v1, ones)); > >>> + > >>> + p += 32; > >>> + len -= 32; > >>> + } > >>> + > >>> + while (len >= 16) { > >>> + vector unsigned char v0b = vec_vsx_ld(0, p); > >>> + vector unsigned short v0 = (vector unsigned short)v0b; > >>> + > >>> + sum_even = vec_add(sum_even, vec_mule(v0, ones)); > >>> + sum_odd = vec_add(sum_odd, vec_mulo(v0, ones)); > >>> + > >>> + p += 16; > >>> + 
len -= 16; > >>> + } > >>> + > >>> + { > >>> + vector unsigned int sum32 = vec_add(sum_even, sum_odd); > >>> + uint32_t partial[4] __attribute__((aligned(16))); > >>> + > >>> + vec_st(sum32, 0, partial); > >>> + sum64 += (uint64_t)partial[0] + partial[1] + > >>> + partial[2] + partial[3]; > >>> + } > >>> + > >>> + sum64 += sum_16b(p, len); > >>> + > >>> + sum64 = (sum64 >> 32) + (sum64 & 0xffffffff); > >>> + sum64 += sum64 >> 32; > >>> + > >>> + return (uint32_t)sum64; > >>> +} > >>> + > >>> +/** > >>> + * csum_unfolded() - Calculate the unfolded checksum of a data buffer. > >>> + * > >>> + * @buf: Input buffer > >>> + * @len: Input length > >>> + * @init: Initial 32-bit checksum, 0 for no pre-computed checksum > >>> + * > >>> + * Return: 32-bit unfolded checksum > >>> + */ > >>> +/* NOLINTNEXTLINE(clang-diagnostic-unknown-attributes) */ > >>> +__attribute__((optimize("-fno-strict-aliasing"))) /* See csum_16b() */ > >>> +uint32_t csum_unfolded(const void *buf, size_t len, uint32_t init) > >>> +{ > >>> + return csum_vsx(buf, len, init); > >>> +} > >>> +#else /* !__AVX2__ && !__POWER9_VECTOR__ && !__POWER8_VECTOR__ */ > >>> /** > >>> * csum_unfolded() - Calculate the unfolded checksum of a data buffer. > >>> * > >>> @@ -495,7 +599,7 @@ uint32_t csum_unfolded(const void *buf, size_t len, uint32_t init) > >>> { > >>> return sum_16b(buf, len) + init; > >>> } > >>> -#endif /* !__AVX2__ */ > >>> +#endif /* !__AVX2__ && !__POWER9_VECTOR__ && !__POWER8_VECTOR__ */ > >>> > > > >>> /** > >>> * csum_iov_tail() - Calculate unfolded checksum for the tail of an IO vector > >>> -- > >>> 2.52.0 > >> > > > >> > > > >> Reviewed-by: Laurent Vivier <lvivier@redhat.com> > >