Microbenchmark of the VSX checksum function vs. the scalar C version at
different buffer sizes.

Results (GB/s, higher is better; speedup = VSX / scalar):

  64B:   VSX  4.61 vs scalar 5.91 -> 0.78x (VSX slower for tiny buffers)
  256B:  VSX 10.91 vs scalar 7.57 -> 1.44x
  1500B: VSX 13.88 vs scalar 6.89 -> 2.02x
  16KB:  VSX 14.53 vs scalar 6.96 -> 2.09x
  64KB:  VSX 15.15 vs scalar 6.85 -> 2.21x

On Friday, February 6th, 2026 at 3:17 PM, Laurent Vivier wrote:

>
>
> On Thu, 05 Feb 2026 06:14:40 +0000, jfiusdq <jfiusdq@proton.me> wrote:
>
> > Tested with podman on Debian 13 for a while and works ok. It's
> > difficult to run all the tests on POWER but 505-networking-pasta.bats
> > test suite passes.
> > ---
> >  checksum.c | 110 +++++++++++++++++++++++++++++++++++++++++++++++++++--
> >  1 file changed, 107 insertions(+), 3 deletions(-)
> >
> > diff --git a/checksum.c b/checksum.c
> > index 0c3837c..828f9ec 100644
> > --- a/checksum.c
> > +++ b/checksum.c
> > @@ -281,7 +281,7 @@ void csum_icmp6(struct icmp6hdr *icmp6hr,
> >  	icmp6hr->icmp6_cksum = csum(payload, dlen, psum);
> >  }
> >
> > -#ifdef __AVX2__
> > +#if defined(__AVX2__)
> >  #include <immintrin.h>
> >
> >  /**
> > @@ -479,7 +479,111 @@ uint32_t csum_unfolded(const void *buf, size_t len, uint32_t init)
> >
> >  	return init;
> >  }
> > -#else /* __AVX2__ */
> > +#elif defined(__POWER9_VECTOR__) || defined(__POWER8_VECTOR__)
> > +#include <altivec.h>
> > +
> > +/**
> > + * csum_vsx() - Compute 32-bit checksum using VSX SIMD instructions
> > + * @buf:	Input buffer
> > + * @len:	Input length
> > + * @init:	Initial 32-bit checksum, 0 for no pre-computed checksum
> > + *
> > + * Return: 32-bit checksum, not complemented, not folded
> > + */
> > +/* NOLINTNEXTLINE(clang-diagnostic-unknown-attributes) */
> > +__attribute__((optimize("-fno-strict-aliasing")))	/* See csum_16b() */
> > +static uint32_t csum_vsx(const void *buf, size_t len, uint32_t init)
> > +{
> > +	const uint8_t *p = buf;
> > +	vector unsigned int sum_even = vec_splat_u32(0);
> > +	vector unsigned int sum_odd = vec_splat_u32(0);
> > +	const vector unsigned short ones = vec_splat_u16(1);
> > +	uint64_t sum64 = init;
> > +
> > +#ifdef __POWER9_VECTOR__
> > +	while (len >= 64) {
> > +		vector unsigned char v0b = vec_vsx_ld(0, p);
> > +		vector unsigned char v1b = vec_vsx_ld(16, p);
> > +		vector unsigned char v2b = vec_vsx_ld(32, p);
> > +		vector unsigned char v3b = vec_vsx_ld(48, p);
> > +		vector unsigned short v0 = (vector unsigned short)v0b;
> > +		vector unsigned short v1 = (vector unsigned short)v1b;
> > +		vector unsigned short v2 = (vector unsigned short)v2b;
> > +		vector unsigned short v3 = (vector unsigned short)v3b;
> > +
> > +		sum_even = vec_add(sum_even, vec_mule(v0, ones));
> > +		sum_odd = vec_add(sum_odd, vec_mulo(v0, ones));
> > +		sum_even = vec_add(sum_even, vec_mule(v1, ones));
> > +		sum_odd = vec_add(sum_odd, vec_mulo(v1, ones));
> > +		sum_even = vec_add(sum_even, vec_mule(v2, ones));
> > +		sum_odd = vec_add(sum_odd, vec_mulo(v2, ones));
> > +		sum_even = vec_add(sum_even, vec_mule(v3, ones));
> > +		sum_odd = vec_add(sum_odd, vec_mulo(v3, ones));
> > +
> > +		p += 64;
> > +		len -= 64;
> > +	}
> > +#endif
> > +
> > +	while (len >= 32) {
> > +		vector unsigned char v0b = vec_vsx_ld(0, p);
> > +		vector unsigned char v1b = vec_vsx_ld(16, p);
> > +		vector unsigned short v0 = (vector unsigned short)v0b;
> > +		vector unsigned short v1 = (vector unsigned short)v1b;
> > +
> > +		sum_even = vec_add(sum_even, vec_mule(v0, ones));
> > +		sum_odd = vec_add(sum_odd, vec_mulo(v0, ones));
> > +		sum_even = vec_add(sum_even, vec_mule(v1, ones));
> > +		sum_odd = vec_add(sum_odd, vec_mulo(v1, ones));
> > +
> > +		p += 32;
> > +		len -= 32;
> > +	}
> > +
> > +	while (len >= 16) {
> > +		vector unsigned char v0b = vec_vsx_ld(0, p);
> > +		vector unsigned short v0 = (vector unsigned short)v0b;
> > +
> > +		sum_even = vec_add(sum_even, vec_mule(v0, ones));
> > +		sum_odd = vec_add(sum_odd, vec_mulo(v0, ones));
> > +
> > +		p += 16;
> > +		len -= 16;
> > +	}
> > +
> > +	{
> > +		vector unsigned int sum32 = vec_add(sum_even, sum_odd);
> > +		uint32_t partial[4] __attribute__((aligned(16)));
> > +
> > +		vec_st(sum32, 0, partial);
> > +		sum64 += (uint64_t)partial[0] + partial[1] +
> > +			 partial[2] + partial[3];
> > +	}
> > +
> > +	sum64 += sum_16b(p, len);
> > +
> > +	sum64 = (sum64 >> 32) + (sum64 & 0xffffffff);
> > +	sum64 += sum64 >> 32;
> > +
> > +	return (uint32_t)sum64;
> > +}
> > +
> > +/**
> > + * csum_unfolded() - Calculate the unfolded checksum of a data buffer.
> > + *
> > + * @buf:	Input buffer
> > + * @len:	Input length
> > + * @init:	Initial 32-bit checksum, 0 for no pre-computed checksum
> > + *
> > + * Return: 32-bit unfolded checksum
> > + */
> > +/* NOLINTNEXTLINE(clang-diagnostic-unknown-attributes) */
> > +__attribute__((optimize("-fno-strict-aliasing")))	/* See csum_16b() */
> > +uint32_t csum_unfolded(const void *buf, size_t len, uint32_t init)
> > +{
> > +	return csum_vsx(buf, len, init);
> > +}
> > +#else /* !__AVX2__ && !__POWER9_VECTOR__ && !__POWER8_VECTOR__ */
> >  /**
> >   * csum_unfolded() - Calculate the unfolded checksum of a data buffer.
> >   *
> > @@ -495,7 +599,7 @@ uint32_t csum_unfolded(const void *buf, size_t len, uint32_t init)
> >  {
> >  	return sum_16b(buf, len) + init;
> >  }
> > -#endif /* !__AVX2__ */
> > +#endif /* !__AVX2__ && !__POWER9_VECTOR__ && !__POWER8_VECTOR__ */
> >
> >  /**
> >   * csum_iov_tail() - Calculate unfolded checksum for the tail of an IO vector
> > --
> > 2.52.0
> >
>
> Reviewed-by: Laurent Vivier <lvivier@redhat.com>