* checksum: add VSX fast path for POWER8/POWER9 @ 2026-02-05 6:14 jfiusdq 2026-02-05 18:03 ` Stefano Brivio 2026-02-06 15:17 ` Laurent Vivier 0 siblings, 2 replies; 6+ messages in thread From: jfiusdq @ 2026-02-05 6:14 UTC (permalink / raw) To: passt-dev [-- Attachment #1: Type: text/plain, Size: 152 bytes --] Tested with podman on Debian 13 for a while and works ok. It's difficult to run all the tests on POWER but 505-networking-pasta.bats test suite passes. [-- Attachment #2: 0001-checksum-add-VSX-fast-path-for-POWER8-POWER9.patch --] [-- Type: text/x-patch, Size: 4506 bytes --] >From b431f05c171a1c9b63ca8a57001188f782354a6a Mon Sep 17 00:00:00 2001 From: user <user@localhost> Date: Thu, 5 Feb 2026 06:36:27 +0100 Subject: [PATCH] checksum: add VSX fast path for POWER8/POWER9 --- checksum.c | 110 +++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 107 insertions(+), 3 deletions(-) diff --git a/checksum.c b/checksum.c index 0c3837c..828f9ec 100644 --- a/checksum.c +++ b/checksum.c @@ -281,7 +281,7 @@ void csum_icmp6(struct icmp6hdr *icmp6hr, icmp6hr->icmp6_cksum = csum(payload, dlen, psum); } -#ifdef __AVX2__ +#if defined(__AVX2__) #include <immintrin.h> /** @@ -479,7 +479,111 @@ uint32_t csum_unfolded(const void *buf, size_t len, uint32_t init) return init; } -#else /* __AVX2__ */ +#elif defined(__POWER9_VECTOR__) || defined(__POWER8_VECTOR__) +#include <altivec.h> + +/** + * csum_vsx() - Compute 32-bit checksum using VSX SIMD instructions + * @buf: Input buffer + * @len: Input length + * @init: Initial 32-bit checksum, 0 for no pre-computed checksum + * + * Return: 32-bit checksum, not complemented, not folded + */ +/* NOLINTNEXTLINE(clang-diagnostic-unknown-attributes) */ +__attribute__((optimize("-fno-strict-aliasing"))) /* See csum_16b() */ +static uint32_t csum_vsx(const void *buf, size_t len, uint32_t init) +{ + const uint8_t *p = buf; + vector unsigned int sum_even = vec_splat_u32(0); + vector unsigned int sum_odd = vec_splat_u32(0); + 
const vector unsigned short ones = vec_splat_u16(1); + uint64_t sum64 = init; + +#ifdef __POWER9_VECTOR__ + while (len >= 64) { + vector unsigned char v0b = vec_vsx_ld(0, p); + vector unsigned char v1b = vec_vsx_ld(16, p); + vector unsigned char v2b = vec_vsx_ld(32, p); + vector unsigned char v3b = vec_vsx_ld(48, p); + vector unsigned short v0 = (vector unsigned short)v0b; + vector unsigned short v1 = (vector unsigned short)v1b; + vector unsigned short v2 = (vector unsigned short)v2b; + vector unsigned short v3 = (vector unsigned short)v3b; + + sum_even = vec_add(sum_even, vec_mule(v0, ones)); + sum_odd = vec_add(sum_odd, vec_mulo(v0, ones)); + sum_even = vec_add(sum_even, vec_mule(v1, ones)); + sum_odd = vec_add(sum_odd, vec_mulo(v1, ones)); + sum_even = vec_add(sum_even, vec_mule(v2, ones)); + sum_odd = vec_add(sum_odd, vec_mulo(v2, ones)); + sum_even = vec_add(sum_even, vec_mule(v3, ones)); + sum_odd = vec_add(sum_odd, vec_mulo(v3, ones)); + + p += 64; + len -= 64; + } +#endif + + while (len >= 32) { + vector unsigned char v0b = vec_vsx_ld(0, p); + vector unsigned char v1b = vec_vsx_ld(16, p); + vector unsigned short v0 = (vector unsigned short)v0b; + vector unsigned short v1 = (vector unsigned short)v1b; + + sum_even = vec_add(sum_even, vec_mule(v0, ones)); + sum_odd = vec_add(sum_odd, vec_mulo(v0, ones)); + sum_even = vec_add(sum_even, vec_mule(v1, ones)); + sum_odd = vec_add(sum_odd, vec_mulo(v1, ones)); + + p += 32; + len -= 32; + } + + while (len >= 16) { + vector unsigned char v0b = vec_vsx_ld(0, p); + vector unsigned short v0 = (vector unsigned short)v0b; + + sum_even = vec_add(sum_even, vec_mule(v0, ones)); + sum_odd = vec_add(sum_odd, vec_mulo(v0, ones)); + + p += 16; + len -= 16; + } + + { + vector unsigned int sum32 = vec_add(sum_even, sum_odd); + uint32_t partial[4] __attribute__((aligned(16))); + + vec_st(sum32, 0, partial); + sum64 += (uint64_t)partial[0] + partial[1] + + partial[2] + partial[3]; + } + + sum64 += sum_16b(p, len); + + sum64 = (sum64 
>> 32) + (sum64 & 0xffffffff); + sum64 += sum64 >> 32; + + return (uint32_t)sum64; +} + +/** + * csum_unfolded() - Calculate the unfolded checksum of a data buffer. + * + * @buf: Input buffer + * @len: Input length + * @init: Initial 32-bit checksum, 0 for no pre-computed checksum + * + * Return: 32-bit unfolded checksum + */ +/* NOLINTNEXTLINE(clang-diagnostic-unknown-attributes) */ +__attribute__((optimize("-fno-strict-aliasing"))) /* See csum_16b() */ +uint32_t csum_unfolded(const void *buf, size_t len, uint32_t init) +{ + return csum_vsx(buf, len, init); +} +#else /* !__AVX2__ && !__POWER9_VECTOR__ && !__POWER8_VECTOR__ */ /** * csum_unfolded() - Calculate the unfolded checksum of a data buffer. * @@ -495,7 +599,7 @@ uint32_t csum_unfolded(const void *buf, size_t len, uint32_t init) { return sum_16b(buf, len) + init; } -#endif /* !__AVX2__ */ +#endif /* !__AVX2__ && !__POWER9_VECTOR__ && !__POWER8_VECTOR__ */ /** * csum_iov_tail() - Calculate unfolded checksum for the tail of an IO vector -- 2.52.0 ^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: checksum: add VSX fast path for POWER8/POWER9 2026-02-05 6:14 checksum: add VSX fast path for POWER8/POWER9 jfiusdq @ 2026-02-05 18:03 ` Stefano Brivio 2026-02-06 15:17 ` Laurent Vivier 1 sibling, 0 replies; 6+ messages in thread From: Stefano Brivio @ 2026-02-05 18:03 UTC (permalink / raw) To: jfiusdq; +Cc: passt-dev Hi jfiusdq, On Thu, 05 Feb 2026 06:14:40 +0000 jfiusdq <jfiusdq@proton.me> wrote: > Tested with podman on Debian 13 for a while and works ok. It's difficult to run all the tests on POWER but 505-networking-pasta.bats test suite passes. Thanks for the patch! I'm not really familiar with AltiVec / VSX or POWER at all so it's difficult for me to review this, but we have somebody on the list who should be able to help. :) It might need a bit of time though. Meanwhile, it would be nice if you could send this patch in the usual format, using git send-email and adding a Signed-off-by: tag. This is just the same submission format as the Linux kernel and many other open source projects, see the archives for examples: https://archives.passt.top/passt-dev/ I understand you might not want to reveal your full name, and that's entirely fine, but still it would be better if you could send the patch in the usual format. We'll accept patches regardless of the submission format though, so that's not a strict requirement, just a nice-to-have. Thanks. -- Stefano ^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: checksum: add VSX fast path for POWER8/POWER9 2026-02-05 6:14 checksum: add VSX fast path for POWER8/POWER9 jfiusdq 2026-02-05 18:03 ` Stefano Brivio @ 2026-02-06 15:17 ` Laurent Vivier 2026-02-07 22:31 ` jfiusdq 1 sibling, 1 reply; 6+ messages in thread From: Laurent Vivier @ 2026-02-06 15:17 UTC (permalink / raw) To: jfiusdq, passt-dev On Thu, 05 Feb 2026 06:14:40 +0000, jfiusdq <jfiusdq@proton.me> wrote: > Tested with podman on Debian 13 for a while and works ok. It's > difficult to run all the tests on POWER but 505-networking-pasta.bats > test suite passes. > --- > checksum.c | 110 +++++++++++++++++++++++++++++++++++++++++++++++++++-- > 1 file changed, 107 insertions(+), 3 deletions(-) > > diff --git a/checksum.c b/checksum.c > index 0c3837c..828f9ec 100644 > --- a/checksum.c > +++ b/checksum.c > @@ -281,7 +281,7 @@ void csum_icmp6(struct icmp6hdr *icmp6hr, > icmp6hr->icmp6_cksum = csum(payload, dlen, psum); > } > > -#ifdef __AVX2__ > +#if defined(__AVX2__) > #include <immintrin.h> > > /** > @@ -479,7 +479,111 @@ uint32_t csum_unfolded(const void *buf, size_t len, uint32_t init) > > return init; > } > -#else /* __AVX2__ */ > +#elif defined(__POWER9_VECTOR__) || defined(__POWER8_VECTOR__) > +#include <altivec.h> > + > +/** > + * csum_vsx() - Compute 32-bit checksum using VSX SIMD instructions > + * @buf: Input buffer > + * @len: Input length > + * @init: Initial 32-bit checksum, 0 for no pre-computed checksum > + * > + * Return: 32-bit checksum, not complemented, not folded > + */ > +/* NOLINTNEXTLINE(clang-diagnostic-unknown-attributes) */ > +__attribute__((optimize("-fno-strict-aliasing"))) /* See csum_16b() */ > +static uint32_t csum_vsx(const void *buf, size_t len, uint32_t init) > +{ > + const uint8_t *p = buf; > + vector unsigned int sum_even = vec_splat_u32(0); > + vector unsigned int sum_odd = vec_splat_u32(0); > + const vector unsigned short ones = vec_splat_u16(1); > + uint64_t sum64 = init; > + > +#ifdef __POWER9_VECTOR__ > + while (len >= 64) 
{ > + vector unsigned char v0b = vec_vsx_ld(0, p); > + vector unsigned char v1b = vec_vsx_ld(16, p); > + vector unsigned char v2b = vec_vsx_ld(32, p); > + vector unsigned char v3b = vec_vsx_ld(48, p); > + vector unsigned short v0 = (vector unsigned short)v0b; > + vector unsigned short v1 = (vector unsigned short)v1b; > + vector unsigned short v2 = (vector unsigned short)v2b; > + vector unsigned short v3 = (vector unsigned short)v3b; > + > + sum_even = vec_add(sum_even, vec_mule(v0, ones)); > + sum_odd = vec_add(sum_odd, vec_mulo(v0, ones)); > + sum_even = vec_add(sum_even, vec_mule(v1, ones)); > + sum_odd = vec_add(sum_odd, vec_mulo(v1, ones)); > + sum_even = vec_add(sum_even, vec_mule(v2, ones)); > + sum_odd = vec_add(sum_odd, vec_mulo(v2, ones)); > + sum_even = vec_add(sum_even, vec_mule(v3, ones)); > + sum_odd = vec_add(sum_odd, vec_mulo(v3, ones)); > + > + p += 64; > + len -= 64; > + } > +#endif > + > + while (len >= 32) { > + vector unsigned char v0b = vec_vsx_ld(0, p); > + vector unsigned char v1b = vec_vsx_ld(16, p); > + vector unsigned short v0 = (vector unsigned short)v0b; > + vector unsigned short v1 = (vector unsigned short)v1b; > + > + sum_even = vec_add(sum_even, vec_mule(v0, ones)); > + sum_odd = vec_add(sum_odd, vec_mulo(v0, ones)); > + sum_even = vec_add(sum_even, vec_mule(v1, ones)); > + sum_odd = vec_add(sum_odd, vec_mulo(v1, ones)); > + > + p += 32; > + len -= 32; > + } > + > + while (len >= 16) { > + vector unsigned char v0b = vec_vsx_ld(0, p); > + vector unsigned short v0 = (vector unsigned short)v0b; > + > + sum_even = vec_add(sum_even, vec_mule(v0, ones)); > + sum_odd = vec_add(sum_odd, vec_mulo(v0, ones)); > + > + p += 16; > + len -= 16; > + } > + > + { > + vector unsigned int sum32 = vec_add(sum_even, sum_odd); > + uint32_t partial[4] __attribute__((aligned(16))); > + > + vec_st(sum32, 0, partial); > + sum64 += (uint64_t)partial[0] + partial[1] + > + partial[2] + partial[3]; > + } > + > + sum64 += sum_16b(p, len); > + > + sum64 = (sum64 >> 
32) + (sum64 & 0xffffffff); > + sum64 += sum64 >> 32; > + > + return (uint32_t)sum64; > +} > + > +/** > + * csum_unfolded() - Calculate the unfolded checksum of a data buffer. > + * > + * @buf: Input buffer > + * @len: Input length > + * @init: Initial 32-bit checksum, 0 for no pre-computed checksum > + * > + * Return: 32-bit unfolded checksum > + */ > +/* NOLINTNEXTLINE(clang-diagnostic-unknown-attributes) */ > +__attribute__((optimize("-fno-strict-aliasing"))) /* See csum_16b() */ > +uint32_t csum_unfolded(const void *buf, size_t len, uint32_t init) > +{ > + return csum_vsx(buf, len, init); > +} > +#else /* !__AVX2__ && !__POWER9_VECTOR__ && !__POWER8_VECTOR__ */ > /** > * csum_unfolded() - Calculate the unfolded checksum of a data buffer. > * > @@ -495,7 +599,7 @@ uint32_t csum_unfolded(const void *buf, size_t len, uint32_t init) > { > return sum_16b(buf, len) + init; > } > -#endif /* !__AVX2__ */ > +#endif /* !__AVX2__ && !__POWER9_VECTOR__ && !__POWER8_VECTOR__ */ > > /** > * csum_iov_tail() - Calculate unfolded checksum for the tail of an IO vector > -- > 2.52.0 Reviewed-by: Laurent Vivier <lvivier@redhat.com> ^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: checksum: add VSX fast path for POWER8/POWER9 2026-02-06 15:17 ` Laurent Vivier @ 2026-02-07 22:31 ` jfiusdq 2026-02-10 13:36 ` Cédric Le Goater 0 siblings, 1 reply; 6+ messages in thread From: jfiusdq @ 2026-02-07 22:31 UTC (permalink / raw) To: Laurent Vivier; +Cc: passt-dev [-- Attachment #1.1: Type: text/plain, Size: 5643 bytes --] Microbenchmark of the checksum function vs C version at different buffer sizes: Results (GB/s, higher is better; speedup = VSX / scalar): 64B: VSX 4.61 vs scalar 5.91 -> 0.78x (VSX slower for tiny buffers) 256B: VSX 10.91 vs scalar 7.57 -> 1.44x 1500B: VSX 13.88 vs scalar 6.89 -> 2.02x 16KB: VSX 14.53 vs scalar 6.96 -> 2.09x 64KB: VSX 15.15 vs scalar 6.85 -> 2.21x On Friday, February 6th, 2026 at 3:17 PM, Laurent Vivier <lvivier@redhat.com> wrote: > > > On Thu, 05 Feb 2026 06:14:40 +0000, jfiusdq jfiusdq@proton.me wrote: > > > Tested with podman on Debian 13 for a while and works ok. It's > > difficult to run all the tests on POWER but 505-networking-pasta.bats > > test suite passes. 
> > --- > > checksum.c | 110 +++++++++++++++++++++++++++++++++++++++++++++++++++-- > > 1 file changed, 107 insertions(+), 3 deletions(-) > > > > diff --git a/checksum.c b/checksum.c > > index 0c3837c..828f9ec 100644 > > --- a/checksum.c > > +++ b/checksum.c > > @@ -281,7 +281,7 @@ void csum_icmp6(struct icmp6hdr *icmp6hr, > > icmp6hr->icmp6_cksum = csum(payload, dlen, psum); > > } > > > > -#ifdef AVX2 > > +#if defined(AVX2) > > #include <immintrin.h> > > > > /** > > @@ -479,7 +479,111 @@ uint32_t csum_unfolded(const void *buf, size_t len, uint32_t init) > > > > return init; > > } > > -#else /* AVX2 / > > +#elif defined(POWER9_VECTOR) || defined(POWER8_VECTOR) > > +#include <altivec.h> > > + > > +/* > > + * csum_vsx() - Compute 32-bit checksum using VSX SIMD instructions > > + * @buf: Input buffer > > + * @len: Input length > > + * @init: Initial 32-bit checksum, 0 for no pre-computed checksum > > + * > > + * Return: 32-bit checksum, not complemented, not folded > > + / > > +/ NOLINTNEXTLINE(clang-diagnostic-unknown-attributes) / > > +attribute((optimize("-fno-strict-aliasing"))) / See csum_16b() */ > > +static uint32_t csum_vsx(const void buf, size_t len, uint32_t init) > > +{ > > + const uint8_t p = buf; > > + vector unsigned int sum_even = vec_splat_u32(0); > > + vector unsigned int sum_odd = vec_splat_u32(0); > > + const vector unsigned short ones = vec_splat_u16(1); > > + uint64_t sum64 = init; > > + > > +#ifdef POWER9_VECTOR > > + while (len >= 64) { > > + vector unsigned char v0b = vec_vsx_ld(0, p); > > + vector unsigned char v1b = vec_vsx_ld(16, p); > > + vector unsigned char v2b = vec_vsx_ld(32, p); > > + vector unsigned char v3b = vec_vsx_ld(48, p); > > + vector unsigned short v0 = (vector unsigned short)v0b; > > + vector unsigned short v1 = (vector unsigned short)v1b; > > + vector unsigned short v2 = (vector unsigned short)v2b; > > + vector unsigned short v3 = (vector unsigned short)v3b; > > + > > + sum_even = vec_add(sum_even, vec_mule(v0, ones)); > > + 
sum_odd = vec_add(sum_odd, vec_mulo(v0, ones)); > > + sum_even = vec_add(sum_even, vec_mule(v1, ones)); > > + sum_odd = vec_add(sum_odd, vec_mulo(v1, ones)); > > + sum_even = vec_add(sum_even, vec_mule(v2, ones)); > > + sum_odd = vec_add(sum_odd, vec_mulo(v2, ones)); > > + sum_even = vec_add(sum_even, vec_mule(v3, ones)); > > + sum_odd = vec_add(sum_odd, vec_mulo(v3, ones)); > > + > > + p += 64; > > + len -= 64; > > + } > > +#endif > > + > > + while (len >= 32) { > > + vector unsigned char v0b = vec_vsx_ld(0, p); > > + vector unsigned char v1b = vec_vsx_ld(16, p); > > + vector unsigned short v0 = (vector unsigned short)v0b; > > + vector unsigned short v1 = (vector unsigned short)v1b; > > + > > + sum_even = vec_add(sum_even, vec_mule(v0, ones)); > > + sum_odd = vec_add(sum_odd, vec_mulo(v0, ones)); > > + sum_even = vec_add(sum_even, vec_mule(v1, ones)); > > + sum_odd = vec_add(sum_odd, vec_mulo(v1, ones)); > > + > > + p += 32; > > + len -= 32; > > + } > > + > > + while (len >= 16) { > > + vector unsigned char v0b = vec_vsx_ld(0, p); > > + vector unsigned short v0 = (vector unsigned short)v0b; > > + > > + sum_even = vec_add(sum_even, vec_mule(v0, ones)); > > + sum_odd = vec_add(sum_odd, vec_mulo(v0, ones)); > > + > > + p += 16; > > + len -= 16; > > + } > > + > > + { > > + vector unsigned int sum32 = vec_add(sum_even, sum_odd); > > + uint32_t partial[4] attribute((aligned(16))); > > + > > + vec_st(sum32, 0, partial); > > + sum64 += (uint64_t)partial[0] + partial[1] + > > + partial[2] + partial[3]; > > + } > > + > > + sum64 += sum_16b(p, len); > > + > > + sum64 = (sum64 >> 32) + (sum64 & 0xffffffff); > > + sum64 += sum64 >> 32; > > + > > + return (uint32_t)sum64; > > +} > > + > > +/ > > + * csum_unfolded() - Calculate the unfolded checksum of a data buffer. 
> > + * > > + * @buf: Input buffer > > + * @len: Input length > > + * @init: Initial 32-bit checksum, 0 for no pre-computed checksum > > + * > > + * Return: 32-bit unfolded checksum > > + / > > +/ NOLINTNEXTLINE(clang-diagnostic-unknown-attributes) / > > +attribute((optimize("-fno-strict-aliasing"))) / See csum_16b() / > > +uint32_t csum_unfolded(const void buf, size_t len, uint32_t init) > > +{ > > + return csum_vsx(buf, len, init); > > +} > > +#else / !AVX2 && !POWER9_VECTOR && !POWER8_VECTOR / > > / > > * csum_unfolded() - Calculate the unfolded checksum of a data buffer. > > * > > @@ -495,7 +599,7 @@ uint32_t csum_unfolded(const void buf, size_t len, uint32_t init) > > { > > return sum_16b(buf, len) + init; > > } > > -#endif / !AVX2 / > > +#endif / !AVX2 && !POWER9_VECTOR && !POWER8_VECTOR */ > > > > /** > > * csum_iov_tail() - Calculate unfolded checksum for the tail of an IO vector > > -- > > 2.52.0 > > > Reviewed-by: Laurent Vivier lvivier@redhat.com [-- Attachment #1.2: publickey - jfiusdq@proton.me - 0x344F580A.asc --] [-- Type: application/pgp-keys, Size: 832 bytes --] [-- Attachment #2: OpenPGP digital signature --] [-- Type: application/pgp-signature, Size: 343 bytes --] ^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: checksum: add VSX fast path for POWER8/POWER9 2026-02-07 22:31 ` jfiusdq @ 2026-02-10 13:36 ` Cédric Le Goater 2026-02-11 11:55 ` jfiusdq 0 siblings, 1 reply; 6+ messages in thread From: Cédric Le Goater @ 2026-02-10 13:36 UTC (permalink / raw) To: jfiusdq, Laurent Vivier; +Cc: passt-dev Hi, On 2/7/26 23:31, jfiusdq wrote: > Microbenchmark of the checksum function vs C version at different buffer sizes: > > > Results (GB/s, higher is better; speedup = VSX / scalar): > > 64B: VSX 4.61 vs scalar 5.91 -> 0.78x (VSX slower for tiny buffers) > 256B: VSX 10.91 vs scalar 7.57 -> 1.44x > 1500B: VSX 13.88 vs scalar 6.89 -> 2.02x > 16KB: VSX 14.53 vs scalar 6.96 -> 2.09x > 64KB: VSX 15.15 vs scalar 6.85 -> 2.21x Could you please share Microbenchmark ? Thanks, C. > On Friday, February 6th, 2026 at 3:17 PM, Laurent Vivier <lvivier@redhat.com> wrote: > >> > >> > >> On Thu, 05 Feb 2026 06:14:40 +0000, jfiusdq jfiusdq@proton.me wrote: >> > >>> Tested with podman on Debian 13 for a while and works ok. It's >>> difficult to run all the tests on POWER but 505-networking-pasta.bats >>> test suite passes. 
>>> --- >>> checksum.c | 110 +++++++++++++++++++++++++++++++++++++++++++++++++++-- >>> 1 file changed, 107 insertions(+), 3 deletions(-) >>> > >>> diff --git a/checksum.c b/checksum.c >>> index 0c3837c..828f9ec 100644 >>> --- a/checksum.c >>> +++ b/checksum.c >>> @@ -281,7 +281,7 @@ void csum_icmp6(struct icmp6hdr *icmp6hr, >>> icmp6hr->icmp6_cksum = csum(payload, dlen, psum); >>> } >>> > >>> -#ifdef AVX2 >>> +#if defined(AVX2) >>> #include <immintrin.h> >>> > >>> /** >>> @@ -479,7 +479,111 @@ uint32_t csum_unfolded(const void *buf, size_t len, uint32_t init) >>> > >>> return init; >>> } >>> -#else /* AVX2 / >>> +#elif defined(POWER9_VECTOR) || defined(POWER8_VECTOR) >>> +#include <altivec.h> >>> + >>> +/* >>> + * csum_vsx() - Compute 32-bit checksum using VSX SIMD instructions >>> + * @buf: Input buffer >>> + * @len: Input length >>> + * @init: Initial 32-bit checksum, 0 for no pre-computed checksum >>> + * >>> + * Return: 32-bit checksum, not complemented, not folded >>> + / >>> +/ NOLINTNEXTLINE(clang-diagnostic-unknown-attributes) / >>> +attribute((optimize("-fno-strict-aliasing"))) / See csum_16b() */ >>> +static uint32_t csum_vsx(const void buf, size_t len, uint32_t init) >>> +{ >>> + const uint8_t p = buf; >>> + vector unsigned int sum_even = vec_splat_u32(0); >>> + vector unsigned int sum_odd = vec_splat_u32(0); >>> + const vector unsigned short ones = vec_splat_u16(1); >>> + uint64_t sum64 = init; >>> + >>> +#ifdef POWER9_VECTOR >>> + while (len >= 64) { >>> + vector unsigned char v0b = vec_vsx_ld(0, p); >>> + vector unsigned char v1b = vec_vsx_ld(16, p); >>> + vector unsigned char v2b = vec_vsx_ld(32, p); >>> + vector unsigned char v3b = vec_vsx_ld(48, p); >>> + vector unsigned short v0 = (vector unsigned short)v0b; >>> + vector unsigned short v1 = (vector unsigned short)v1b; >>> + vector unsigned short v2 = (vector unsigned short)v2b; >>> + vector unsigned short v3 = (vector unsigned short)v3b; >>> + >>> + sum_even = vec_add(sum_even, vec_mule(v0, 
ones)); >>> + sum_odd = vec_add(sum_odd, vec_mulo(v0, ones)); >>> + sum_even = vec_add(sum_even, vec_mule(v1, ones)); >>> + sum_odd = vec_add(sum_odd, vec_mulo(v1, ones)); >>> + sum_even = vec_add(sum_even, vec_mule(v2, ones)); >>> + sum_odd = vec_add(sum_odd, vec_mulo(v2, ones)); >>> + sum_even = vec_add(sum_even, vec_mule(v3, ones)); >>> + sum_odd = vec_add(sum_odd, vec_mulo(v3, ones)); >>> + >>> + p += 64; >>> + len -= 64; >>> + } >>> +#endif >>> + >>> + while (len >= 32) { >>> + vector unsigned char v0b = vec_vsx_ld(0, p); >>> + vector unsigned char v1b = vec_vsx_ld(16, p); >>> + vector unsigned short v0 = (vector unsigned short)v0b; >>> + vector unsigned short v1 = (vector unsigned short)v1b; >>> + >>> + sum_even = vec_add(sum_even, vec_mule(v0, ones)); >>> + sum_odd = vec_add(sum_odd, vec_mulo(v0, ones)); >>> + sum_even = vec_add(sum_even, vec_mule(v1, ones)); >>> + sum_odd = vec_add(sum_odd, vec_mulo(v1, ones)); >>> + >>> + p += 32; >>> + len -= 32; >>> + } >>> + >>> + while (len >= 16) { >>> + vector unsigned char v0b = vec_vsx_ld(0, p); >>> + vector unsigned short v0 = (vector unsigned short)v0b; >>> + >>> + sum_even = vec_add(sum_even, vec_mule(v0, ones)); >>> + sum_odd = vec_add(sum_odd, vec_mulo(v0, ones)); >>> + >>> + p += 16; >>> + len -= 16; >>> + } >>> + >>> + { >>> + vector unsigned int sum32 = vec_add(sum_even, sum_odd); >>> + uint32_t partial[4] attribute((aligned(16))); >>> + >>> + vec_st(sum32, 0, partial); >>> + sum64 += (uint64_t)partial[0] + partial[1] + >>> + partial[2] + partial[3]; >>> + } >>> + >>> + sum64 += sum_16b(p, len); >>> + >>> + sum64 = (sum64 >> 32) + (sum64 & 0xffffffff); >>> + sum64 += sum64 >> 32; >>> + >>> + return (uint32_t)sum64; >>> +} >>> + >>> +/ >>> + * csum_unfolded() - Calculate the unfolded checksum of a data buffer. 
>>> + * >>> + * @buf: Input buffer >>> + * @len: Input length >>> + * @init: Initial 32-bit checksum, 0 for no pre-computed checksum >>> + * >>> + * Return: 32-bit unfolded checksum >>> + / >>> +/ NOLINTNEXTLINE(clang-diagnostic-unknown-attributes) / >>> +attribute((optimize("-fno-strict-aliasing"))) / See csum_16b() / >>> +uint32_t csum_unfolded(const void buf, size_t len, uint32_t init) >>> +{ >>> + return csum_vsx(buf, len, init); >>> +} >>> +#else / !AVX2 && !POWER9_VECTOR && !POWER8_VECTOR / >>> / >>> * csum_unfolded() - Calculate the unfolded checksum of a data buffer. >>> * >>> @@ -495,7 +599,7 @@ uint32_t csum_unfolded(const void buf, size_t len, uint32_t init) >>> { >>> return sum_16b(buf, len) + init; >>> } >>> -#endif / !AVX2 / >>> +#endif / !AVX2 && !POWER9_VECTOR && !POWER8_VECTOR */ >>> > >>> /** >>> * csum_iov_tail() - Calculate unfolded checksum for the tail of an IO vector >>> -- >>> 2.52.0 >> > >> > >> Reviewed-by: Laurent Vivier lvivier@redhat.com ^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: checksum: add VSX fast path for POWER8/POWER9 2026-02-10 13:36 ` Cédric Le Goater @ 2026-02-11 11:55 ` jfiusdq 0 siblings, 0 replies; 6+ messages in thread From: jfiusdq @ 2026-02-11 11:55 UTC (permalink / raw) To: Cédric Le Goater; +Cc: Laurent Vivier, passt-dev [-- Attachment #1.1: Type: text/plain, Size: 8085 bytes --] // bench_checksum.c // Run from the source directory: // gcc -O3 -D_GNU_SOURCE -mcpu=power9 -mvsx -maltivec \ // -I . -o /tmp/bench_vsx bench_checksum.c checksum.c // // gcc -O3 -D_GNU_SOURCE -mcpu=power9 -mno-vsx -mno-altivec -fno-tree-vectorize \ // -U__POWER8_VECTOR__ -U__POWER9_VECTOR__ \ // -I . -o /tmp/bench_scalar bench_checksum.c checksum.c #include <stdio.h> #include <stdlib.h> #include <stdint.h> #include <time.h> #include "checksum.h" static double now_sec(void) { struct timespec ts; clock_gettime(CLOCK_MONOTONIC, &ts); return (double)ts.tv_sec + (double)ts.tv_nsec * 1e-9; } int main(void) { static const size_t sizes[] = { 64, 256, 1500, 16 * 1024, 64 * 1024 }; const size_t target_bytes = 512UL * 1024UL * 1024UL; /* 512 MiB per size */ uint8_t *buf = NULL; size_t i; volatile uint32_t sink = 0; if (posix_memalign((void **)&buf, 64, sizes[sizeof(sizes) / sizeof(sizes[0]) - 1])) { perror("posix_memalign"); return 1; } for (i = 0; i < sizes[sizeof(sizes) / sizeof(sizes[0]) - 1]; i++) buf[i] = (uint8_t)(i * 131u + 7u); printf("len, iters, seconds, GBps, checksum\n"); for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) { size_t len = sizes[i]; size_t iters = target_bytes / len; size_t warm = iters / 10; double start, end, secs, gbps; if (iters < 10000) iters = 10000; if (warm < 1000) warm = 1000; for (size_t j = 0; j < warm; j++) sink ^= csum_unfolded(buf, len, 0); start = now_sec(); for (size_t j = 0; j < iters; j++) sink ^= csum_unfolded(buf, len, 0); end = now_sec(); secs = end - start; gbps = ((double)len * (double)iters) / (secs * 1e9); printf("%zu, %zu, %.6f, %.3f, 0x%08x\n", len, iters, secs, gbps, sink); } free((void *)buf); 
return 0; } On Tuesday, February 10th, 2026 at 1:37 PM, Cédric Le Goater <clg@kaod.org> wrote: > Hi, > > On 2/7/26 23:31, jfiusdq wrote: > > Microbenchmark of the checksum function vs C version at different buffer sizes: > > > > > > Results (GB/s, higher is better; speedup = VSX / scalar): > > > > 64B: VSX 4.61 vs scalar 5.91 -> 0.78x (VSX slower for tiny buffers) > > 256B: VSX 10.91 vs scalar 7.57 -> 1.44x > > 1500B: VSX 13.88 vs scalar 6.89 -> 2.02x > > 16KB: VSX 14.53 vs scalar 6.96 -> 2.09x > > 64KB: VSX 15.15 vs scalar 6.85 -> 2.21x > > Could you please share Microbenchmark ? > > Thanks, > > C. > > > > On Friday, February 6th, 2026 at 3:17 PM, Laurent Vivier <lvivier@redhat.com> wrote: > > > >> > > > >> > > > >> On Thu, 05 Feb 2026 06:14:40 +0000, jfiusdq jfiusdq@proton.me wrote: > >> > > > >>> Tested with podman on Debian 13 for a while and works ok. It's > >>> difficult to run all the tests on POWER but 505-networking-pasta.bats > >>> test suite passes. > >>> --- > >>> checksum.c | 110 +++++++++++++++++++++++++++++++++++++++++++++++++++-- > >>> 1 file changed, 107 insertions(+), 3 deletions(-) > >>> > > > >>> diff --git a/checksum.c b/checksum.c > >>> index 0c3837c..828f9ec 100644 > >>> --- a/checksum.c > >>> +++ b/checksum.c > >>> @@ -281,7 +281,7 @@ void csum_icmp6(struct icmp6hdr *icmp6hr, > >>> icmp6hr->icmp6_cksum = csum(payload, dlen, psum); > >>> } > >>> > > > >>> -#ifdef AVX2 > >>> +#if defined(AVX2) > >>> #include <immintrin.h> > >>> > > > >>> /** > >>> @@ -479,7 +479,111 @@ uint32_t csum_unfolded(const void *buf, size_t len, uint32_t init) > >>> > > > >>> return init; > >>> } > >>> -#else /* AVX2 / > >>> +#elif defined(POWER9_VECTOR) || defined(POWER8_VECTOR) > >>> +#include <altivec.h> > >>> + > >>> +/* > >>> + * csum_vsx() - Compute 32-bit checksum using VSX SIMD instructions > >>> + * @buf: Input buffer > >>> + * @len: Input length > >>> + * @init: Initial 32-bit checksum, 0 for no pre-computed checksum > >>> + * > >>> + * Return: 32-bit 
checksum, not complemented, not folded > >>> + / > >>> +/ NOLINTNEXTLINE(clang-diagnostic-unknown-attributes) / > >>> +attribute((optimize("-fno-strict-aliasing"))) / See csum_16b() */ > >>> +static uint32_t csum_vsx(const void buf, size_t len, uint32_t init) > >>> +{ > >>> + const uint8_t p = buf; > >>> + vector unsigned int sum_even = vec_splat_u32(0); > >>> + vector unsigned int sum_odd = vec_splat_u32(0); > >>> + const vector unsigned short ones = vec_splat_u16(1); > >>> + uint64_t sum64 = init; > >>> + > >>> +#ifdef POWER9_VECTOR > >>> + while (len >= 64) { > >>> + vector unsigned char v0b = vec_vsx_ld(0, p); > >>> + vector unsigned char v1b = vec_vsx_ld(16, p); > >>> + vector unsigned char v2b = vec_vsx_ld(32, p); > >>> + vector unsigned char v3b = vec_vsx_ld(48, p); > >>> + vector unsigned short v0 = (vector unsigned short)v0b; > >>> + vector unsigned short v1 = (vector unsigned short)v1b; > >>> + vector unsigned short v2 = (vector unsigned short)v2b; > >>> + vector unsigned short v3 = (vector unsigned short)v3b; > >>> + > >>> + sum_even = vec_add(sum_even, vec_mule(v0, ones)); > >>> + sum_odd = vec_add(sum_odd, vec_mulo(v0, ones)); > >>> + sum_even = vec_add(sum_even, vec_mule(v1, ones)); > >>> + sum_odd = vec_add(sum_odd, vec_mulo(v1, ones)); > >>> + sum_even = vec_add(sum_even, vec_mule(v2, ones)); > >>> + sum_odd = vec_add(sum_odd, vec_mulo(v2, ones)); > >>> + sum_even = vec_add(sum_even, vec_mule(v3, ones)); > >>> + sum_odd = vec_add(sum_odd, vec_mulo(v3, ones)); > >>> + > >>> + p += 64; > >>> + len -= 64; > >>> + } > >>> +#endif > >>> + > >>> + while (len >= 32) { > >>> + vector unsigned char v0b = vec_vsx_ld(0, p); > >>> + vector unsigned char v1b = vec_vsx_ld(16, p); > >>> + vector unsigned short v0 = (vector unsigned short)v0b; > >>> + vector unsigned short v1 = (vector unsigned short)v1b; > >>> + > >>> + sum_even = vec_add(sum_even, vec_mule(v0, ones)); > >>> + sum_odd = vec_add(sum_odd, vec_mulo(v0, ones)); > >>> + sum_even = vec_add(sum_even, 
vec_mule(v1, ones)); > >>> + sum_odd = vec_add(sum_odd, vec_mulo(v1, ones)); > >>> + > >>> + p += 32; > >>> + len -= 32; > >>> + } > >>> + > >>> + while (len >= 16) { > >>> + vector unsigned char v0b = vec_vsx_ld(0, p); > >>> + vector unsigned short v0 = (vector unsigned short)v0b; > >>> + > >>> + sum_even = vec_add(sum_even, vec_mule(v0, ones)); > >>> + sum_odd = vec_add(sum_odd, vec_mulo(v0, ones)); > >>> + > >>> + p += 16; > >>> + len -= 16; > >>> + } > >>> + > >>> + { > >>> + vector unsigned int sum32 = vec_add(sum_even, sum_odd); > >>> + uint32_t partial[4] attribute((aligned(16))); > >>> + > >>> + vec_st(sum32, 0, partial); > >>> + sum64 += (uint64_t)partial[0] + partial[1] + > >>> + partial[2] + partial[3]; > >>> + } > >>> + > >>> + sum64 += sum_16b(p, len); > >>> + > >>> + sum64 = (sum64 >> 32) + (sum64 & 0xffffffff); > >>> + sum64 += sum64 >> 32; > >>> + > >>> + return (uint32_t)sum64; > >>> +} > >>> + > >>> +/ > >>> + * csum_unfolded() - Calculate the unfolded checksum of a data buffer. > >>> + * > >>> + * @buf: Input buffer > >>> + * @len: Input length > >>> + * @init: Initial 32-bit checksum, 0 for no pre-computed checksum > >>> + * > >>> + * Return: 32-bit unfolded checksum > >>> + / > >>> +/ NOLINTNEXTLINE(clang-diagnostic-unknown-attributes) / > >>> +attribute((optimize("-fno-strict-aliasing"))) / See csum_16b() / > >>> +uint32_t csum_unfolded(const void buf, size_t len, uint32_t init) > >>> +{ > >>> + return csum_vsx(buf, len, init); > >>> +} > >>> +#else / !AVX2 && !POWER9_VECTOR && !POWER8_VECTOR / > >>> / > >>> * csum_unfolded() - Calculate the unfolded checksum of a data buffer. 
> >>> * > >>> @@ -495,7 +599,7 @@ uint32_t csum_unfolded(const void buf, size_t len, uint32_t init) > >>> { > >>> return sum_16b(buf, len) + init; > >>> } > >>> -#endif / !AVX2 / > >>> +#endif / !AVX2 && !POWER9_VECTOR && !POWER8_VECTOR */ > >>> > > > >>> /** > >>> * csum_iov_tail() - Calculate unfolded checksum for the tail of an IO vector > >>> -- > >>> 2.52.0 > >> > > > >> > > > >> Reviewed-by: Laurent Vivier lvivier@redhat.com > > [-- Attachment #1.2: publickey - jfiusdq@proton.me - 0x344F580A.asc --] [-- Type: application/pgp-keys, Size: 832 bytes --] [-- Attachment #2: OpenPGP digital signature --] [-- Type: application/pgp-signature, Size: 343 bytes --] ^ permalink raw reply [flat|nested] 6+ messages in thread
end of thread, other threads:[~2026-02-11 11:55 UTC | newest] Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed) -- links below jump to the message on this page -- 2026-02-05 6:14 checksum: add VSX fast path for POWER8/POWER9 jfiusdq 2026-02-05 18:03 ` Stefano Brivio 2026-02-06 15:17 ` Laurent Vivier 2026-02-07 22:31 ` jfiusdq 2026-02-10 13:36 ` Cédric Le Goater 2026-02-11 11:55 ` jfiusdq
Code repositories for project(s) associated with this public inbox https://passt.top/passt This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for IMAP folder(s).