Tested with podman on Debian 13 for a while and works ok. It's difficult to run all the tests on POWER but 505-networking-pasta.bats test suite passes.
Hi jfiusdq,
On Thu, 05 Feb 2026 06:14:40 +0000
jfiusdq
Tested with podman on Debian 13 for a while and works ok. It's difficult to run all the tests on POWER but 505-networking-pasta.bats test suite passes.
Thanks for the patch! I'm not really familiar with AltiVec / VSX or POWER at all so it's difficult for me to review this, but we have somebody on the list who should be able to help. :) It might need a bit of time though. Meanwhile, it would be nice if you could send this patch in the usual format, using git send-email and adding a Signed-off-by: tag. This is just the same submission format as the Linux kernel and many other opensource projects, see the archives for examples: https://archives.passt.top/passt-dev/ I understand you might not want to reveal your full name, and that's entirely fine, but still it would be better if you could send the patch in the usual format. We'll accept patches regardless of the submission format though, so that's not a strict requirement, just a nice-to-have. Thanks. -- Stefano
On Thu, 05 Feb 2026 06:14:40 +0000, jfiusdq
Tested with podman on Debian 13 for a while and works ok. It's difficult to run all the tests on POWER but 505-networking-pasta.bats test suite passes. --- checksum.c | 110 +++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 107 insertions(+), 3 deletions(-)
diff --git a/checksum.c b/checksum.c index 0c3837c..828f9ec 100644 --- a/checksum.c +++ b/checksum.c @@ -281,7 +281,7 @@ void csum_icmp6(struct icmp6hdr *icmp6hr, icmp6hr->icmp6_cksum = csum(payload, dlen, psum); }
-#ifdef __AVX2__ +#if defined(__AVX2__) #include
/** @@ -479,7 +479,111 @@ uint32_t csum_unfolded(const void *buf, size_t len, uint32_t init)
return init; } -#else /* __AVX2__ */ +#elif defined(__POWER9_VECTOR__) || defined(__POWER8_VECTOR__) +#include
+ +/** + * csum_vsx() - Compute 32-bit checksum using VSX SIMD instructions + * @buf: Input buffer + * @len: Input length + * @init: Initial 32-bit checksum, 0 for no pre-computed checksum + * + * Return: 32-bit checksum, not complemented, not folded + */ +/* NOLINTNEXTLINE(clang-diagnostic-unknown-attributes) */ +__attribute__((optimize("-fno-strict-aliasing"))) /* See csum_16b() */ +static uint32_t csum_vsx(const void *buf, size_t len, uint32_t init) +{ + const uint8_t *p = buf; + vector unsigned int sum_even = vec_splat_u32(0); + vector unsigned int sum_odd = vec_splat_u32(0); + const vector unsigned short ones = vec_splat_u16(1); + uint64_t sum64 = init; + +#ifdef __POWER9_VECTOR__ + while (len >= 64) { + vector unsigned char v0b = vec_vsx_ld(0, p); + vector unsigned char v1b = vec_vsx_ld(16, p); + vector unsigned char v2b = vec_vsx_ld(32, p); + vector unsigned char v3b = vec_vsx_ld(48, p); + vector unsigned short v0 = (vector unsigned short)v0b; + vector unsigned short v1 = (vector unsigned short)v1b; + vector unsigned short v2 = (vector unsigned short)v2b; + vector unsigned short v3 = (vector unsigned short)v3b; + + sum_even = vec_add(sum_even, vec_mule(v0, ones)); + sum_odd = vec_add(sum_odd, vec_mulo(v0, ones)); + sum_even = vec_add(sum_even, vec_mule(v1, ones)); + sum_odd = vec_add(sum_odd, vec_mulo(v1, ones)); + sum_even = vec_add(sum_even, vec_mule(v2, ones)); + sum_odd = vec_add(sum_odd, vec_mulo(v2, ones)); + sum_even = vec_add(sum_even, vec_mule(v3, ones)); + sum_odd = vec_add(sum_odd, vec_mulo(v3, ones)); + + p += 64; + len -= 64; + } +#endif + + while (len >= 32) { + vector unsigned char v0b = vec_vsx_ld(0, p); + vector unsigned char v1b = vec_vsx_ld(16, p); + vector unsigned short v0 = (vector unsigned short)v0b; + vector unsigned short v1 = (vector unsigned short)v1b; + + sum_even = vec_add(sum_even, vec_mule(v0, ones)); + sum_odd = vec_add(sum_odd, vec_mulo(v0, ones)); + sum_even = vec_add(sum_even, vec_mule(v1, ones)); + sum_odd = vec_add(sum_odd, vec_mulo(v1, ones)); + + p += 32; + len -= 32; + } + + while (len >= 16) { + vector unsigned char v0b = vec_vsx_ld(0, p); + vector unsigned short v0 = (vector unsigned short)v0b; + + sum_even = vec_add(sum_even, vec_mule(v0, ones)); + sum_odd = vec_add(sum_odd, vec_mulo(v0, ones)); + + p += 16; + len -= 16; + } + + { + vector unsigned int sum32 = vec_add(sum_even, sum_odd); + uint32_t partial[4] __attribute__((aligned(16))); + + vec_st(sum32, 0, partial); + sum64 += (uint64_t)partial[0] + partial[1] + + partial[2] + partial[3]; + } + + sum64 += sum_16b(p, len); + + sum64 = (sum64 >> 32) + (sum64 & 0xffffffff); + sum64 += sum64 >> 32; + + return (uint32_t)sum64; +} + +/** + * csum_unfolded() - Calculate the unfolded checksum of a data buffer. + * + * @buf: Input buffer + * @len: Input length + * @init: Initial 32-bit checksum, 0 for no pre-computed checksum + * + * Return: 32-bit unfolded checksum + */ +/* NOLINTNEXTLINE(clang-diagnostic-unknown-attributes) */ +__attribute__((optimize("-fno-strict-aliasing"))) /* See csum_16b() */ +uint32_t csum_unfolded(const void *buf, size_t len, uint32_t init) +{ + return csum_vsx(buf, len, init); +} +#else /* !__AVX2__ && !__POWER9_VECTOR__ && !__POWER8_VECTOR__ */ /** * csum_unfolded() - Calculate the unfolded checksum of a data buffer. * @@ -495,7 +599,7 @@ uint32_t csum_unfolded(const void *buf, size_t len, uint32_t init) { return sum_16b(buf, len) + init; } -#endif /* !__AVX2__ */ +#endif /* !__AVX2__ && !__POWER9_VECTOR__ && !__POWER8_VECTOR__ */ /** * csum_iov_tail() - Calculate unfolded checksum for the tail of an IO vector -- 2.52.0
Reviewed-by: Laurent Vivier
Microbenchmark of the checksum function vs C version at different buffer sizes:
Results (GB/s, higher is better; speedup = VSX / scalar):
64B: VSX 4.61 vs scalar 5.91 -> 0.78x (VSX slower for tiny buffers)
256B: VSX 10.91 vs scalar 7.57 -> 1.44x
1500B: VSX 13.88 vs scalar 6.89 -> 2.02x
16KB: VSX 14.53 vs scalar 6.96 -> 2.09x
64KB: VSX 15.15 vs scalar 6.85 -> 2.21x
On Friday, February 6th, 2026 at 3:17 PM, Laurent Vivier
On Thu, 05 Feb 2026 06:14:40 +0000, jfiusdq jfiusdq@proton.me wrote:
Tested with podman on Debian 13 for a while and works ok. It's difficult to run all the tests on POWER but 505-networking-pasta.bats test suite passes. --- checksum.c | 110 +++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 107 insertions(+), 3 deletions(-)
diff --git a/checksum.c b/checksum.c index 0c3837c..828f9ec 100644 --- a/checksum.c +++ b/checksum.c @@ -281,7 +281,7 @@ void csum_icmp6(struct icmp6hdr *icmp6hr, icmp6hr->icmp6_cksum = csum(payload, dlen, psum); }
-#ifdef AVX2 +#if defined(AVX2) #include
/** @@ -479,7 +479,111 @@ uint32_t csum_unfolded(const void *buf, size_t len, uint32_t init)
return init; } -#else /* AVX2 / +#elif defined(POWER9_VECTOR) || defined(POWER8_VECTOR) +#include
+ +/* + * csum_vsx() - Compute 32-bit checksum using VSX SIMD instructions + * @buf: Input buffer + * @len: Input length + * @init: Initial 32-bit checksum, 0 for no pre-computed checksum + * + * Return: 32-bit checksum, not complemented, not folded + / +/ NOLINTNEXTLINE(clang-diagnostic-unknown-attributes) / +attribute((optimize("-fno-strict-aliasing"))) / See csum_16b() */ +static uint32_t csum_vsx(const void buf, size_t len, uint32_t init) +{ + const uint8_t p = buf; + vector unsigned int sum_even = vec_splat_u32(0); + vector unsigned int sum_odd = vec_splat_u32(0); + const vector unsigned short ones = vec_splat_u16(1); + uint64_t sum64 = init; + +#ifdef POWER9_VECTOR + while (len >= 64) { + vector unsigned char v0b = vec_vsx_ld(0, p); + vector unsigned char v1b = vec_vsx_ld(16, p); + vector unsigned char v2b = vec_vsx_ld(32, p); + vector unsigned char v3b = vec_vsx_ld(48, p); + vector unsigned short v0 = (vector unsigned short)v0b; + vector unsigned short v1 = (vector unsigned short)v1b; + vector unsigned short v2 = (vector unsigned short)v2b; + vector unsigned short v3 = (vector unsigned short)v3b; + + sum_even = vec_add(sum_even, vec_mule(v0, ones)); + sum_odd = vec_add(sum_odd, vec_mulo(v0, ones)); + sum_even = vec_add(sum_even, vec_mule(v1, ones)); + sum_odd = vec_add(sum_odd, vec_mulo(v1, ones)); + sum_even = vec_add(sum_even, vec_mule(v2, ones)); + sum_odd = vec_add(sum_odd, vec_mulo(v2, ones)); + sum_even = vec_add(sum_even, vec_mule(v3, ones)); + sum_odd = vec_add(sum_odd, vec_mulo(v3, ones)); + + p += 64; + len -= 64; + } +#endif + + while (len >= 32) { + vector unsigned char v0b = vec_vsx_ld(0, p); + vector unsigned char v1b = vec_vsx_ld(16, p); + vector unsigned short v0 = (vector unsigned short)v0b; + vector unsigned short v1 = (vector unsigned short)v1b; + + sum_even = vec_add(sum_even, vec_mule(v0, ones)); + sum_odd = vec_add(sum_odd, vec_mulo(v0, ones)); + sum_even = vec_add(sum_even, vec_mule(v1, ones)); + sum_odd = vec_add(sum_odd, vec_mulo(v1, ones)); + + p += 32; + len -= 32; + } + + while (len >= 16) { + vector unsigned char v0b = vec_vsx_ld(0, p); + vector unsigned short v0 = (vector unsigned short)v0b; + + sum_even = vec_add(sum_even, vec_mule(v0, ones)); + sum_odd = vec_add(sum_odd, vec_mulo(v0, ones)); + + p += 16; + len -= 16; + } + + { + vector unsigned int sum32 = vec_add(sum_even, sum_odd); + uint32_t partial[4] attribute((aligned(16))); + + vec_st(sum32, 0, partial); + sum64 += (uint64_t)partial[0] + partial[1] + + partial[2] + partial[3]; + } + + sum64 += sum_16b(p, len); + + sum64 = (sum64 >> 32) + (sum64 & 0xffffffff); + sum64 += sum64 >> 32; + + return (uint32_t)sum64; +} + +/ + * csum_unfolded() - Calculate the unfolded checksum of a data buffer. + * + * @buf: Input buffer + * @len: Input length + * @init: Initial 32-bit checksum, 0 for no pre-computed checksum + * + * Return: 32-bit unfolded checksum + / +/ NOLINTNEXTLINE(clang-diagnostic-unknown-attributes) / +attribute((optimize("-fno-strict-aliasing"))) / See csum_16b() / +uint32_t csum_unfolded(const void buf, size_t len, uint32_t init) +{ + return csum_vsx(buf, len, init); +} +#else / !AVX2 && !POWER9_VECTOR && !POWER8_VECTOR / / * csum_unfolded() - Calculate the unfolded checksum of a data buffer. * @@ -495,7 +599,7 @@ uint32_t csum_unfolded(const void buf, size_t len, uint32_t init) { return sum_16b(buf, len) + init; } -#endif / !AVX2 / +#endif / !AVX2 && !POWER9_VECTOR && !POWER8_VECTOR */
/** * csum_iov_tail() - Calculate unfolded checksum for the tail of an IO vector -- 2.52.0
Reviewed-by: Laurent Vivier lvivier@redhat.com
participants (3)
-
jfiusdq
-
Laurent Vivier
-
Stefano Brivio