Recently, commit 4ddbcb9c0c55 ("tcp: Disable optimisations for tcp_hash()") worked around yet another issue we hit with gcc 12 and '-flto -O2': some stores affecting the input data to siphash_20b() were omitted altogether, and tcp_hash() wouldn't get the correct hash for incoming connections. Digging further into this revealed that, at least according to gcc's interpretation of C99 aliasing rules, passing pointers to functions with different types compared to the effective type of the object (for example, a uint8_t pointer to an anonymous struct, as it happens in tcp_hash()), doesn't guarantee that stores are not reordered across the function call. This means that, in general, our checksum and hash functions might not see parts of input data that was intended to be provided by callers. Not even switching from uint8_t to character types, which should be appropriate here, according to C99 (ISO/IEC 9899, TC3, draft N1256), section 6.5, "Expressions", paragraph 7: An object shall have its stored value accessed only by an lvalue expression that has one of the following types: [...] — a character type. does the trick. I guess this is also subject to interpretation: casting passed pointers to character types, and then using those as different types, might still violate (dubious) aliasing rules. Disable gcc strict aliasing rules for potentially affected functions, which, in turn, disables gcc's Type-Based Alias Analysis (TBAA) optimisations based on those function arguments. Drop the existing workarounds. Also the (seemingly?) bogus 'maybe-uninitialized' warning on the tcp_tap_handler() > tcp_hash() > siphash_20b() path goes away with -fno-strict-aliasing on siphash_20b(). Signed-off-by: Stefano Brivio <sbrivio(a)redhat.com> --- Makefile | 21 --------------------- checksum.c | 13 ++++++++++--- siphash.c | 22 +++++++++++++++++++--- tcp.c | 6 ------ 4 files changed, 29 insertions(+), 33 deletions(-) diff --git a/Makefile b/Makefile index 080c748..667ddfb 100644 --- a/Makefile +++ b/Makefile @@ -56,27 +56,6 @@ PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h icmp.h \ tcp_splice.h udp.h util.h HEADERS = $(PASST_HEADERS) seccomp.h -# On gcc 11 and 12, with -O2 and -flto, tcp_hash() and siphash_20b(), if -# inlined, seem to be hitting something similar to: -# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=78993 -# from the pointer arithmetic used from the tcp_tap_handler() path to get the -# remote connection address. -# -# TODO: With the same combination, in ndp(), gcc optimises away the store of -# hop_limit in the IPv6 header (temporarily set to the protocol number for -# convenience, to mimic the ICMPv6 checksum pseudo-header) before the call to -# csum_unaligned(). Mark csum_unaligned() as "noipa" as a quick work-around, -# while we figure out if a corresponding gcc issue has already been reported. -ifeq (,$(filter-out 11 12, $(shell $(CC) -dumpversion))) -ifneq (,$(filter -flto%,$(FLAGS) $(CFLAGS) $(CPPFLAGS))) -ifneq (,$(filter -O2,$(FLAGS) $(CFLAGS) $(CPPFLAGS))) - FLAGS += -DTCP_HASH_NOINLINE - FLAGS += -DSIPHASH_20B_NOINLINE - FLAGS += -DCSUM_UNALIGNED_NO_IPA -endif -endif -endif - C := \#include <linux/tcp.h>\nstruct tcp_info x = { .tcpi_snd_wnd = 0 }; ifeq ($(shell printf "$(C)" | $(CC) -S -xc - -o - >/dev/null 2>&1; echo $$?),0) FLAGS += -DHAS_SND_WND diff --git a/checksum.c b/checksum.c index 29769d9..9631f91 100644 --- a/checksum.c +++ b/checksum.c @@ -69,6 +69,8 @@ * * Return: 32-bit sum of 16-bit words */ +/* NOLINTNEXTLINE(clang-diagnostic-unknown-attributes) */ +__attribute__((optimize("-fno-strict-aliasing"))) /* See siphash_8b() */ uint32_t sum_16b(const void *buf, size_t len) { const uint16_t *p = buf; @@ -107,9 +109,8 @@ uint16_t csum_fold(uint32_t sum) * * Return: 16-bit IPv4-style checksum */ -#if CSUM_UNALIGNED_NO_IPA -__attribute__((__noipa__)) /* See comment in Makefile */ -#endif +/* NOLINTNEXTLINE(clang-diagnostic-unknown-attributes) */ +__attribute__((optimize("-fno-strict-aliasing"))) /* See siphash_8b() */ uint16_t csum_unaligned(const void *buf, size_t len, uint32_t init) { return (uint16_t)~csum_fold(sum_16b(buf, len) + init); @@ -245,6 +246,8 @@ void csum_icmp6(struct icmp6hdr *icmp6hr, * - sum_a/sum_b unpacking is interleaved and not sequential to reduce stalls * - coding style adaptation */ +/* NOLINTNEXTLINE(clang-diagnostic-unknown-attributes) */ +__attribute__((optimize("-fno-strict-aliasing"))) /* See siphash_8b() */ static uint32_t csum_avx2(const void *buf, size_t len, uint32_t init) { __m256i a, b, sum256, sum_a_hi, sum_a_lo, sum_b_hi, sum_b_lo, c, d; @@ -391,6 +394,8 @@ less_than_128_bytes: * * Return: 16-bit folded, complemented checksum sum */ +/* NOLINTNEXTLINE(clang-diagnostic-unknown-attributes) */ +__attribute__((optimize("-fno-strict-aliasing"))) /* See siphash_8b() */ uint16_t csum(const void *buf, size_t len, uint32_t init) { return (uint16_t)~csum_fold(csum_avx2(buf, len, init)); @@ -406,6 +411,8 @@ uint16_t csum(const void *buf, size_t len, uint32_t init) * * Return: 16-bit folded, complemented checksum */ +/* NOLINTNEXTLINE(clang-diagnostic-unknown-attributes) */ +__attribute__((optimize("-fno-strict-aliasing"))) /* See siphash_8b() */ uint16_t csum(const void *buf, size_t len, uint32_t init) { return csum_unaligned(buf, len, init); diff --git a/siphash.c b/siphash.c index 811918b..e8b144d 100644 --- a/siphash.c +++ b/siphash.c @@ -104,6 +104,17 @@ * * Return: the 64-bit hash output */ +/* Type-Based Alias Analysis (TBAA) optimisation in gcc 11 and 12 (-flto -O2) + * makes these functions essentially useless by allowing reordering of stores of + * input data across function calls. Not even declaring @in as char pointer is + * enough: disable gcc's interpretation of strict aliasing altogether. See also: + * + * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106706 + * https://stackoverflow.com/questions/2958633/gcc-strict-aliasing-and-horror-… + * https://lore.kernel.org/all/alpine.LFD.2.00.0901121128080.6528__33422.53280… + */ +/* NOLINTNEXTLINE(clang-diagnostic-unknown-attributes) */ +__attribute__((optimize("-fno-strict-aliasing"))) /* cppcheck-suppress unusedFunction */ uint64_t siphash_8b(const uint8_t *in, const uint64_t *k) { @@ -123,6 +134,8 @@ uint64_t siphash_8b(const uint8_t *in, const uint64_t *k) * * Return: 32 bits obtained by XORing the two halves of the 64-bit hash output */ +/* NOLINTNEXTLINE(clang-diagnostic-unknown-attributes) */ +__attribute__((optimize("-fno-strict-aliasing"))) /* See siphash_8b() */ /* cppcheck-suppress unusedFunction */ uint32_t siphash_12b(const uint8_t *in, const uint64_t *k) { @@ -148,9 +161,8 @@ uint32_t siphash_12b(const uint8_t *in, const uint64_t *k) * * Return: the 64-bit hash output */ -#if SIPHASH_20B_NOINLINE -__attribute__((__noinline__)) /* See comment in Makefile */ -#endif +/* NOLINTNEXTLINE(clang-diagnostic-unknown-attributes) */ +__attribute__((optimize("-fno-strict-aliasing"))) /* See siphash_8b() */ uint64_t siphash_20b(const uint8_t *in, const uint64_t *k) { uint32_t *in32 = (uint32_t *)in; @@ -179,6 +191,8 @@ uint64_t siphash_20b(const uint8_t *in, const uint64_t *k) * * Return: the 64-bit hash output */ +/* NOLINTNEXTLINE(clang-diagnostic-unknown-attributes) */ +__attribute__((optimize("-fno-strict-aliasing"))) /* See siphash_8b() */ /* cppcheck-suppress unusedFunction */ uint32_t siphash_32b(const uint8_t *in, const uint64_t *k) { @@ -205,6 +219,8 @@ uint32_t siphash_32b(const uint8_t *in, const uint64_t *k) * * Return: 32 bits obtained by XORing the two halves of the 64-bit hash output */ +/* NOLINTNEXTLINE(clang-diagnostic-unknown-attributes) */ +__attribute__((optimize("-fno-strict-aliasing"))) /* See siphash_8b() */ uint32_t siphash_36b(const uint8_t *in, const uint64_t *k) { uint32_t *in32 = (uint32_t *)in; diff --git a/tcp.c b/tcp.c index 21c319d..cbd537e 100644 --- a/tcp.c +++ b/tcp.c @@ -1182,12 +1182,6 @@ static int tcp_hash_match(const struct tcp_tap_conn *conn, * * Return: hash value, already modulo size of the hash table */ -#if TCP_HASH_NOINLINE -__attribute__((__noinline__)) /* See comment in Makefile */ -#endif -__attribute__((optimize("O0"))) /* TODO: with -O2 and -flto on gcc 12.2, - * siphash_20b() doesn't see 'addr', why? - */ static unsigned int tcp_hash(const struct ctx *c, const union inany_addr *addr, in_port_t tap_port, in_port_t sock_port) { -- 2.39.1
On Mon, Feb 27, 2023 at 10:59:29AM +0100, Stefano Brivio wrote:Recently, commit 4ddbcb9c0c55 ("tcp: Disable optimisations for tcp_hash()") worked around yet another issue we hit with gcc 12 and '-flto -O2': some stores affecting the input data to siphash_20b() were omitted altogether, and tcp_hash() wouldn't get the correct hash for incoming connections. Digging further into this revealed that, at least according to gcc's interpretation of C99 aliasing rules, passing pointers to functions with different types compared to the effective type of the object (for example, a uint8_t pointer to an anonymous struct, as it happens in tcp_hash()), doesn't guarantee that stores are not reordered across the function call.This means that, in general, our checksum and hash functions might not see parts of input data that was intended to be provided by callers. Not even switching from uint8_t to character types, which should be appropriate here, according to C99 (ISO/IEC 9899, TC3, draft N1256), section 6.5, "Expressions", paragraph 7: An object shall have its stored value accessed only by an lvalue expression that has one of the following types: [...] — a character type.Huh, weird. I certainly thought char/uint8_t pointers were explicitly allowed to be aliased with anything, or an enormous number of C-isms can't be counted on.does the trick. I guess this is also subject to interpretation: casting passed pointers to character types, and then using those as different types, might still violate (dubious) aliasing rules.Well, at least we know why now.Disable gcc strict aliasing rules for potentially affected functions, which, in turn, disables gcc's Type-Based Alias Analysis (TBAA) optimisations based on those function arguments.Excellent, I like this workaround much better than the previous one. Who knows exactly what "-O0" mean, here it's much more explicit what we're saying. I'd still prefer to make the code strict-aliasing correct if we can, but if going via a (char *) isn't doing it, then I really don't know how at this point.Drop the existing workarounds. Also the (seemingly?) bogus 'maybe-uninitialized' warning on the tcp_tap_handler() > tcp_hash() > siphash_20b() path goes away with -fno-strict-aliasing on siphash_20b(). Signed-off-by: Stefano Brivio <sbrivio(a)redhat.com>Reviewed-by: David Gibson <david(a)gibson.dropbear.id.au>--- Makefile | 21 --------------------- checksum.c | 13 ++++++++++--- siphash.c | 22 +++++++++++++++++++--- tcp.c | 6 ------ 4 files changed, 29 insertions(+), 33 deletions(-) diff --git a/Makefile b/Makefile index 080c748..667ddfb 100644 --- a/Makefile +++ b/Makefile @@ -56,27 +56,6 @@ PASST_HEADERS = arch.h arp.h checksum.h conf.h dhcp.h dhcpv6.h icmp.h \ tcp_splice.h udp.h util.h HEADERS = $(PASST_HEADERS) seccomp.h -# On gcc 11 and 12, with -O2 and -flto, tcp_hash() and siphash_20b(), if -# inlined, seem to be hitting something similar to: -# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=78993 -# from the pointer arithmetic used from the tcp_tap_handler() path to get the -# remote connection address. -# -# TODO: With the same combination, in ndp(), gcc optimises away the store of -# hop_limit in the IPv6 header (temporarily set to the protocol number for -# convenience, to mimic the ICMPv6 checksum pseudo-header) before the call to -# csum_unaligned(). Mark csum_unaligned() as "noipa" as a quick work-around, -# while we figure out if a corresponding gcc issue has already been reported. -ifeq (,$(filter-out 11 12, $(shell $(CC) -dumpversion))) -ifneq (,$(filter -flto%,$(FLAGS) $(CFLAGS) $(CPPFLAGS))) -ifneq (,$(filter -O2,$(FLAGS) $(CFLAGS) $(CPPFLAGS))) - FLAGS += -DTCP_HASH_NOINLINE - FLAGS += -DSIPHASH_20B_NOINLINE - FLAGS += -DCSUM_UNALIGNED_NO_IPA -endif -endif -endif - C := \#include <linux/tcp.h>\nstruct tcp_info x = { .tcpi_snd_wnd = 0 }; ifeq ($(shell printf "$(C)" | $(CC) -S -xc - -o - >/dev/null 2>&1; echo $$?),0) FLAGS += -DHAS_SND_WND diff --git a/checksum.c b/checksum.c index 29769d9..9631f91 100644 --- a/checksum.c +++ b/checksum.c @@ -69,6 +69,8 @@ * * Return: 32-bit sum of 16-bit words */ +/* NOLINTNEXTLINE(clang-diagnostic-unknown-attributes) */ +__attribute__((optimize("-fno-strict-aliasing"))) /* See siphash_8b() */ uint32_t sum_16b(const void *buf, size_t len) { const uint16_t *p = buf; @@ -107,9 +109,8 @@ uint16_t csum_fold(uint32_t sum) * * Return: 16-bit IPv4-style checksum */ -#if CSUM_UNALIGNED_NO_IPA -__attribute__((__noipa__)) /* See comment in Makefile */ -#endif +/* NOLINTNEXTLINE(clang-diagnostic-unknown-attributes) */ +__attribute__((optimize("-fno-strict-aliasing"))) /* See siphash_8b() */ uint16_t csum_unaligned(const void *buf, size_t len, uint32_t init) { return (uint16_t)~csum_fold(sum_16b(buf, len) + init); @@ -245,6 +246,8 @@ void csum_icmp6(struct icmp6hdr *icmp6hr, * - sum_a/sum_b unpacking is interleaved and not sequential to reduce stalls * - coding style adaptation */ +/* NOLINTNEXTLINE(clang-diagnostic-unknown-attributes) */ +__attribute__((optimize("-fno-strict-aliasing"))) /* See siphash_8b() */ static uint32_t csum_avx2(const void *buf, size_t len, uint32_t init) { __m256i a, b, sum256, sum_a_hi, sum_a_lo, sum_b_hi, sum_b_lo, c, d; @@ -391,6 +394,8 @@ less_than_128_bytes: * * Return: 16-bit folded, complemented checksum sum */ +/* NOLINTNEXTLINE(clang-diagnostic-unknown-attributes) */ +__attribute__((optimize("-fno-strict-aliasing"))) /* See siphash_8b() */ uint16_t csum(const void *buf, size_t len, uint32_t init) { return (uint16_t)~csum_fold(csum_avx2(buf, len, init)); @@ -406,6 +411,8 @@ uint16_t csum(const void *buf, size_t len, uint32_t init) * * Return: 16-bit folded, complemented checksum */ +/* NOLINTNEXTLINE(clang-diagnostic-unknown-attributes) */ +__attribute__((optimize("-fno-strict-aliasing"))) /* See siphash_8b() */ uint16_t csum(const void *buf, size_t len, uint32_t init) { return csum_unaligned(buf, len, init); diff --git a/siphash.c b/siphash.c index 811918b..e8b144d 100644 --- a/siphash.c +++ b/siphash.c @@ -104,6 +104,17 @@ * * Return: the 64-bit hash output */ +/* Type-Based Alias Analysis (TBAA) optimisation in gcc 11 and 12 (-flto -O2) + * makes these functions essentially useless by allowing reordering of stores of + * input data across function calls. Not even declaring @in as char pointer is + * enough: disable gcc's interpretation of strict aliasing altogether. See also: + * + * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106706 + * https://stackoverflow.com/questions/2958633/gcc-strict-aliasing-and-horror-… + * https://lore.kernel.org/all/alpine.LFD.2.00.0901121128080.6528__33422.53280… + */ +/* NOLINTNEXTLINE(clang-diagnostic-unknown-attributes) */ +__attribute__((optimize("-fno-strict-aliasing"))) /* cppcheck-suppress unusedFunction */ uint64_t siphash_8b(const uint8_t *in, const uint64_t *k) { @@ -123,6 +134,8 @@ uint64_t siphash_8b(const uint8_t *in, const uint64_t *k) * * Return: 32 bits obtained by XORing the two halves of the 64-bit hash output */ +/* NOLINTNEXTLINE(clang-diagnostic-unknown-attributes) */ +__attribute__((optimize("-fno-strict-aliasing"))) /* See siphash_8b() */ /* cppcheck-suppress unusedFunction */ uint32_t siphash_12b(const uint8_t *in, const uint64_t *k) { @@ -148,9 +161,8 @@ uint32_t siphash_12b(const uint8_t *in, const uint64_t *k) * * Return: the 64-bit hash output */ -#if SIPHASH_20B_NOINLINE -__attribute__((__noinline__)) /* See comment in Makefile */ -#endif +/* NOLINTNEXTLINE(clang-diagnostic-unknown-attributes) */ +__attribute__((optimize("-fno-strict-aliasing"))) /* See siphash_8b() */ uint64_t siphash_20b(const uint8_t *in, const uint64_t *k) { uint32_t *in32 = (uint32_t *)in; @@ -179,6 +191,8 @@ uint64_t siphash_20b(const uint8_t *in, const uint64_t *k) * * Return: the 64-bit hash output */ +/* NOLINTNEXTLINE(clang-diagnostic-unknown-attributes) */ +__attribute__((optimize("-fno-strict-aliasing"))) /* See siphash_8b() */ /* cppcheck-suppress unusedFunction */ uint32_t siphash_32b(const uint8_t *in, const uint64_t *k) { @@ -205,6 +219,8 @@ uint32_t siphash_32b(const uint8_t *in, const uint64_t *k) * * Return: 32 bits obtained by XORing the two halves of the 64-bit hash output */ +/* NOLINTNEXTLINE(clang-diagnostic-unknown-attributes) */ +__attribute__((optimize("-fno-strict-aliasing"))) /* See siphash_8b() */ uint32_t siphash_36b(const uint8_t *in, const uint64_t *k) { uint32_t *in32 = (uint32_t *)in; diff --git a/tcp.c b/tcp.c index 21c319d..cbd537e 100644 --- a/tcp.c +++ b/tcp.c @@ -1182,12 +1182,6 @@ static int tcp_hash_match(const struct tcp_tap_conn *conn, * * Return: hash value, already modulo size of the hash table */ -#if TCP_HASH_NOINLINE -__attribute__((__noinline__)) /* See comment in Makefile */ -#endif -__attribute__((optimize("O0"))) /* TODO: with -O2 and -flto on gcc 12.2, - * siphash_20b() doesn't see 'addr', why? - */ static unsigned int tcp_hash(const struct ctx *c, const union inany_addr *addr, in_port_t tap_port, in_port_t sock_port) {-- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson